"""
Unit tests for TxtReader.
"""
import time
from pathlib import Path

import pytest

from src.cleaning.reader import TxtReader, TxtReaderError
@pytest.fixture
def reader():
    """Provide a TxtReader built with its default constructor arguments."""
    txt_reader = TxtReader()
    return txt_reader
@pytest.fixture
def temp_dir(tmp_path):
    """Expose pytest's built-in ``tmp_path`` directory under the name ``temp_dir``."""
    scratch_dir = tmp_path
    return scratch_dir
@pytest.fixture
def utf8_file(temp_dir):
    """Write a three-line UTF-8 text file.

    Returns:
        A ``(path, text)`` tuple: the file's location inside ``temp_dir``
        and the exact string that was written to it.
    """
    text = "这是一个测试文件。\n第二行内容。\n第三行内容。"
    target = temp_dir / "utf8_test.txt"
    target.write_text(text, encoding="utf-8")
    return target, text
@pytest.fixture
def gbk_file(temp_dir):
    """Write a two-line GBK-encoded text file.

    Returns:
        A ``(path, text)`` tuple: the file's location inside ``temp_dir``
        and the decoded string its bytes represent.
    """
    text = "这是一个GBK编码的测试文件。\n第二行内容。"
    target = temp_dir / "gbk_test.txt"
    # Written as raw bytes so the on-disk encoding is exactly GBK.
    raw = text.encode("gbk")
    target.write_bytes(raw)
    return target, text
@pytest.fixture
def utf8_bom_file(temp_dir):
    """Write a UTF-8 text file that starts with a byte-order mark.

    Returns:
        A ``(path, text)`` tuple. ``text`` does NOT include the BOM, so a
        test comparing against it expects the reader to strip the mark.
    """
    text = "这是带有BOM的UTF-8文件。\n第二行。"
    target = temp_dir / "utf8_bom_test.txt"
    # The three-byte UTF-8 BOM, followed by the encoded payload.
    bom = b"\xef\xbb\xbf"
    target.write_bytes(bom + text.encode("utf-8"))
    return target, text
@pytest.fixture
def large_file(temp_dir):
    """Write a larger UTF-8 file used by the read-performance test.

    Returns:
        A ``(path, text)`` tuple: the file's location inside ``temp_dir``
        and the exact string that was written to it.
    """
    # One 7-character sentence repeated 10000 times: 70,000 characters,
    # i.e. 210,000 bytes once encoded as UTF-8 (3 bytes per character).
    text = "这是测试内容。" * 10000
    target = temp_dir / "large_test.txt"
    target.write_text(text, encoding="utf-8")
    return target, text
class TestTxtReader:
    """Test suite for TxtReader.

    Exercises ``read``, ``read_with_info``, ``read_lines``,
    ``detect_encoding`` and ``is_binary`` against UTF-8, GBK and
    UTF-8-with-BOM inputs, plus a few edge cases (empty file, special
    characters, mixed line endings, custom default encoding).
    """

    def test_read_utf8_file(self, reader, utf8_file):
        """Test reading a UTF-8 encoded file."""
        file_path, expected_content = utf8_file
        content = reader.read(file_path)
        assert content == expected_content

    def test_read_gbk_file(self, reader, gbk_file):
        """Test reading a GBK encoded file."""
        file_path, expected_content = gbk_file
        content = reader.read(file_path)
        assert content == expected_content

    def test_read_utf8_bom_file(self, reader, utf8_bom_file):
        """Test reading a UTF-8 file with BOM.

        The expected text carries no BOM, so this also checks that the
        mark is stripped from the returned content.
        """
        file_path, expected_content = utf8_bom_file
        content = reader.read(file_path)
        assert content == expected_content

    def test_read_nonexistent_file(self, reader):
        """Test reading a non-existent file raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            reader.read("/nonexistent/path/file.txt")

    def test_read_with_info_returns_encoding(self, reader, utf8_file):
        """Test read_with_info returns both content and encoding."""
        file_path, expected_content = utf8_file
        content, encoding = reader.read_with_info(file_path)
        assert content == expected_content
        assert encoding == "utf-8"

    def test_read_with_info_gbk(self, reader, gbk_file):
        """Test read_with_info detects GBK encoding."""
        file_path, expected_content = gbk_file
        content, encoding = reader.read_with_info(file_path)
        assert content == expected_content
        # GB18030 is a superset of GBK/GB2312, so detection may
        # legitimately report any of the three.
        assert encoding in ("gbk", "gb2312", "gb18030")

    def test_read_lines_keep_newlines(self, reader, utf8_file):
        """Test reading lines with newlines preserved."""
        file_path, _ = utf8_file
        lines = reader.read_lines(file_path, keep_newlines=True)
        assert len(lines) == 3
        assert lines[0].endswith("\n")

    def test_read_lines_without_newlines(self, reader, utf8_file):
        """Test reading lines without newlines."""
        file_path, _ = utf8_file
        lines = reader.read_lines(file_path, keep_newlines=False)
        assert len(lines) == 3
        assert not lines[0].endswith("\n")

    def test_detect_encoding_utf8(self, reader, utf8_file):
        """Test encoding detection for UTF-8."""
        file_path, _ = utf8_file
        encoding = reader.detect_encoding(file_path)
        assert encoding in ("utf-8", "ascii")

    def test_detect_encoding_gbk(self, reader, gbk_file):
        """Test encoding detection for GBK."""
        file_path, _ = gbk_file
        encoding = reader.detect_encoding(file_path)
        # GB18030 is a superset of GBK/GB2312, so detection may
        # legitimately report any of the three.
        assert encoding in ("gbk", "gb2312", "gb18030")

    def test_detect_encoding_nonexistent(self, reader):
        """Test encoding detection returns None for a non-existent file."""
        encoding = reader.detect_encoding("/nonexistent/file.txt")
        assert encoding is None

    def test_is_binary_with_text_file(self, reader, utf8_file):
        """Test is_binary returns False for text files."""
        file_path, _ = utf8_file
        assert not reader.is_binary(file_path)

    def test_is_binary_with_binary_file(self, reader, temp_dir):
        """Test is_binary returns True for binary files."""
        file_path = temp_dir / "binary_test.bin"
        # 4000 bytes of NUL and control bytes.
        file_path.write_bytes(b"\x00\x01\x02\x03" * 1000)
        assert reader.is_binary(file_path)

    def test_is_binary_nonexistent(self, reader):
        """Test is_binary returns False for non-existent file."""
        assert not reader.is_binary("/nonexistent/file.txt")

    def test_read_large_file_performance(self, reader, large_file):
        """Test that large files are read efficiently."""
        file_path, expected_content = large_file
        # perf_counter() is monotonic and high-resolution; time.time() is
        # wall-clock time and can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        content = reader.read(file_path)
        elapsed = time.perf_counter() - start_time
        assert content == expected_content
        # The fixture payload (70,000 characters, 210,000 bytes as UTF-8)
        # should be read in less than 1 second.
        assert elapsed < 1.0

    def test_custom_default_encoding(self, temp_dir):
        """Test reader with custom default encoding."""
        file_path = temp_dir / "gbk_test.txt"
        content = "GBK编码测试"
        file_path.write_bytes(content.encode("gbk"))
        # Built locally rather than via the `reader` fixture so that the
        # non-default constructor argument can be supplied.
        reader = TxtReader(default_encoding="gbk")
        result = reader.read(file_path)
        assert result == content

    def test_empty_file(self, reader, temp_dir):
        """Test reading an empty file returns an empty string."""
        file_path = temp_dir / "empty.txt"
        file_path.write_text("", encoding="utf-8")
        content = reader.read(file_path)
        assert content == ""

    def test_file_with_special_characters(self, reader, temp_dir):
        """Test reading file with various special characters."""
        file_path = temp_dir / "special.txt"
        content = "测试!@#$%^&*()_+-=[]{}|;':\",./<>?\n换行\n制表符\t内容"
        file_path.write_text(content, encoding="utf-8")
        result = reader.read(file_path)
        assert result == content

    def test_file_with_mixed_line_endings(self, reader, temp_dir):
        """Test reading file with mixed line endings gets normalized."""
        file_path = temp_dir / "mixed_endings.txt"
        # Written in binary mode so the \n, \r\n and bare \r terminators
        # reach the disk exactly as spelled here.
        content = "Line1\nLine2\r\nLine3\r"
        file_path.write_bytes(content.encode("utf-8"))
        result = reader.read(file_path)
        # Every terminator is expected back as "\n", which matches
        # Python's universal-newlines text mode.
        expected = "Line1\nLine2\nLine3\n"
        assert result == expected
class TestTxtReaderErrorHandling:
    """Test error handling in TxtReader."""

    def test_directory_path_raises_error(self, reader, temp_dir):
        """Test that reading a directory raises TxtReaderError."""
        with pytest.raises(TxtReaderError):
            reader.read(temp_dir)

    def test_unreadable_encoding(self, reader, temp_dir):
        """Test that a BOM-less UTF-16 file is still read without raising."""
        # The bytes are valid UTF-16-LE (no BOM) but not valid UTF-8.
        file_path = temp_dir / "utf16_test.txt"
        content = "测试内容"
        file_path.write_bytes(content.encode("utf-16-le"))
        # Should still be able to read it via fallback.
        result = reader.read(file_path)
        # NOTE(review): the original assertion was
        #     "测试" in result or len(result) > 0
        # whose first operand implies the second, so it only ever checked
        # non-emptiness. That is stated directly here. Tighten this to
        # `"测试" in result` if the fallback is expected to decode the
        # text correctly.
        assert len(result) > 0
|