""" Unit tests for TxtReader. """ import pytest from pathlib import Path from src.cleaning.reader import TxtReader, TxtReaderError @pytest.fixture def reader(): """Create a TxtReader instance.""" return TxtReader() @pytest.fixture def temp_dir(tmp_path): """Create temporary directory for test files.""" return tmp_path @pytest.fixture def utf8_file(temp_dir): """Create a UTF-8 encoded test file.""" file_path = temp_dir / "utf8_test.txt" content = "这是一个测试文件。\n第二行内容。\n第三行内容。" file_path.write_text(content, encoding="utf-8") return file_path, content @pytest.fixture def gbk_file(temp_dir): """Create a GBK encoded test file.""" file_path = temp_dir / "gbk_test.txt" content = "这是一个GBK编码的测试文件。\n第二行内容。" file_path.write_bytes(content.encode("gbk")) return file_path, content @pytest.fixture def utf8_bom_file(temp_dir): """Create a UTF-8 with BOM test file.""" file_path = temp_dir / "utf8_bom_test.txt" content = "这是带有BOM的UTF-8文件。\n第二行。" # Write UTF-8 BOM + content file_path.write_bytes(b"\xef\xbb\xbf" + content.encode("utf-8")) return file_path, content @pytest.fixture def large_file(temp_dir): """Create a large test file for performance testing.""" file_path = temp_dir / "large_test.txt" # Create a file with about 100KB of content content = "这是测试内容。" * 10000 file_path.write_text(content, encoding="utf-8") return file_path, content class TestTxtReader: """Test suite for TxtReader.""" def test_read_utf8_file(self, reader, utf8_file): """Test reading a UTF-8 encoded file.""" file_path, expected_content = utf8_file content = reader.read(file_path) assert content == expected_content def test_read_gbk_file(self, reader, gbk_file): """Test reading a GBK encoded file.""" file_path, expected_content = gbk_file content = reader.read(file_path) assert content == expected_content def test_read_utf8_bom_file(self, reader, utf8_bom_file): """Test reading a UTF-8 file with BOM.""" file_path, expected_content = utf8_bom_file content = reader.read(file_path) assert content == expected_content def test_read_nonexistent_file(self, reader): """Test reading a non-existent file raises FileNotFoundError.""" with pytest.raises(FileNotFoundError): reader.read("/nonexistent/path/file.txt") def test_read_with_info_returns_encoding(self, reader, utf8_file): """Test read_with_info returns both content and encoding.""" file_path, expected_content = utf8_file content, encoding = reader.read_with_info(file_path) assert content == expected_content assert encoding == "utf-8" def test_read_with_info_gbk(self, reader, gbk_file): """Test read_with_info detects GBK encoding.""" file_path, expected_content = gbk_file content, encoding = reader.read_with_info(file_path) assert content == expected_content # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030 assert encoding in ["gbk", "gb2312", "gb18030"] def test_read_lines_keep_newlines(self, reader, utf8_file): """Test reading lines with newlines preserved.""" file_path, content = utf8_file lines = reader.read_lines(file_path, keep_newlines=True) assert len(lines) == 3 assert lines[0].endswith("\n") def test_read_lines_without_newlines(self, reader, utf8_file): """Test reading lines without newlines.""" file_path, content = utf8_file lines = reader.read_lines(file_path, keep_newlines=False) assert len(lines) == 3 assert not lines[0].endswith("\n") def test_detect_encoding_utf8(self, reader, utf8_file): """Test encoding detection for UTF-8.""" file_path, _ = utf8_file encoding = reader.detect_encoding(file_path) assert encoding in ["utf-8", "ascii"] def test_detect_encoding_gbk(self, reader, gbk_file): """Test encoding detection for GBK.""" file_path, _ = gbk_file encoding = reader.detect_encoding(file_path) # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030 assert encoding in ["gbk", "gb2312", "gb18030"] def test_detect_encoding_nonexistent(self, reader): """Test encoding detection for non-existent file.""" encoding = reader.detect_encoding("/nonexistent/file.txt") assert encoding is None def test_is_binary_with_text_file(self, reader, utf8_file): """Test is_binary returns False for text files.""" file_path, _ = utf8_file assert not reader.is_binary(file_path) def test_is_binary_with_binary_file(self, reader, temp_dir): """Test is_binary returns True for binary files.""" file_path = temp_dir / "binary_test.bin" file_path.write_bytes(b"\x00\x01\x02\x03" * 1000) assert reader.is_binary(file_path) def test_is_binary_nonexistent(self, reader): """Test is_binary returns False for non-existent file.""" assert not reader.is_binary("/nonexistent/file.txt") def test_read_large_file_performance(self, reader, large_file): """Test that large files are read efficiently.""" import time file_path, expected_content = large_file start_time = time.time() content = reader.read(file_path) elapsed = time.time() - start_time assert content == expected_content # Should read 100KB in less than 1 second assert elapsed < 1.0 def test_custom_default_encoding(self, temp_dir): """Test reader with custom default encoding.""" file_path = temp_dir / "gbk_test.txt" content = "GBK编码测试" file_path.write_bytes(content.encode("gbk")) reader = TxtReader(default_encoding="gbk") result = reader.read(file_path) assert result == content def test_empty_file(self, reader, temp_dir): """Test reading an empty file.""" file_path = temp_dir / "empty.txt" file_path.write_text("", encoding="utf-8") content = reader.read(file_path) assert content == "" def test_file_with_special_characters(self, reader, temp_dir): """Test reading file with various special characters.""" file_path = temp_dir / "special.txt" content = "测试!@#$%^&*()_+-=[]{}|;':\",./<>?\n换行\n制表符\t内容" file_path.write_text(content, encoding="utf-8") result = reader.read(file_path) assert result == content def test_file_with_mixed_line_endings(self, reader, temp_dir): """Test reading file with mixed line endings gets normalized.""" file_path = temp_dir / "mixed_endings.txt" # Write with binary to preserve exact bytes content = "Line1\nLine2\r\nLine3\r" file_path.write_bytes(content.encode("utf-8")) result = reader.read(file_path) # Python's text mode normalizes line endings to \n expected = "Line1\nLine2\nLine3\n" assert result == expected class TestTxtReaderErrorHandling: """Test error handling in TxtReader.""" def test_directory_path_raises_error(self, reader, temp_dir): """Test that reading a directory raises TxtReaderError.""" with pytest.raises(TxtReaderError): reader.read(temp_dir) def test_unreadable_encoding(self, reader, temp_dir): """Test handling of file with encoding that can't be auto-detected.""" # Create a file that's valid UTF-16 but not readable as UTF-8 file_path = temp_dir / "utf16_test.txt" content = "测试内容" file_path.write_bytes(content.encode("utf-16-le")) # Should still be able to read it via fallback result = reader.read(file_path) assert "测试" in result or len(result) > 0