# test_reader.py (7.9 KB)

"""
Unit tests for TxtReader.
"""
import time
from pathlib import Path

import pytest

from src.cleaning.reader import TxtReader, TxtReaderError
  7. @pytest.fixture
  8. def reader():
  9. """Create a TxtReader instance."""
  10. return TxtReader()
  11. @pytest.fixture
  12. def temp_dir(tmp_path):
  13. """Create temporary directory for test files."""
  14. return tmp_path
  15. @pytest.fixture
  16. def utf8_file(temp_dir):
  17. """Create a UTF-8 encoded test file."""
  18. file_path = temp_dir / "utf8_test.txt"
  19. content = "这是一个测试文件。\n第二行内容。\n第三行内容。"
  20. file_path.write_text(content, encoding="utf-8")
  21. return file_path, content
  22. @pytest.fixture
  23. def gbk_file(temp_dir):
  24. """Create a GBK encoded test file."""
  25. file_path = temp_dir / "gbk_test.txt"
  26. content = "这是一个GBK编码的测试文件。\n第二行内容。"
  27. file_path.write_bytes(content.encode("gbk"))
  28. return file_path, content
  29. @pytest.fixture
  30. def utf8_bom_file(temp_dir):
  31. """Create a UTF-8 with BOM test file."""
  32. file_path = temp_dir / "utf8_bom_test.txt"
  33. content = "这是带有BOM的UTF-8文件。\n第二行。"
  34. # Write UTF-8 BOM + content
  35. file_path.write_bytes(b"\xef\xbb\xbf" + content.encode("utf-8"))
  36. return file_path, content
  37. @pytest.fixture
  38. def large_file(temp_dir):
  39. """Create a large test file for performance testing."""
  40. file_path = temp_dir / "large_test.txt"
  41. # Create a file with about 100KB of content
  42. content = "这是测试内容。" * 10000
  43. file_path.write_text(content, encoding="utf-8")
  44. return file_path, content
  45. class TestTxtReader:
  46. """Test suite for TxtReader."""
  47. def test_read_utf8_file(self, reader, utf8_file):
  48. """Test reading a UTF-8 encoded file."""
  49. file_path, expected_content = utf8_file
  50. content = reader.read(file_path)
  51. assert content == expected_content
  52. def test_read_gbk_file(self, reader, gbk_file):
  53. """Test reading a GBK encoded file."""
  54. file_path, expected_content = gbk_file
  55. content = reader.read(file_path)
  56. assert content == expected_content
  57. def test_read_utf8_bom_file(self, reader, utf8_bom_file):
  58. """Test reading a UTF-8 file with BOM."""
  59. file_path, expected_content = utf8_bom_file
  60. content = reader.read(file_path)
  61. assert content == expected_content
  62. def test_read_nonexistent_file(self, reader):
  63. """Test reading a non-existent file raises FileNotFoundError."""
  64. with pytest.raises(FileNotFoundError):
  65. reader.read("/nonexistent/path/file.txt")
  66. def test_read_with_info_returns_encoding(self, reader, utf8_file):
  67. """Test read_with_info returns both content and encoding."""
  68. file_path, expected_content = utf8_file
  69. content, encoding = reader.read_with_info(file_path)
  70. assert content == expected_content
  71. assert encoding == "utf-8"
  72. def test_read_with_info_gbk(self, reader, gbk_file):
  73. """Test read_with_info detects GBK encoding."""
  74. file_path, expected_content = gbk_file
  75. content, encoding = reader.read_with_info(file_path)
  76. assert content == expected_content
  77. # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030
  78. assert encoding in ["gbk", "gb2312", "gb18030"]
  79. def test_read_lines_keep_newlines(self, reader, utf8_file):
  80. """Test reading lines with newlines preserved."""
  81. file_path, content = utf8_file
  82. lines = reader.read_lines(file_path, keep_newlines=True)
  83. assert len(lines) == 3
  84. assert lines[0].endswith("\n")
  85. def test_read_lines_without_newlines(self, reader, utf8_file):
  86. """Test reading lines without newlines."""
  87. file_path, content = utf8_file
  88. lines = reader.read_lines(file_path, keep_newlines=False)
  89. assert len(lines) == 3
  90. assert not lines[0].endswith("\n")
  91. def test_detect_encoding_utf8(self, reader, utf8_file):
  92. """Test encoding detection for UTF-8."""
  93. file_path, _ = utf8_file
  94. encoding = reader.detect_encoding(file_path)
  95. assert encoding in ["utf-8", "ascii"]
  96. def test_detect_encoding_gbk(self, reader, gbk_file):
  97. """Test encoding detection for GBK."""
  98. file_path, _ = gbk_file
  99. encoding = reader.detect_encoding(file_path)
  100. # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030
  101. assert encoding in ["gbk", "gb2312", "gb18030"]
  102. def test_detect_encoding_nonexistent(self, reader):
  103. """Test encoding detection for non-existent file."""
  104. encoding = reader.detect_encoding("/nonexistent/file.txt")
  105. assert encoding is None
  106. def test_is_binary_with_text_file(self, reader, utf8_file):
  107. """Test is_binary returns False for text files."""
  108. file_path, _ = utf8_file
  109. assert not reader.is_binary(file_path)
  110. def test_is_binary_with_binary_file(self, reader, temp_dir):
  111. """Test is_binary returns True for binary files."""
  112. file_path = temp_dir / "binary_test.bin"
  113. file_path.write_bytes(b"\x00\x01\x02\x03" * 1000)
  114. assert reader.is_binary(file_path)
  115. def test_is_binary_nonexistent(self, reader):
  116. """Test is_binary returns False for non-existent file."""
  117. assert not reader.is_binary("/nonexistent/file.txt")
  118. def test_read_large_file_performance(self, reader, large_file):
  119. """Test that large files are read efficiently."""
  120. import time
  121. file_path, expected_content = large_file
  122. start_time = time.time()
  123. content = reader.read(file_path)
  124. elapsed = time.time() - start_time
  125. assert content == expected_content
  126. # Should read 100KB in less than 1 second
  127. assert elapsed < 1.0
  128. def test_custom_default_encoding(self, temp_dir):
  129. """Test reader with custom default encoding."""
  130. file_path = temp_dir / "gbk_test.txt"
  131. content = "GBK编码测试"
  132. file_path.write_bytes(content.encode("gbk"))
  133. reader = TxtReader(default_encoding="gbk")
  134. result = reader.read(file_path)
  135. assert result == content
  136. def test_empty_file(self, reader, temp_dir):
  137. """Test reading an empty file."""
  138. file_path = temp_dir / "empty.txt"
  139. file_path.write_text("", encoding="utf-8")
  140. content = reader.read(file_path)
  141. assert content == ""
  142. def test_file_with_special_characters(self, reader, temp_dir):
  143. """Test reading file with various special characters."""
  144. file_path = temp_dir / "special.txt"
  145. content = "测试!@#$%^&*()_+-=[]{}|;':\",./<>?\n换行\n制表符\t内容"
  146. file_path.write_text(content, encoding="utf-8")
  147. result = reader.read(file_path)
  148. assert result == content
  149. def test_file_with_mixed_line_endings(self, reader, temp_dir):
  150. """Test reading file with mixed line endings gets normalized."""
  151. file_path = temp_dir / "mixed_endings.txt"
  152. # Write with binary to preserve exact bytes
  153. content = "Line1\nLine2\r\nLine3\r"
  154. file_path.write_bytes(content.encode("utf-8"))
  155. result = reader.read(file_path)
  156. # Python's text mode normalizes line endings to \n
  157. expected = "Line1\nLine2\nLine3\n"
  158. assert result == expected
  159. class TestTxtReaderErrorHandling:
  160. """Test error handling in TxtReader."""
  161. def test_directory_path_raises_error(self, reader, temp_dir):
  162. """Test that reading a directory raises TxtReaderError."""
  163. with pytest.raises(TxtReaderError):
  164. reader.read(temp_dir)
  165. def test_unreadable_encoding(self, reader, temp_dir):
  166. """Test handling of file with encoding that can't be auto-detected."""
  167. # Create a file that's valid UTF-16 but not readable as UTF-8
  168. file_path = temp_dir / "utf16_test.txt"
  169. content = "测试内容"
  170. file_path.write_bytes(content.encode("utf-16-le"))
  171. # Should still be able to read it via fallback
  172. result = reader.read(file_path)
  173. assert "测试" in result or len(result) > 0