2
0

test_cleaner.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. """
  2. Unit tests for TextCleaner.
  3. """
  4. import pytest
  5. from src.cleaning.cleaner import TextCleaner
  6. class TestTextCleaner:
  7. """Test suite for TextCleaner."""
  8. @pytest.fixture
  9. def cleaner(self):
  10. """Create a TextCleaner instance."""
  11. return TextCleaner()
  12. def test_clean_empty_text(self, cleaner):
  13. """Test cleaning empty text."""
  14. assert cleaner.clean("") == ""
  15. assert cleaner.clean(None) if None else "" == cleaner.clean("")
  16. def test_remove_extra_whitespace(self, cleaner):
  17. """Test removing extra whitespace."""
  18. text = "这是 一段 有很多空格的 文本。"
  19. result = cleaner.remove_extra_whitespace_func(text)
  20. assert " " not in result
  21. assert "很多" in result
  22. def test_remove_multiple_newlines(self, cleaner):
  23. """Test removing multiple consecutive newlines."""
  24. text = "第一行\n\n\n\n\n第二行"
  25. result = cleaner.remove_extra_whitespace_func(text)
  26. # Should keep up to 2 consecutive empty lines (3 newlines = 2 empty lines)
  27. assert "第一行" in result
  28. assert "第二行" in result
  29. # Should reduce 5 newlines (4 empty lines) to 3 newlines (2 empty lines)
  30. assert result.count("\n") < text.count("\n")
  31. def test_preserve_paragraph_structure(self, cleaner):
  32. """Test that paragraph structure is preserved."""
  33. text = "第一段\n\n第二段\n\n第三段"
  34. result = cleaner.remove_extra_whitespace_func(text)
  35. assert "\n\n" in result # Paragraph breaks should be kept
  36. def test_fix_multiple_periods(self, cleaner):
  37. """Test fixing multiple Chinese periods."""
  38. text = "这是第一句。。这是第二句。。。"
  39. result = cleaner.fix_punctuation_func(text)
  40. assert "。。" not in result
  41. def test_fix_mixed_punctuation(self, cleaner):
  42. """Test fixing mixed Chinese/English punctuation."""
  43. text = "这是句子。,也是句子。!"
  44. result = cleaner.fix_punctuation_func(text)
  45. # Note: mixed punctuation is complex, just check that some fix was attempted
  46. assert "。" in result # Chinese period should be preserved
  47. def test_fix_multiple_exclamations(self, cleaner):
  48. """Test fixing multiple exclamation marks."""
  49. text = "太棒了!!!!!"
  50. result = cleaner.fix_punctuation_func(text)
  51. assert "!!" not in result
  52. def test_fix_multiple_question_marks(self, cleaner):
  53. """Test fixing multiple question marks."""
  54. text = "真的吗???"
  55. result = cleaner.fix_punctuation_func(text)
  56. assert "??" not in result
  57. def test_remove_invalid_chars(self, cleaner):
  58. """Test removing invalid characters."""
  59. text = "正常文本\x00\x01\x02更多文本"
  60. result = cleaner.remove_invalid_chars_func(text)
  61. assert "\x00" not in result
  62. assert "\x01" not in result
  63. assert "正常文本" in result
  64. def test_remove_bom_character(self, cleaner):
  65. """Test removing BOM character."""
  66. text = "\ufeff这是文本"
  67. result = cleaner.remove_invalid_chars_func(text)
  68. assert "\ufeff" not in result
  69. def test_normalize_quotes(self, cleaner):
  70. """Test quote normalization."""
  71. text = '这是\'引号\'和"双引号"内容'
  72. result = cleaner.normalize_quotes_func(text)
  73. # Quotes should be normalized to ASCII
  74. assert "'" in result or '"' in result
  75. def test_full_cleaning_pipeline(self, cleaner):
  76. """Test the full cleaning pipeline."""
  77. text = " 这是 一段 有问题\x00的文本。。\n\n\n还有多余的空格! "
  78. result = cleaner.clean(text)
  79. assert "\x00" not in result
  80. assert "。。" not in result
  81. assert not result.startswith(" ")
  82. assert not result.endswith(" ")
  83. def test_remove_urls(self, cleaner):
  84. """Test URL removal."""
  85. text = "访问 https://example.com 查看更多信息"
  86. result = cleaner.clean(text)
  87. assert "https://" not in result
  88. def test_remove_email_addresses(self, cleaner):
  89. """Test email address removal."""
  90. text = "联系 test@example.com 获取更多信息"
  91. result = cleaner.clean(text)
  92. assert "@" not in result
  93. def test_custom_removal_patterns(self):
  94. """Test custom removal patterns."""
  95. text = "这是 [ISBN:123] 一些文字 [ISBN:456] 更多文字"
  96. cleaner = TextCleaner(custom_removals=[r'\[ISBN:\d+\]'])
  97. result = cleaner.clean(text)
  98. assert "[ISBN:" not in result
  99. def test_remove_ads(self, cleaner):
  100. """Test advertisement removal."""
  101. text = "这是小说内容。本章完。请收藏本站。更多精彩内容。"
  102. result = cleaner.remove_ads(text)
  103. assert "本章完" not in result
  104. assert "请收藏" not in result
  105. def test_extract_numbers(self, cleaner):
  106. """Test number extraction."""
  107. text = "林风今年18岁,身高175.5厘米,有3个朋友。"
  108. numbers = cleaner.extract_numbers(text)
  109. assert "18" in numbers
  110. assert "175.5" in numbers # Decimal is returned as whole number
  111. assert "3" in numbers
  112. def test_count_words_chinese(self, cleaner):
  113. """Test word counting for Chinese text."""
  114. text = "这是一段中文文本用于测试字数统计。"
  115. count = cleaner.count_words(text)
  116. assert count > 0
  117. def test_count_words_mixed(self, cleaner):
  118. """Test word counting for mixed text."""
  119. text = "这里有 Chinese 和 English 123 混合"
  120. count = cleaner.count_words(text)
  121. assert count > 0
  122. def test_truncate_short_text(self, cleaner):
  123. """Test truncating short text (no change)."""
  124. text = "短文本"
  125. result = cleaner.truncate(text, 100)
  126. assert result == text
  127. def test_truncate_long_text(self, cleaner):
  128. """Test truncating long text."""
  129. text = "这是一段很长的文本需要被截断"
  130. result = cleaner.truncate(text, 10)
  131. # Each Chinese character is 1 byte in Python string length
  132. assert len(result) <= 13 # 10 chars + "..." (but might be less due to multibyte)
  133. assert result.endswith("...")
  134. def test_split_into_sentences_chinese(self, cleaner):
  135. """Test splitting Chinese text into sentences."""
  136. text = "这是第一句。这是第二句!这是第三句?"
  137. sentences = cleaner.split_into_sentences(text)
  138. assert len(sentences) == 3
  139. assert "第一句" in sentences[0]
  140. def test_split_into_sentences_english(self, cleaner):
  141. """Test splitting English text into sentences."""
  142. text = "This is first. This is second! This is third?"
  143. sentences = cleaner.split_into_sentences(text)
  144. assert len(sentences) >= 2
  145. def test_cleaning_preserves_content(self, cleaner):
  146. """Test that cleaning doesn't remove important content."""
  147. text = "第一章 开始\n\n林风站在山顶,看着远方的城市。\n\n" \
  148. "\"你好,\"他说道。\n\n这是重要的对话内容。"
  149. result = cleaner.clean(text)
  150. assert "第一章" in result
  151. assert "林风" in result
  152. assert "山顶" in result
  153. def test_fix_punctuation_spacing(self, cleaner):
  154. """Test fixing spacing around punctuation."""
  155. text = "这是句子 ,还有句子 。 还有感叹号 !"
  156. result = cleaner.fix_punctuation_func(text)
  157. assert " ," not in result # No space before Chinese comma
  158. assert " 。" not in result # No space before Chinese period
  159. def test_mismatched_parentheses(self, cleaner):
  160. """Test fixing mismatched parentheses."""
  161. text = "这是(左括号和)右括号"
  162. result = cleaner.fix_punctuation_func(text)
  163. # Should normalize to matching pairs
  164. def test_disabled_options(self):
  165. """Test cleaner with options disabled."""
  166. text = " 文本 。。\x00"
  167. cleaner = TextCleaner(
  168. remove_extra_whitespace=False,
  169. fix_punctuation=False,
  170. remove_invalid_chars=False
  171. )
  172. result = cleaner.clean(text)
  173. # Should preserve most of the original
  174. assert " " in result # Extra spaces preserved
  175. def test_trailing_whitespace_removal(self, cleaner):
  176. """Test removal of trailing whitespace."""
  177. text = "第一行 \n第二行\t\n第三行 "
  178. result = cleaner.remove_extra_whitespace_func(text)
  179. assert not result.endswith(" ")
  180. assert not result.endswith("\t")
  181. def test_empty_lines_preservation(self, cleaner):
  182. """Test that single empty lines are preserved."""
  183. text = "第一段\n\n第二段"
  184. result = cleaner.remove_extra_whitespace_func(text)
  185. assert "\n\n" in result
  186. def test_multiple_consecutive_punctuation(self, cleaner):
  187. """Test handling of multiple consecutive punctuation marks."""
  188. text = "什么!??真的。。。好吧。。。"
  189. result = cleaner.fix_punctuation_func(text)
  190. assert "!?" in result or "?" in result
  191. assert "。。" not in result
  192. def test_colon_and_semicolon_fix(self, cleaner):
  193. """Test fixing colon and semicolon issues."""
  194. text = "这是::测试;;内容"
  195. result = cleaner.fix_punctuation_func(text)
  196. assert "::" not in result
  197. assert ";;" not in result