| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232 |
- """
- Unit tests for TextCleaner.
- """
- import pytest
- from src.cleaning.cleaner import TextCleaner
class TestTextCleaner:
    """Unit tests for ``TextCleaner``.

    Most tests receive a default-configured ``TextCleaner`` through the
    ``cleaner`` fixture; tests that need non-default constructor options
    build their own instance.
    """

    @pytest.fixture
    def cleaner(self):
        """Return a ``TextCleaner`` built with its default options."""
        return TextCleaner()

    # ------------------------------------------------------------------
    # Empty input
    # ------------------------------------------------------------------

    def test_clean_empty_text(self, cleaner):
        """Cleaning an empty string yields an empty string."""
        # NOTE(review): the previous second assertion,
        #   assert cleaner.clean(None) if None else "" == cleaner.clean("")
        # parsed as `X if None else ("" == cleaner.clean(""))`, so it never
        # called clean(None) and only repeated the check below. None input
        # is therefore NOT covered here -- add an explicit test once the
        # intended behaviour of clean(None) is confirmed.
        assert cleaner.clean("") == ""

    # ------------------------------------------------------------------
    # Whitespace handling
    # ------------------------------------------------------------------

    def test_remove_extra_whitespace(self, cleaner):
        """Whitespace cleanup removes spaces but keeps the text content."""
        # NOTE(review): as written, the input only contains single spaces
        # and the assertion requires that no space at all survives --
        # confirm this matches the intended "extra whitespace" contract.
        text = "这是 一段 有很多空格的 文本。"
        result = cleaner.remove_extra_whitespace_func(text)
        assert " " not in result
        assert "很多" in result

    def test_remove_multiple_newlines(self, cleaner):
        """A long run of newlines is shortened without losing the lines."""
        text = "第一行\n\n\n\n\n第二行"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "第一行" in result
        assert "第二行" in result
        # Only a reduction is required; the exact number of newlines that
        # remain is not pinned by this test.
        assert result.count("\n") < text.count("\n")

    def test_preserve_paragraph_structure(self, cleaner):
        """Blank-line paragraph breaks survive whitespace cleanup."""
        text = "第一段\n\n第二段\n\n第三段"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "\n\n" in result  # Paragraph breaks should be kept

    def test_trailing_whitespace_removal(self, cleaner):
        """The cleaned text does not end with a space or a tab."""
        text = "第一行 \n第二行\t\n第三行 "
        result = cleaner.remove_extra_whitespace_func(text)
        assert not result.endswith(" ")
        assert not result.endswith("\t")

    def test_empty_lines_preservation(self, cleaner):
        """A single empty line between two paragraphs is preserved."""
        text = "第一段\n\n第二段"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "\n\n" in result

    # ------------------------------------------------------------------
    # Punctuation fixing
    # ------------------------------------------------------------------

    def test_fix_multiple_periods(self, cleaner):
        """Repeated Chinese periods are collapsed."""
        text = "这是第一句。。这是第二句。。。"
        result = cleaner.fix_punctuation_func(text)
        assert "。。" not in result

    def test_fix_mixed_punctuation(self, cleaner):
        """Mixed Chinese/English punctuation keeps the Chinese period."""
        text = "这是句子。,也是句子。!"
        result = cleaner.fix_punctuation_func(text)
        # Mixed punctuation is complex; this only checks that the Chinese
        # period is still present after the fix.
        assert "。" in result

    def test_fix_multiple_exclamations(self, cleaner):
        """Repeated exclamation marks are collapsed."""
        text = "太棒了!!!!!"
        result = cleaner.fix_punctuation_func(text)
        assert "!!" not in result

    def test_fix_multiple_question_marks(self, cleaner):
        """Repeated question marks are collapsed."""
        text = "真的吗???"
        result = cleaner.fix_punctuation_func(text)
        assert "??" not in result

    def test_fix_punctuation_spacing(self, cleaner):
        """Spaces directly before Chinese commas and periods are removed."""
        text = "这是句子 ,还有句子 。 还有感叹号 !"
        result = cleaner.fix_punctuation_func(text)
        assert " ," not in result  # No space before Chinese comma
        assert " 。" not in result  # No space before Chinese period

    def test_mismatched_parentheses(self, cleaner):
        """Mismatched full-width/ASCII parentheses are handled gracefully."""
        text = "这是(左括号和)右括号"
        result = cleaner.fix_punctuation_func(text)
        # The exact normalized form is not pinned here (TODO: assert the
        # matching-pair output once the expected form is confirmed). At a
        # minimum the call must return text and keep the words surrounding
        # the parentheses; previously this test asserted nothing at all.
        assert isinstance(result, str)
        assert "左括号" in result
        assert "右括号" in result

    def test_multiple_consecutive_punctuation(self, cleaner):
        """Runs of different punctuation marks are reduced sensibly."""
        text = "什么!??真的。。。好吧。。。"
        result = cleaner.fix_punctuation_func(text)
        assert "!?" in result or "?" in result
        assert "。。" not in result

    def test_colon_and_semicolon_fix(self, cleaner):
        """Doubled colons and doubled Chinese semicolons are collapsed."""
        text = "这是::测试;;内容"
        result = cleaner.fix_punctuation_func(text)
        assert "::" not in result
        assert ";;" not in result

    # ------------------------------------------------------------------
    # Invalid characters and quotes
    # ------------------------------------------------------------------

    def test_remove_invalid_chars(self, cleaner):
        """Control characters are stripped while normal text is kept."""
        text = "正常文本\x00\x01\x02更多文本"
        result = cleaner.remove_invalid_chars_func(text)
        assert "\x00" not in result
        assert "\x01" not in result
        assert "正常文本" in result

    def test_remove_bom_character(self, cleaner):
        """A leading byte-order mark (U+FEFF) is removed."""
        text = "\ufeff这是文本"
        result = cleaner.remove_invalid_chars_func(text)
        assert "\ufeff" not in result

    def test_normalize_quotes(self, cleaner):
        """Quote normalization leaves ASCII quotes in the output."""
        text = '这是\'引号\'和"双引号"内容'
        result = cleaner.normalize_quotes_func(text)
        # Quotes should be normalized to ASCII
        assert "'" in result or '"' in result

    # ------------------------------------------------------------------
    # Full pipeline via clean()
    # ------------------------------------------------------------------

    def test_full_cleaning_pipeline(self, cleaner):
        """``clean`` applies invalid-char, punctuation and trim steps."""
        text = " 这是 一段 有问题\x00的文本。。\n\n\n还有多余的空格! "
        result = cleaner.clean(text)
        assert "\x00" not in result
        assert "。。" not in result
        assert not result.startswith(" ")
        assert not result.endswith(" ")

    def test_remove_urls(self, cleaner):
        """``clean`` removes URLs from the text."""
        text = "访问 https://example.com 查看更多信息"
        result = cleaner.clean(text)
        assert "https://" not in result

    def test_remove_email_addresses(self, cleaner):
        """``clean`` removes e-mail addresses from the text."""
        text = "联系 test@example.com 获取更多信息"
        result = cleaner.clean(text)
        assert "@" not in result

    def test_custom_removal_patterns(self):
        """Patterns passed as ``custom_removals`` are removed by ``clean``."""
        text = "这是 [ISBN:123] 一些文字 [ISBN:456] 更多文字"
        cleaner = TextCleaner(custom_removals=[r'\[ISBN:\d+\]'])
        result = cleaner.clean(text)
        assert "[ISBN:" not in result

    def test_cleaning_preserves_content(self, cleaner):
        """``clean`` keeps the meaningful words of a realistic passage."""
        text = (
            "第一章 开始\n\n林风站在山顶,看着远方的城市。\n\n"
            "\"你好,\"他说道。\n\n这是重要的对话内容。"
        )
        result = cleaner.clean(text)
        assert "第一章" in result
        assert "林风" in result
        assert "山顶" in result

    def test_disabled_options(self):
        """With the cleaning options switched off, spaces are preserved."""
        text = " 文本 。。\x00"
        cleaner = TextCleaner(
            remove_extra_whitespace=False,
            fix_punctuation=False,
            remove_invalid_chars=False,
        )
        result = cleaner.clean(text)
        # Should preserve most of the original.
        # NOTE(review): this only checks that at least one space survives;
        # confirm whether a run of several spaces was meant to be tested.
        assert " " in result

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    def test_remove_ads(self, cleaner):
        """Advertisement phrases are removed from the text."""
        text = "这是小说内容。本章完。请收藏本站。更多精彩内容。"
        result = cleaner.remove_ads(text)
        assert "本章完" not in result
        assert "请收藏" not in result

    def test_extract_numbers(self, cleaner):
        """Integers and decimals are extracted as string tokens."""
        text = "林风今年18岁,身高175.5厘米,有3个朋友。"
        numbers = cleaner.extract_numbers(text)
        assert "18" in numbers
        assert "175.5" in numbers  # The decimal is kept as one token
        assert "3" in numbers

    def test_count_words_chinese(self, cleaner):
        """Word counting returns a positive count for Chinese text."""
        text = "这是一段中文文本用于测试字数统计。"
        count = cleaner.count_words(text)
        assert count > 0

    def test_count_words_mixed(self, cleaner):
        """Word counting returns a positive count for mixed-script text."""
        text = "这里有 Chinese 和 English 123 混合"
        count = cleaner.count_words(text)
        assert count > 0

    def test_truncate_short_text(self, cleaner):
        """Text shorter than the limit is returned unchanged."""
        text = "短文本"
        result = cleaner.truncate(text, 100)
        assert result == text

    def test_truncate_long_text(self, cleaner):
        """Text longer than the limit is cut and suffixed with ``...``."""
        text = "这是一段很长的文本需要被截断"
        result = cleaner.truncate(text, 10)
        # len() counts Unicode code points, not bytes, so every Chinese
        # character contributes exactly 1 to the length.
        assert len(result) <= 13  # at most 10 characters + "..."
        assert result.endswith("...")

    def test_split_into_sentences_chinese(self, cleaner):
        """Chinese text is split on its sentence-ending punctuation."""
        text = "这是第一句。这是第二句!这是第三句?"
        sentences = cleaner.split_into_sentences(text)
        assert len(sentences) == 3
        assert "第一句" in sentences[0]

    def test_split_into_sentences_english(self, cleaner):
        """English text is split into at least two sentences."""
        text = "This is first. This is second! This is third?"
        sentences = cleaner.split_into_sentences(text)
        assert len(sentences) >= 2
|