""" Unit tests for TextCleaner. """ import pytest from src.cleaning.cleaner import TextCleaner class TestTextCleaner: """Test suite for TextCleaner.""" @pytest.fixture def cleaner(self): """Create a TextCleaner instance.""" return TextCleaner() def test_clean_empty_text(self, cleaner): """Test cleaning empty text.""" assert cleaner.clean("") == "" assert cleaner.clean(None) if None else "" == cleaner.clean("") def test_remove_extra_whitespace(self, cleaner): """Test removing extra whitespace.""" text = "这是 一段 有很多空格的 文本。" result = cleaner.remove_extra_whitespace_func(text) assert " " not in result assert "很多" in result def test_remove_multiple_newlines(self, cleaner): """Test removing multiple consecutive newlines.""" text = "第一行\n\n\n\n\n第二行" result = cleaner.remove_extra_whitespace_func(text) # Should keep up to 2 consecutive empty lines (3 newlines = 2 empty lines) assert "第一行" in result assert "第二行" in result # Should reduce 5 newlines (4 empty lines) to 3 newlines (2 empty lines) assert result.count("\n") < text.count("\n") def test_preserve_paragraph_structure(self, cleaner): """Test that paragraph structure is preserved.""" text = "第一段\n\n第二段\n\n第三段" result = cleaner.remove_extra_whitespace_func(text) assert "\n\n" in result # Paragraph breaks should be kept def test_fix_multiple_periods(self, cleaner): """Test fixing multiple Chinese periods.""" text = "这是第一句。。这是第二句。。。" result = cleaner.fix_punctuation_func(text) assert "。。" not in result def test_fix_mixed_punctuation(self, cleaner): """Test fixing mixed Chinese/English punctuation.""" text = "这是句子。,也是句子。!" result = cleaner.fix_punctuation_func(text) # Note: mixed punctuation is complex, just check that some fix was attempted assert "。" in result # Chinese period should be preserved def test_fix_multiple_exclamations(self, cleaner): """Test fixing multiple exclamation marks.""" text = "太棒了!!!!!" result = cleaner.fix_punctuation_func(text) assert "!!" not in result def test_fix_multiple_question_marks(self, cleaner): """Test fixing multiple question marks.""" text = "真的吗???" result = cleaner.fix_punctuation_func(text) assert "??" not in result def test_remove_invalid_chars(self, cleaner): """Test removing invalid characters.""" text = "正常文本\x00\x01\x02更多文本" result = cleaner.remove_invalid_chars_func(text) assert "\x00" not in result assert "\x01" not in result assert "正常文本" in result def test_remove_bom_character(self, cleaner): """Test removing BOM character.""" text = "\ufeff这是文本" result = cleaner.remove_invalid_chars_func(text) assert "\ufeff" not in result def test_normalize_quotes(self, cleaner): """Test quote normalization.""" text = '这是\'引号\'和"双引号"内容' result = cleaner.normalize_quotes_func(text) # Quotes should be normalized to ASCII assert "'" in result or '"' in result def test_full_cleaning_pipeline(self, cleaner): """Test the full cleaning pipeline.""" text = " 这是 一段 有问题\x00的文本。。\n\n\n还有多余的空格! " result = cleaner.clean(text) assert "\x00" not in result assert "。。" not in result assert not result.startswith(" ") assert not result.endswith(" ") def test_remove_urls(self, cleaner): """Test URL removal.""" text = "访问 https://example.com 查看更多信息" result = cleaner.clean(text) assert "https://" not in result def test_remove_email_addresses(self, cleaner): """Test email address removal.""" text = "联系 test@example.com 获取更多信息" result = cleaner.clean(text) assert "@" not in result def test_custom_removal_patterns(self): """Test custom removal patterns.""" text = "这是 [ISBN:123] 一些文字 [ISBN:456] 更多文字" cleaner = TextCleaner(custom_removals=[r'\[ISBN:\d+\]']) result = cleaner.clean(text) assert "[ISBN:" not in result def test_remove_ads(self, cleaner): """Test advertisement removal.""" text = "这是小说内容。本章完。请收藏本站。更多精彩内容。" result = cleaner.remove_ads(text) assert "本章完" not in result assert "请收藏" not in result def test_extract_numbers(self, cleaner): """Test number extraction.""" text = "林风今年18岁,身高175.5厘米,有3个朋友。" numbers = cleaner.extract_numbers(text) assert "18" in numbers assert "175.5" in numbers # Decimal is returned as whole number assert "3" in numbers def test_count_words_chinese(self, cleaner): """Test word counting for Chinese text.""" text = "这是一段中文文本用于测试字数统计。" count = cleaner.count_words(text) assert count > 0 def test_count_words_mixed(self, cleaner): """Test word counting for mixed text.""" text = "这里有 Chinese 和 English 123 混合" count = cleaner.count_words(text) assert count > 0 def test_truncate_short_text(self, cleaner): """Test truncating short text (no change).""" text = "短文本" result = cleaner.truncate(text, 100) assert result == text def test_truncate_long_text(self, cleaner): """Test truncating long text.""" text = "这是一段很长的文本需要被截断" result = cleaner.truncate(text, 10) # Each Chinese character is 1 byte in Python string length assert len(result) <= 13 # 10 chars + "..." (but might be less due to multibyte) assert result.endswith("...") def test_split_into_sentences_chinese(self, cleaner): """Test splitting Chinese text into sentences.""" text = "这是第一句。这是第二句!这是第三句?" sentences = cleaner.split_into_sentences(text) assert len(sentences) == 3 assert "第一句" in sentences[0] def test_split_into_sentences_english(self, cleaner): """Test splitting English text into sentences.""" text = "This is first. This is second! This is third?" sentences = cleaner.split_into_sentences(text) assert len(sentences) >= 2 def test_cleaning_preserves_content(self, cleaner): """Test that cleaning doesn't remove important content.""" text = "第一章 开始\n\n林风站在山顶,看着远方的城市。\n\n" \ "\"你好,\"他说道。\n\n这是重要的对话内容。" result = cleaner.clean(text) assert "第一章" in result assert "林风" in result assert "山顶" in result def test_fix_punctuation_spacing(self, cleaner): """Test fixing spacing around punctuation.""" text = "这是句子 ,还有句子 。 还有感叹号 !" result = cleaner.fix_punctuation_func(text) assert " ," not in result # No space before Chinese comma assert " 。" not in result # No space before Chinese period def test_mismatched_parentheses(self, cleaner): """Test fixing mismatched parentheses.""" text = "这是(左括号和)右括号" result = cleaner.fix_punctuation_func(text) # Should normalize to matching pairs def test_disabled_options(self): """Test cleaner with options disabled.""" text = " 文本 。。\x00" cleaner = TextCleaner( remove_extra_whitespace=False, fix_punctuation=False, remove_invalid_chars=False ) result = cleaner.clean(text) # Should preserve most of the original assert " " in result # Extra spaces preserved def test_trailing_whitespace_removal(self, cleaner): """Test removal of trailing whitespace.""" text = "第一行 \n第二行\t\n第三行 " result = cleaner.remove_extra_whitespace_func(text) assert not result.endswith(" ") assert not result.endswith("\t") def test_empty_lines_preservation(self, cleaner): """Test that single empty lines are preserved.""" text = "第一段\n\n第二段" result = cleaner.remove_extra_whitespace_func(text) assert "\n\n" in result def test_multiple_consecutive_punctuation(self, cleaner): """Test handling of multiple consecutive punctuation marks.""" text = "什么!??真的。。。好吧。。。" result = cleaner.fix_punctuation_func(text) assert "!?" in result or "?" in result assert "。。" not in result def test_colon_and_semicolon_fix(self, cleaner): """Test fixing colon and semicolon issues.""" text = "这是::测试;;内容" result = cleaner.fix_punctuation_func(text) assert "::" not in result assert ";;" not in result