# Repository: 223-template-236/blank (forked from 137-template-113/blank)
"""
Unit tests for TextCleaner.
"""

import pytest
from src.cleaning.cleaner import TextCleaner


class TestTextCleaner:
    """Test suite for TextCleaner.

    Every test receives a fresh ``TextCleaner`` through the ``cleaner``
    fixture unless it needs non-default constructor options, in which case
    it builds its own instance.
    """

    @pytest.fixture
    def cleaner(self):
        """Return a TextCleaner built with its default options."""
        return TextCleaner()

    def test_clean_empty_text(self, cleaner):
        """Cleaning an empty string yields an empty string.

        NOTE(review): the second assertion used to read
        ``cleaner.clean(None) if None else "" == cleaner.clean("")``.
        Because the condition is the literal ``None`` (always falsy), that
        expression never called ``clean(None)`` and only re-checked the
        empty-string case. It is written out explicitly below; the ``None``
        contract of ``clean`` is still untested -- TODO confirm the intended
        behavior and add a real assertion for it.
        """
        assert cleaner.clean("") == ""
        # Second call on the same instance must give the same result.
        assert "" == cleaner.clean("")

    def test_remove_extra_whitespace(self, cleaner):
        """No run of two spaces survives, and the text itself is kept."""
        text = "这是  一段    有很多空格的  文本。"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "  " not in result
        assert "很多" in result

    def test_remove_multiple_newlines(self, cleaner):
        """A long run of newlines is shortened while both lines are kept."""
        text = "第一行\n\n\n\n\n第二行"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "第一行" in result
        assert "第二行" in result
        # Only a reduction is asserted here; the exact number of newlines
        # the cleaner keeps is not pinned by this test.
        assert result.count("\n") < text.count("\n")

    def test_preserve_paragraph_structure(self, cleaner):
        """Blank-line paragraph breaks are still present after cleaning."""
        text = "第一段\n\n第二段\n\n第三段"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "\n\n" in result  # Paragraph breaks should be kept

    def test_fix_multiple_periods(self, cleaner):
        """Repeated Chinese periods do not remain doubled."""
        text = "这是第一句。。这是第二句。。。"
        result = cleaner.fix_punctuation_func(text)
        assert "。。" not in result

    def test_fix_mixed_punctuation(self, cleaner):
        """Mixed Chinese/English punctuation still leaves a Chinese period."""
        text = "这是句子。，也是句子。!"
        result = cleaner.fix_punctuation_func(text)
        # Deliberately loose: only checks that a Chinese period survives,
        # not how the mixed marks are resolved.
        assert "。" in result

    def test_fix_multiple_exclamations(self, cleaner):
        """Repeated full-width exclamation marks do not remain doubled."""
        text = "太棒了！！！！！"
        result = cleaner.fix_punctuation_func(text)
        assert "！！" not in result

    def test_fix_multiple_question_marks(self, cleaner):
        """Repeated full-width question marks do not remain doubled."""
        text = "真的吗？？？"
        result = cleaner.fix_punctuation_func(text)
        assert "？？" not in result

    def test_remove_invalid_chars(self, cleaner):
        """Control characters are dropped and regular text is kept."""
        text = "正常文本\x00\x01\x02更多文本"
        result = cleaner.remove_invalid_chars_func(text)
        assert "\x00" not in result
        assert "\x01" not in result
        assert "正常文本" in result

    def test_remove_bom_character(self, cleaner):
        """A leading byte-order mark (U+FEFF) is removed."""
        text = "\ufeff这是文本"
        result = cleaner.remove_invalid_chars_func(text)
        assert "\ufeff" not in result

    def test_normalize_quotes(self, cleaner):
        """At least one ASCII quote character is present after normalizing.

        NOTE(review): the input already uses ASCII quotes, so this only
        shows that they are not all stripped or replaced; it does not
        exercise conversion from typographic quotes.
        """
        text = '这是\'引号\'和"双引号"内容'
        result = cleaner.normalize_quotes_func(text)
        assert "'" in result or '"' in result

    def test_full_cleaning_pipeline(self, cleaner):
        """The default pipeline strips NULs, doubled periods and edge spaces."""
        text = "  这是  一段  有问题\x00的文本。。\n\n\n还有多余的空格！  "
        result = cleaner.clean(text)
        assert "\x00" not in result
        assert "。。" not in result
        assert not result.startswith(" ")
        assert not result.endswith(" ")

    def test_remove_urls(self, cleaner):
        """An https URL does not survive the default pipeline."""
        text = "访问 https://example.com 查看更多信息"
        result = cleaner.clean(text)
        assert "https://" not in result

    def test_remove_email_addresses(self, cleaner):
        """No ``@`` remains after cleaning text containing an email address."""
        text = "联系 test@example.com 获取更多信息"
        result = cleaner.clean(text)
        assert "@" not in result

    def test_custom_removal_patterns(self):
        """A regex passed via ``custom_removals`` is applied by ``clean``."""
        text = "这是 [ISBN:123] 一些文字 [ISBN:456] 更多文字"
        cleaner = TextCleaner(custom_removals=[r'\[ISBN:\d+\]'])
        result = cleaner.clean(text)
        assert "[ISBN:" not in result

    def test_remove_ads(self, cleaner):
        """Boilerplate site phrases are removed by ``remove_ads``."""
        text = "这是小说内容。本章完。请收藏本站。更多精彩内容。"
        result = cleaner.remove_ads(text)
        assert "本章完" not in result
        assert "请收藏" not in result

    def test_extract_numbers(self, cleaner):
        """Integers and a decimal are all reported by ``extract_numbers``."""
        text = "林风今年18岁，身高175.5厘米，有3个朋友。"
        numbers = cleaner.extract_numbers(text)
        assert "18" in numbers
        # The decimal is expected with its fractional part intact.
        assert "175.5" in numbers
        assert "3" in numbers

    def test_count_words_chinese(self, cleaner):
        """Chinese-only text produces a positive word count."""
        text = "这是一段中文文本用于测试字数统计。"
        count = cleaner.count_words(text)
        assert count > 0

    def test_count_words_mixed(self, cleaner):
        """Mixed Chinese/English/digit text produces a positive word count."""
        text = "这里有 Chinese 和 English 123 混合"
        count = cleaner.count_words(text)
        assert count > 0

    def test_truncate_short_text(self, cleaner):
        """Text shorter than the limit is returned unchanged."""
        text = "短文本"
        result = cleaner.truncate(text, 100)
        assert result == text

    def test_truncate_long_text(self, cleaner):
        """Text over the limit is cut and suffixed with an ellipsis."""
        text = "这是一段很长的文本需要被截断"
        result = cleaner.truncate(text, 10)
        # len() counts code points, so each Chinese character counts as 1.
        # Upper bound: the 10-character limit plus the 3-character "...".
        assert len(result) <= 13
        assert result.endswith("...")

    def test_split_into_sentences_chinese(self, cleaner):
        """Three Chinese sentences are split on 。！？ into three items."""
        text = "这是第一句。这是第二句！这是第三句？"
        sentences = cleaner.split_into_sentences(text)
        assert len(sentences) == 3
        assert "第一句" in sentences[0]

    def test_split_into_sentences_english(self, cleaner):
        """English text is split into at least two sentences."""
        text = "This is first. This is second! This is third?"
        sentences = cleaner.split_into_sentences(text)
        assert len(sentences) >= 2

    def test_cleaning_preserves_content(self, cleaner):
        """Headings and narrative words survive the default pipeline."""
        text = (
            "第一章 开始\n\n林风站在山顶，看着远方的城市。\n\n"
            "\"你好，\"他说道。\n\n这是重要的对话内容。"
        )
        result = cleaner.clean(text)
        assert "第一章" in result
        assert "林风" in result
        assert "山顶" in result

    def test_fix_punctuation_spacing(self, cleaner):
        """Spaces before a Chinese comma or period are removed."""
        text = "这是句子 ，还有句子 。 还有感叹号 ！"
        result = cleaner.fix_punctuation_func(text)
        assert " ，" not in result  # No space before Chinese comma
        assert " 。" not in result  # No space before Chinese period

    def test_mismatched_parentheses(self, cleaner):
        """Mixed full-width/ASCII parentheses are handled without error.

        NOTE(review): the expected normalized form is not specified
        anywhere in this file, so only the result type is checked -- TODO
        tighten this once the intended pairing is confirmed.
        """
        text = "这是（左括号和)右括号"
        result = cleaner.fix_punctuation_func(text)
        assert isinstance(result, str)

    def test_disabled_options(self):
        """With whitespace cleanup disabled, doubled spaces are kept."""
        text = "  文本  。。\x00"
        cleaner = TextCleaner(
            remove_extra_whitespace=False,
            fix_punctuation=False,
            remove_invalid_chars=False
        )
        result = cleaner.clean(text)
        # Only the whitespace option is verified here.
        assert "  " in result  # Extra spaces preserved

    def test_trailing_whitespace_removal(self, cleaner):
        """The cleaned text does not end in a space or a tab."""
        text = "第一行  \n第二行\t\n第三行   "
        result = cleaner.remove_extra_whitespace_func(text)
        assert not result.endswith(" ")
        assert not result.endswith("\t")

    def test_empty_lines_preservation(self, cleaner):
        """A single empty line between paragraphs is preserved."""
        text = "第一段\n\n第二段"
        result = cleaner.remove_extra_whitespace_func(text)
        assert "\n\n" in result

    def test_multiple_consecutive_punctuation(self, cleaner):
        """Runs of mixed marks keep a question mark and lose doubled periods."""
        text = "什么！？？真的。。。好吧。。。"
        result = cleaner.fix_punctuation_func(text)
        assert "！？" in result or "？" in result
        assert "。。" not in result

    def test_colon_and_semicolon_fix(self, cleaner):
        """Doubled full-width colons and semicolons are collapsed."""
        text = "这是：：测试；；内容"
        result = cleaner.fix_punctuation_func(text)
        assert "：：" not in result
        assert "；；" not in result