| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363 |
- """
- Unit tests for ChapterSplitter.
- """
- import pytest
- from src.cleaning.splitter import ChapterSplitter, ChapterSplitterError
class TestChapterSplitter:
    """Tests for ChapterSplitter: splitting, title detection, counting, previews."""

    @pytest.fixture
    def splitter(self):
        """Provide a splitter with a minimum length of 10 and merging disabled."""
        return ChapterSplitter(
            min_chapter_length=10,
            merge_short_chapters=False,
        )

    def test_split_chinese_numerals(self, splitter):
        """Two headings written with Chinese numerals yield exactly two chapters."""
        source = (
            "第一章 开始\n"
            "这是第一章的内容。\n"
            "第二章 继续\n"
            "这是第二章的内容。"
        )
        parts = splitter.split(source)
        assert len(parts) == 2
        assert "第一章" in parts[0].title
        assert "第二章" in parts[1].title

    def test_split_arabic_chinese(self, splitter):
        """Two headings of the form 第N章 (Arabic digits) yield exactly two chapters."""
        source = (
            "第1章 开始\n"
            "这是第一章的内容。\n"
            "第2章 继续\n"
            "这是第二章的内容。"
        )
        parts = splitter.split(source)
        assert len(parts) == 2
        assert "第1章" in parts[0].title
        assert "第2章" in parts[1].title

    def test_split_english_format(self, splitter):
        """English 'Chapter N' headings produce at least two chapters."""
        source = (
            "Chapter 1 The Beginning\n"
            "This is chapter one.\n"
            "Chapter 2 The Journey Continues\n"
            "This is chapter two."
        )
        parts = splitter.split(source)
        assert len(parts) >= 2
        assert "Chapter 1" in parts[0].title

    def test_split_numbered_format(self, splitter):
        """Headings of the form 'N. Title' produce at least two chapters."""
        source = (
            "1. The Start\n"
            "Content here.\n"
            "2. The Middle\n"
            "More content."
        )
        parts = splitter.split(source)
        assert len(parts) >= 2
        assert "1." in parts[0].title

    def test_split_date_format(self, splitter):
        """Date lines (YYYY年M月D日) produce at least two chapters."""
        source = (
            "2024年3月15日\n"
            "这是第一天的内容。\n"
            "2024年3月16日\n"
            "这是第二天的内容。"
        )
        parts = splitter.split(source)
        assert len(parts) >= 2

    def test_split_volume_format(self, splitter):
        """Volume headings (第一卷, ...) produce at least two chapters."""
        source = (
            "第一卷 命运的开始\n"
            "这是第一卷的内容。\n"
            "第二卷 奇遇\n"
            "这是第二卷的内容。"
        )
        parts = splitter.split(source)
        assert len(parts) >= 2

    def test_split_with_brackets(self, splitter):
        """Bracketed numbers ([1], [2]) produce at least two chapters."""
        source = (
            "[1] 开始\n"
            "内容。\n"
            "[2] 继续\n"
            "更多内容。"
        )
        parts = splitter.split(source)
        assert len(parts) >= 2

    def test_empty_text_returns_empty_list(self, splitter):
        """Splitting the empty string gives an empty list."""
        assert splitter.split("") == []

    def test_no_chapter_titles_returns_single_chapter(self, splitter):
        """Text with no heading comes back as one chapter titled '全文'."""
        source = "这是一段没有章节标题的文本。\n第二行内容。"
        parts = splitter.split(source)
        assert len(parts) == 1
        assert parts[0].title == "全文"

    def test_chapter_char_count(self, splitter):
        """char_count is positive and equals the length of the chapter content."""
        source = (
            "第一章 测试章节\n"
            "这是第一章的内容,包含一些文字。\n"
            "第二章 第二个章节\n"
            "这是第二章的内容。"
        )
        parts = splitter.split(source)
        assert parts[0].char_count > 0
        assert len(parts[0].content) == parts[0].char_count

    def test_chapter_positions(self, splitter):
        """When the first chapter reports a start position, that position is 0."""
        source = "第一章\n内容1\n\n第二章\n内容2"
        parts = splitter.split(source)
        assert len(parts) >= 2
        opening = parts[0]
        # start_pos may be None; the position is only checked when it is set.
        if opening.start_pos is not None:
            assert opening.start_pos == 0

    def test_detect_chapter_title_chinese(self, splitter):
        """Chinese-style headings are recognised as chapter titles."""
        for heading in ("第一章 开始", "第123章", "第十章 约战"):
            assert splitter.is_chapter_title(heading)

    def test_detect_chapter_title_english(self, splitter):
        """English 'Chapter ...' headings are recognised as chapter titles."""
        for heading in ("Chapter 1", "Chapter One - The Beginning"):
            assert splitter.is_chapter_title(heading)

    def test_detect_chapter_title_numbered(self, splitter):
        """Headings of the form 'N. Title' are recognised as chapter titles."""
        for heading in ("1. Start", "123. End"):
            assert splitter.is_chapter_title(heading)

    def test_detect_chapter_title_date(self, splitter):
        """Date lines are recognised as chapter titles."""
        for heading in ("2024年3月15日", "2024年12月1日"):
            assert splitter.is_chapter_title(heading)

    def test_detect_not_chapter_title(self, splitter):
        """Ordinary sentences and the empty string are not chapter titles."""
        for line in ("这是一个普通的句子", "", "hello world"):
            assert not splitter.is_chapter_title(line)

    def test_get_chapter_count(self, splitter):
        """get_chapter_count reports 3 for a text with three headings."""
        source = (
            "第一章 开始\n"
            "内容。\n"
            "第二章 继续\n"
            "更多内容。\n"
            "第三章 结束\n"
            "最后内容。"
        )
        total = splitter.get_chapter_count(source)
        assert total == 3

    def test_get_chapter_count_no_chapters(self, splitter):
        """get_chapter_count reports 1 for a text with no headings."""
        source = "这是一段没有章节的文本。"
        total = splitter.get_chapter_count(source)
        assert total == 1

    def test_preview_chapters(self, splitter):
        """preview_chapters returns one preview per chapter, each naming its heading."""
        source = (
            "第一章 开始\n"
            "这是第一章的内容,包含一些文字。\n"
            "第二章 继续\n"
            "这是第二章的内容,包含更多文字。"
        )
        snippets = splitter.preview_chapters(source, preview_length=50)
        assert len(snippets) >= 2
        assert "第一章" in snippets[0]
        assert "第二章" in snippets[1]

    def test_merge_short_chapters_enabled(self):
        """With merging on, three short chapters yield no more than three results."""
        merging_splitter = ChapterSplitter(
            min_chapter_length=50,
            merge_short_chapters=True,
        )
        source = (
            "第一章 开始\n"
            "短。\n"
            "第二章 中间\n"
            "这是第二章较长的内容。\n"
            "第三章 结尾\n"
            "也短。"
        )
        parts = merging_splitter.split(source)
        # Merging can only reduce the number of chapters, never add to it.
        assert len(parts) <= 3

    def test_merge_short_chapters_disabled(self):
        """With merging off, chapters below the minimum length are still returned."""
        strict_splitter = ChapterSplitter(
            min_chapter_length=1000,
            merge_short_chapters=False,
        )
        source = (
            "第一章 开始\n"
            "短内容。\n"
            "第二章 继续\n"
            "更多内容。"
        )
        parts = strict_splitter.split(source)
        # Both chapters are far shorter than 1000 characters yet must survive.
        assert len(parts) == 2

    def test_custom_patterns(self):
        """A caller-supplied (regex, priority) pattern is used to find headings."""
        episode_splitter = ChapterSplitter(
            min_chapter_length=10,
            merge_short_chapters=False,
            custom_patterns=[(r"^EPISODE\s+\d+", 1)],
        )
        source = (
            "EPISODE 1 Start\n"
            "Content.\n"
            "EPISODE 2 Middle\n"
            "More content."
        )
        parts = episode_splitter.split(source)
        assert len(parts) >= 2
        assert "EPISODE 1" in parts[0].title

    def test_mixed_pattern_types(self, splitter):
        """Chinese and English headings in the same text are all detected."""
        source = (
            "第一章 开始\n"
            "内容。\n"
            "Chapter 2 Middle\n"
            "English content.\n"
            "第三章 End\n"
            "中文内容。"
        )
        parts = splitter.split(source)
        # Every heading counts, whichever format it uses.
        assert len(parts) >= 3

    def test_chapter_with_special_characters(self, splitter):
        """Punctuation after the chapter marker does not prevent detection."""
        source = (
            "第一章:命运的齿轮!\n"
            "内容。\n"
            "第二章 - 新的开始\n"
            "更多内容。"
        )
        parts = splitter.split(source)
        assert len(parts) >= 2

    def test_large_chapter_count(self, splitter):
        """A text with 100 headings is split into exactly 100 chapters."""
        # Each chapter is a heading line plus one content line ending in a
        # newline; joining with "\n" leaves a blank line between chapters.
        source = "\n".join(
            f"第{number}章\n这是第{number}章的内容。\n"
            for number in range(1, 101)
        )
        parts = splitter.split(source)
        assert len(parts) == 100

    def test_consecutive_chapter_titles(self, splitter):
        """A heading followed directly by another heading is tolerated."""
        source = (
            "第一章\n"
            "第二章\n"
            "这是第二章的内容。\n"
            "第三章\n"
            "这是第三章的内容。"
        )
        parts = splitter.split(source)
        # The first chapter has no body; at least two chapters must remain.
        assert len(parts) >= 2

    def test_chapter_with_leading_whitespace(self, splitter):
        """A heading preceded by whitespace is still treated as a heading."""
        source = (
            " 第一章 开始\n"
            "内容。\n"
            "第二章 继续\n"
            "更多内容。"
        )
        parts = splitter.split(source)
        assert len(parts) >= 2

    def test_detect_chapter_title_returns_priority(self, splitter):
        """detect_chapter_title returns an (int, str) pair for a recognised heading."""
        detected = splitter.detect_chapter_title("第一章 开始")
        assert detected is not None
        rank, heading = detected
        assert isinstance(rank, int)
        assert isinstance(heading, str)

    def test_word_count_property(self, splitter):
        """word_count of a chapter with content is positive."""
        source = (
            "第一章 测试\n"
            "这是测试内容。"
        )
        parts = splitter.split(source)
        assert parts[0].word_count > 0

    def test_len_operator(self, splitter):
        """len() of a chapter equals its char_count."""
        source = (
            "第一章 测试\n"
            "这是测试内容。"
        )
        parts = splitter.split(source)
        assert len(parts[0]) == parts[0].char_count
|