""" Unit tests for ChapterSplitter. """ import pytest from src.cleaning.splitter import ChapterSplitter, ChapterSplitterError class TestChapterSplitter: """Test suite for ChapterSplitter.""" @pytest.fixture def splitter(self): """Create a ChapterSplitter instance with low min length for testing.""" return ChapterSplitter(min_chapter_length=10, merge_short_chapters=False) def test_split_chinese_numerals(self, splitter): """Test splitting Chinese chapter titles (第一章, etc.).""" text = """第一章 开始 这是第一章的内容。 第二章 继续 这是第二章的内容。""" chapters = splitter.split(text) assert len(chapters) == 2 assert "第一章" in chapters[0].title assert "第二章" in chapters[1].title def test_split_arabic_chinese(self, splitter): """Test splitting Arabic-Chinese chapter titles (第1章, etc.).""" text = """第1章 开始 这是第一章的内容。 第2章 继续 这是第二章的内容。""" chapters = splitter.split(text) assert len(chapters) == 2 assert "第1章" in chapters[0].title assert "第2章" in chapters[1].title def test_split_english_format(self, splitter): """Test splitting English chapter titles.""" text = """Chapter 1 The Beginning This is chapter one. Chapter 2 The Journey Continues This is chapter two.""" chapters = splitter.split(text) assert len(chapters) >= 2 assert "Chapter 1" in chapters[0].title def test_split_numbered_format(self, splitter): """Test splitting numbered chapter titles.""" text = """1. The Start Content here. 2. The Middle More content.""" chapters = splitter.split(text) assert len(chapters) >= 2 assert "1." in chapters[0].title def test_split_date_format(self, splitter): """Test splitting date format chapters.""" text = """2024年3月15日 这是第一天的内容。 2024年3月16日 这是第二天的内容。""" chapters = splitter.split(text) assert len(chapters) >= 2 def test_split_volume_format(self, splitter): """Test splitting volume format (第一卷, etc.).""" text = """第一卷 命运的开始 这是第一卷的内容。 第二卷 奇遇 这是第二卷的内容。""" chapters = splitter.split(text) assert len(chapters) >= 2 def test_split_with_brackets(self, splitter): """Test splitting bracket format chapters.""" text = """[1] 开始 内容。 [2] 继续 更多内容。""" chapters = splitter.split(text) assert len(chapters) >= 2 def test_empty_text_returns_empty_list(self, splitter): """Test that empty text returns empty chapter list.""" chapters = splitter.split("") assert chapters == [] def test_no_chapter_titles_returns_single_chapter(self, splitter): """Test that text without chapter titles becomes one chapter.""" text = "这是一段没有章节标题的文本。\n第二行内容。" chapters = splitter.split(text) assert len(chapters) == 1 assert chapters[0].title == "全文" def test_chapter_char_count(self, splitter): """Test that chapter character count is correct.""" text = """第一章 测试章节 这是第一章的内容,包含一些文字。 第二章 第二个章节 这是第二章的内容。""" chapters = splitter.split(text) assert chapters[0].char_count > 0 assert len(chapters[0].content) == chapters[0].char_count def test_chapter_positions(self, splitter): """Test that chapter start/end positions are correct.""" text = "第一章\n内容1\n\n第二章\n内容2" chapters = splitter.split(text) assert len(chapters) >= 2 if chapters[0].start_pos is not None: assert chapters[0].start_pos == 0 def test_detect_chapter_title_chinese(self, splitter): """Test chapter title detection for Chinese format.""" assert splitter.is_chapter_title("第一章 开始") assert splitter.is_chapter_title("第123章") assert splitter.is_chapter_title("第十章 约战") def test_detect_chapter_title_english(self, splitter): """Test chapter title detection for English format.""" assert splitter.is_chapter_title("Chapter 1") assert splitter.is_chapter_title("Chapter One - The Beginning") def test_detect_chapter_title_numbered(self, splitter): """Test chapter title detection for numbered format.""" assert splitter.is_chapter_title("1. Start") assert splitter.is_chapter_title("123. End") def test_detect_chapter_title_date(self, splitter): """Test chapter title detection for date format.""" assert splitter.is_chapter_title("2024年3月15日") assert splitter.is_chapter_title("2024年12月1日") def test_detect_not_chapter_title(self, splitter): """Test that non-titles are correctly identified.""" assert not splitter.is_chapter_title("这是一个普通的句子") assert not splitter.is_chapter_title("") assert not splitter.is_chapter_title("hello world") def test_get_chapter_count(self, splitter): """Test getting chapter count without full split.""" text = """第一章 开始 内容。 第二章 继续 更多内容。 第三章 结束 最后内容。""" count = splitter.get_chapter_count(text) assert count == 3 def test_get_chapter_count_no_chapters(self, splitter): """Test getting chapter count for text without chapters.""" text = "这是一段没有章节的文本。" count = splitter.get_chapter_count(text) assert count == 1 def test_preview_chapters(self, splitter): """Test getting chapter previews.""" text = """第一章 开始 这是第一章的内容,包含一些文字。 第二章 继续 这是第二章的内容,包含更多文字。""" previews = splitter.preview_chapters(text, preview_length=50) assert len(previews) >= 2 assert "第一章" in previews[0] assert "第二章" in previews[1] def test_merge_short_chapters_enabled(self): """Test that short chapters are merged when enabled.""" text = """第一章 开始 短。 第二章 中间 这是第二章较长的内容。 第三章 结尾 也短。""" splitter = ChapterSplitter(min_chapter_length=50, merge_short_chapters=True) chapters = splitter.split(text) # Short chapters should be merged with adjacent ones assert len(chapters) <= 3 def test_merge_short_chapters_disabled(self): """Test that short chapters are kept when merging disabled.""" text = """第一章 开始 短内容。 第二章 继续 更多内容。""" splitter = ChapterSplitter(min_chapter_length=1000, merge_short_chapters=False) chapters = splitter.split(text) # All chapters should be kept assert len(chapters) == 2 def test_custom_patterns(self): """Test using custom chapter patterns.""" text = """EPISODE 1 Start Content. EPISODE 2 Middle More content.""" custom_patterns = [(r'^EPISODE\s+\d+', 1)] splitter = ChapterSplitter( min_chapter_length=10, merge_short_chapters=False, custom_patterns=custom_patterns ) chapters = splitter.split(text) assert len(chapters) >= 2 assert "EPISODE 1" in chapters[0].title def test_mixed_pattern_types(self, splitter): """Test handling mixed chapter pattern types.""" text = """第一章 开始 内容。 Chapter 2 Middle English content. 第三章 End 中文内容。""" chapters = splitter.split(text) # Should detect all chapters despite mixed formats assert len(chapters) >= 3 def test_chapter_with_special_characters(self, splitter): """Test chapters with special characters in title.""" text = """第一章:命运的齿轮! 内容。 第二章 - 新的开始 更多内容。""" chapters = splitter.split(text) assert len(chapters) >= 2 def test_large_chapter_count(self, splitter): """Test handling many chapters.""" # Create text with 100 chapters lines = [] for i in range(1, 101): lines.append(f"第{i}章") lines.append(f"这是第{i}章的内容。\n") text = "\n".join(lines) chapters = splitter.split(text) assert len(chapters) == 100 def test_consecutive_chapter_titles(self, splitter): """Test handling consecutive chapter titles without content.""" text = """第一章 第二章 这是第二章的内容。 第三章 这是第三章的内容。""" chapters = splitter.split(text) # Should handle empty chapters gracefully assert len(chapters) >= 2 def test_chapter_with_leading_whitespace(self, splitter): """Test chapter titles with leading whitespace.""" text = """ 第一章 开始 内容。 第二章 继续 更多内容。""" chapters = splitter.split(text) assert len(chapters) >= 2 def test_detect_chapter_title_returns_priority(self, splitter): """Test that detect_chapter_title returns priority.""" result = splitter.detect_chapter_title("第一章 开始") assert result is not None priority, title = result assert isinstance(priority, int) assert isinstance(title, str) def test_word_count_property(self, splitter): """Test chapter word_count property.""" text = """第一章 测试 这是测试内容。""" chapters = splitter.split(text) assert chapters[0].word_count > 0 def test_len_operator(self, splitter): """Test len() operator on Chapter.""" text = """第一章 测试 这是测试内容。""" chapters = splitter.split(text) assert len(chapters[0]) == chapters[0].char_count