test_splitter.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. """
  2. Unit tests for ChapterSplitter.
  3. """
  4. import pytest
  5. from src.cleaning.splitter import ChapterSplitter, ChapterSplitterError
  6. class TestChapterSplitter:
  7. """Test suite for ChapterSplitter."""
  8. @pytest.fixture
  9. def splitter(self):
  10. """Create a ChapterSplitter instance with low min length for testing."""
  11. return ChapterSplitter(min_chapter_length=10, merge_short_chapters=False)
  12. def test_split_chinese_numerals(self, splitter):
  13. """Test splitting Chinese chapter titles (第一章, etc.)."""
  14. text = """第一章 开始
  15. 这是第一章的内容。
  16. 第二章 继续
  17. 这是第二章的内容。"""
  18. chapters = splitter.split(text)
  19. assert len(chapters) == 2
  20. assert "第一章" in chapters[0].title
  21. assert "第二章" in chapters[1].title
  22. def test_split_arabic_chinese(self, splitter):
  23. """Test splitting Arabic-Chinese chapter titles (第1章, etc.)."""
  24. text = """第1章 开始
  25. 这是第一章的内容。
  26. 第2章 继续
  27. 这是第二章的内容。"""
  28. chapters = splitter.split(text)
  29. assert len(chapters) == 2
  30. assert "第1章" in chapters[0].title
  31. assert "第2章" in chapters[1].title
  32. def test_split_english_format(self, splitter):
  33. """Test splitting English chapter titles."""
  34. text = """Chapter 1 The Beginning
  35. This is chapter one.
  36. Chapter 2 The Journey Continues
  37. This is chapter two."""
  38. chapters = splitter.split(text)
  39. assert len(chapters) >= 2
  40. assert "Chapter 1" in chapters[0].title
  41. def test_split_numbered_format(self, splitter):
  42. """Test splitting numbered chapter titles."""
  43. text = """1. The Start
  44. Content here.
  45. 2. The Middle
  46. More content."""
  47. chapters = splitter.split(text)
  48. assert len(chapters) >= 2
  49. assert "1." in chapters[0].title
  50. def test_split_date_format(self, splitter):
  51. """Test splitting date format chapters."""
  52. text = """2024年3月15日
  53. 这是第一天的内容。
  54. 2024年3月16日
  55. 这是第二天的内容。"""
  56. chapters = splitter.split(text)
  57. assert len(chapters) >= 2
  58. def test_split_volume_format(self, splitter):
  59. """Test splitting volume format (第一卷, etc.)."""
  60. text = """第一卷 命运的开始
  61. 这是第一卷的内容。
  62. 第二卷 奇遇
  63. 这是第二卷的内容。"""
  64. chapters = splitter.split(text)
  65. assert len(chapters) >= 2
  66. def test_split_with_brackets(self, splitter):
  67. """Test splitting bracket format chapters."""
  68. text = """[1] 开始
  69. 内容。
  70. [2] 继续
  71. 更多内容。"""
  72. chapters = splitter.split(text)
  73. assert len(chapters) >= 2
  74. def test_empty_text_returns_empty_list(self, splitter):
  75. """Test that empty text returns empty chapter list."""
  76. chapters = splitter.split("")
  77. assert chapters == []
  78. def test_no_chapter_titles_returns_single_chapter(self, splitter):
  79. """Test that text without chapter titles becomes one chapter."""
  80. text = "这是一段没有章节标题的文本。\n第二行内容。"
  81. chapters = splitter.split(text)
  82. assert len(chapters) == 1
  83. assert chapters[0].title == "全文"
  84. def test_chapter_char_count(self, splitter):
  85. """Test that chapter character count is correct."""
  86. text = """第一章 测试章节
  87. 这是第一章的内容,包含一些文字。
  88. 第二章 第二个章节
  89. 这是第二章的内容。"""
  90. chapters = splitter.split(text)
  91. assert chapters[0].char_count > 0
  92. assert len(chapters[0].content) == chapters[0].char_count
  93. def test_chapter_positions(self, splitter):
  94. """Test that chapter start/end positions are correct."""
  95. text = "第一章\n内容1\n\n第二章\n内容2"
  96. chapters = splitter.split(text)
  97. assert len(chapters) >= 2
  98. if chapters[0].start_pos is not None:
  99. assert chapters[0].start_pos == 0
  100. def test_detect_chapter_title_chinese(self, splitter):
  101. """Test chapter title detection for Chinese format."""
  102. assert splitter.is_chapter_title("第一章 开始")
  103. assert splitter.is_chapter_title("第123章")
  104. assert splitter.is_chapter_title("第十章 约战")
  105. def test_detect_chapter_title_english(self, splitter):
  106. """Test chapter title detection for English format."""
  107. assert splitter.is_chapter_title("Chapter 1")
  108. assert splitter.is_chapter_title("Chapter One - The Beginning")
  109. def test_detect_chapter_title_numbered(self, splitter):
  110. """Test chapter title detection for numbered format."""
  111. assert splitter.is_chapter_title("1. Start")
  112. assert splitter.is_chapter_title("123. End")
  113. def test_detect_chapter_title_date(self, splitter):
  114. """Test chapter title detection for date format."""
  115. assert splitter.is_chapter_title("2024年3月15日")
  116. assert splitter.is_chapter_title("2024年12月1日")
  117. def test_detect_not_chapter_title(self, splitter):
  118. """Test that non-titles are correctly identified."""
  119. assert not splitter.is_chapter_title("这是一个普通的句子")
  120. assert not splitter.is_chapter_title("")
  121. assert not splitter.is_chapter_title("hello world")
  122. def test_get_chapter_count(self, splitter):
  123. """Test getting chapter count without full split."""
  124. text = """第一章 开始
  125. 内容。
  126. 第二章 继续
  127. 更多内容。
  128. 第三章 结束
  129. 最后内容。"""
  130. count = splitter.get_chapter_count(text)
  131. assert count == 3
  132. def test_get_chapter_count_no_chapters(self, splitter):
  133. """Test getting chapter count for text without chapters."""
  134. text = "这是一段没有章节的文本。"
  135. count = splitter.get_chapter_count(text)
  136. assert count == 1
  137. def test_preview_chapters(self, splitter):
  138. """Test getting chapter previews."""
  139. text = """第一章 开始
  140. 这是第一章的内容,包含一些文字。
  141. 第二章 继续
  142. 这是第二章的内容,包含更多文字。"""
  143. previews = splitter.preview_chapters(text, preview_length=50)
  144. assert len(previews) >= 2
  145. assert "第一章" in previews[0]
  146. assert "第二章" in previews[1]
  147. def test_merge_short_chapters_enabled(self):
  148. """Test that short chapters are merged when enabled."""
  149. text = """第一章 开始
  150. 短。
  151. 第二章 中间
  152. 这是第二章较长的内容。
  153. 第三章 结尾
  154. 也短。"""
  155. splitter = ChapterSplitter(min_chapter_length=50, merge_short_chapters=True)
  156. chapters = splitter.split(text)
  157. # Short chapters should be merged with adjacent ones
  158. assert len(chapters) <= 3
  159. def test_merge_short_chapters_disabled(self):
  160. """Test that short chapters are kept when merging disabled."""
  161. text = """第一章 开始
  162. 短内容。
  163. 第二章 继续
  164. 更多内容。"""
  165. splitter = ChapterSplitter(min_chapter_length=1000, merge_short_chapters=False)
  166. chapters = splitter.split(text)
  167. # All chapters should be kept
  168. assert len(chapters) == 2
  169. def test_custom_patterns(self):
  170. """Test using custom chapter patterns."""
  171. text = """EPISODE 1 Start
  172. Content.
  173. EPISODE 2 Middle
  174. More content."""
  175. custom_patterns = [(r'^EPISODE\s+\d+', 1)]
  176. splitter = ChapterSplitter(
  177. min_chapter_length=10,
  178. merge_short_chapters=False,
  179. custom_patterns=custom_patterns
  180. )
  181. chapters = splitter.split(text)
  182. assert len(chapters) >= 2
  183. assert "EPISODE 1" in chapters[0].title
  184. def test_mixed_pattern_types(self, splitter):
  185. """Test handling mixed chapter pattern types."""
  186. text = """第一章 开始
  187. 内容。
  188. Chapter 2 Middle
  189. English content.
  190. 第三章 End
  191. 中文内容。"""
  192. chapters = splitter.split(text)
  193. # Should detect all chapters despite mixed formats
  194. assert len(chapters) >= 3
  195. def test_chapter_with_special_characters(self, splitter):
  196. """Test chapters with special characters in title."""
  197. text = """第一章:命运的齿轮!
  198. 内容。
  199. 第二章 - 新的开始
  200. 更多内容。"""
  201. chapters = splitter.split(text)
  202. assert len(chapters) >= 2
  203. def test_large_chapter_count(self, splitter):
  204. """Test handling many chapters."""
  205. # Create text with 100 chapters
  206. lines = []
  207. for i in range(1, 101):
  208. lines.append(f"第{i}章")
  209. lines.append(f"这是第{i}章的内容。\n")
  210. text = "\n".join(lines)
  211. chapters = splitter.split(text)
  212. assert len(chapters) == 100
  213. def test_consecutive_chapter_titles(self, splitter):
  214. """Test handling consecutive chapter titles without content."""
  215. text = """第一章
  216. 第二章
  217. 这是第二章的内容。
  218. 第三章
  219. 这是第三章的内容。"""
  220. chapters = splitter.split(text)
  221. # Should handle empty chapters gracefully
  222. assert len(chapters) >= 2
  223. def test_chapter_with_leading_whitespace(self, splitter):
  224. """Test chapter titles with leading whitespace."""
  225. text = """ 第一章 开始
  226. 内容。
  227. 第二章 继续
  228. 更多内容。"""
  229. chapters = splitter.split(text)
  230. assert len(chapters) >= 2
  231. def test_detect_chapter_title_returns_priority(self, splitter):
  232. """Test that detect_chapter_title returns priority."""
  233. result = splitter.detect_chapter_title("第一章 开始")
  234. assert result is not None
  235. priority, title = result
  236. assert isinstance(priority, int)
  237. assert isinstance(title, str)
  238. def test_word_count_property(self, splitter):
  239. """Test chapter word_count property."""
  240. text = """第一章 测试
  241. 这是测试内容。"""
  242. chapters = splitter.split(text)
  243. assert chapters[0].word_count > 0
  244. def test_len_operator(self, splitter):
  245. """Test len() operator on Chapter."""
  246. text = """第一章 测试
  247. 这是测试内容。"""
  248. chapters = splitter.split(text)
  249. assert len(chapters[0]) == chapters[0].char_count