test_glossary.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. """
  2. Unit tests for the glossary module.
  3. Tests cover terminology matching, preprocessing, postprocessing,
  4. and integration scenarios.
  5. """
  6. import pytest
  7. from pathlib import Path
  8. import tempfile
  9. import json
  10. import os
  11. from src.glossary.models import Glossary, GlossaryEntry, TermCategory
  12. from src.glossary.matcher import GlossaryMatcher, TermMatch
  13. from src.glossary.preprocessor import GlossaryPreprocessor
  14. from src.glossary.postprocessor import GlossaryPostprocessor
  15. class TestGlossary:
  16. """Test cases for Glossary class."""
  17. def test_add_and_retrieve_term(self):
  18. """Test adding and retrieving a term."""
  19. glossary = Glossary()
  20. entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
  21. glossary.add(entry)
  22. retrieved = glossary.get("林风")
  23. assert retrieved is not None
  24. assert retrieved.source == "林风"
  25. assert retrieved.target == "Lin Feng"
  26. assert retrieved.category == TermCategory.CHARACTER
  27. def test_remove_term(self):
  28. """Test removing a term."""
  29. glossary = Glossary()
  30. entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
  31. glossary.add(entry)
  32. assert glossary.remove("林风") is True
  33. assert glossary.get("林风") is None
  34. assert glossary.remove("林风") is False
  35. def test_sort_by_length_desc(self):
  36. """Test sorting terms by length (longest first)."""
  37. glossary = Glossary()
  38. glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  39. glossary.add(GlossaryEntry("三阶魔法师", "Tier 3 Mage", TermCategory.CHARACTER))
  40. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  41. sorted_terms = glossary.sort_by_length_desc()
  42. assert sorted_terms[0] == "三阶魔法师" # 5 chars
  43. assert sorted_terms[1] == "火球术" # 3 chars
  44. assert sorted_terms[2] == "林风" # 2 chars
  45. def test_get_all(self):
  46. """Test getting all terms."""
  47. glossary = Glossary()
  48. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  49. glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  50. all_terms = glossary.get_all()
  51. assert len(all_terms) == 2
  52. def test_contains_operator(self):
  53. """Test the 'in' operator."""
  54. glossary = Glossary()
  55. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  56. assert "林风" in glossary
  57. assert "火球术" not in glossary
  58. def test_save_to_file(self):
  59. """Test saving glossary to a JSON file."""
  60. glossary = Glossary()
  61. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  62. glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  63. with tempfile.TemporaryDirectory() as tmpdir:
  64. filepath = Path(tmpdir) / "glossary.json"
  65. glossary.save_to_file(filepath)
  66. # Verify file exists and contains correct data
  67. assert filepath.exists()
  68. with open(filepath, "r", encoding="utf-8") as f:
  69. data = json.load(f)
  70. assert len(data) == 2
  71. assert data[0]["source"] == "林风"
  72. assert data[0]["target"] == "Lin Feng"
  73. assert data[0]["category"] == "character"
  74. def test_load_from_file(self):
  75. """Test loading glossary from a JSON file."""
  76. with tempfile.TemporaryDirectory() as tmpdir:
  77. filepath = Path(tmpdir) / "glossary.json"
  78. # Create test JSON file
  79. test_data = [
  80. {
  81. "source": "林风",
  82. "target": "Lin Feng",
  83. "category": "character",
  84. "context": "Main protagonist"
  85. },
  86. {
  87. "source": "火球术",
  88. "target": "Fireball",
  89. "category": "skill",
  90. "context": ""
  91. }
  92. ]
  93. with open(filepath, "w", encoding="utf-8") as f:
  94. json.dump(test_data, f, ensure_ascii=False)
  95. # Load and verify
  96. glossary = Glossary()
  97. glossary.load_from_file(filepath)
  98. assert len(glossary) == 2
  99. assert "林风" in glossary
  100. assert glossary.get("林风").target == "Lin Feng"
  101. assert glossary.get("林风").context == "Main protagonist"
  102. assert glossary.get("火球术").category == TermCategory.SKILL
  103. def test_load_from_file_clears_existing_entries(self):
  104. """Test that loading from file clears existing entries."""
  105. glossary = Glossary()
  106. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  107. with tempfile.TemporaryDirectory() as tmpdir:
  108. filepath = Path(tmpdir) / "glossary.json"
  109. test_data = [
  110. {
  111. "source": "火球术",
  112. "target": "Fireball",
  113. "category": "skill",
  114. "context": ""
  115. }
  116. ]
  117. with open(filepath, "w", encoding="utf-8") as f:
  118. json.dump(test_data, f)
  119. glossary.load_from_file(filepath)
  120. # Old entry should be gone
  121. assert "林风" not in glossary
  122. # New entry should be present
  123. assert "火球术" in glossary
  124. def test_save_and_load_roundtrip(self):
  125. """Test that save and load preserves all data."""
  126. original = Glossary()
  127. original.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER, "Protagonist"))
  128. original.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  129. original.add(GlossaryEntry("东方大陆", "Eastern Continent", TermCategory.LOCATION))
  130. with tempfile.TemporaryDirectory() as tmpdir:
  131. filepath = Path(tmpdir) / "glossary.json"
  132. original.save_to_file(filepath)
  133. loaded = Glossary()
  134. loaded.load_from_file(filepath)
  135. # Verify all entries preserved
  136. assert len(loaded) == len(original)
  137. assert loaded.get("林风").target == "Lin Feng"
  138. assert loaded.get("林风").context == "Protagonist"
  139. assert loaded.get("火球术").category == TermCategory.SKILL
  140. assert loaded.get("东方大陆").target == "Eastern Continent"
  141. def test_load_from_file_creates_parent_directories(self):
  142. """Test that save_to_file creates parent directories."""
  143. glossary = Glossary()
  144. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  145. with tempfile.TemporaryDirectory() as tmpdir:
  146. # Create a nested path that doesn't exist
  147. filepath = Path(tmpdir) / "nested" / "dir" / "glossary.json"
  148. glossary.save_to_file(filepath)
  149. assert filepath.exists()
  150. def test_load_from_file_not_found(self):
  151. """Test loading from non-existent file raises error."""
  152. glossary = Glossary()
  153. with pytest.raises(FileNotFoundError):
  154. glossary.load_from_file(Path("/nonexistent/path/glossary.json"))
  155. def test_load_from_file_invalid_json(self):
  156. """Test loading from file with invalid JSON raises error."""
  157. with tempfile.TemporaryDirectory() as tmpdir:
  158. filepath = Path(tmpdir) / "invalid.json"
  159. with open(filepath, "w") as f:
  160. f.write("not valid json {]")
  161. glossary = Glossary()
  162. with pytest.raises(json.JSONDecodeError):
  163. glossary.load_from_file(filepath)
  164. def test_load_from_file_invalid_category(self):
  165. """Test that invalid category defaults to OTHER."""
  166. with tempfile.TemporaryDirectory() as tmpdir:
  167. filepath = Path(tmpdir) / "glossary.json"
  168. test_data = [
  169. {
  170. "source": "林风",
  171. "target": "Lin Feng",
  172. "category": "invalid_category",
  173. "context": ""
  174. }
  175. ]
  176. with open(filepath, "w", encoding="utf-8") as f:
  177. json.dump(test_data, f)
  178. glossary = Glossary()
  179. glossary.load_from_file(filepath)
  180. # Should default to OTHER
  181. assert glossary.get("林风").category == TermCategory.OTHER
  182. def test_load_from_file_missing_optional_fields(self):
  183. """Test loading entries with missing optional fields."""
  184. with tempfile.TemporaryDirectory() as tmpdir:
  185. filepath = Path(tmpdir) / "glossary.json"
  186. test_data = [
  187. {
  188. "source": "林风",
  189. "target": "Lin Feng"
  190. # Missing category and context
  191. }
  192. ]
  193. with open(filepath, "w", encoding="utf-8") as f:
  194. json.dump(test_data, f)
  195. glossary = Glossary()
  196. glossary.load_from_file(filepath)
  197. # Should use defaults
  198. assert glossary.get("林风").category == TermCategory.OTHER
  199. assert glossary.get("林风").context == ""
  200. def test_save_to_file_empty_glossary(self):
  201. """Test saving an empty glossary."""
  202. glossary = Glossary()
  203. with tempfile.TemporaryDirectory() as tmpdir:
  204. filepath = Path(tmpdir) / "empty.json"
  205. glossary.save_to_file(filepath)
  206. with open(filepath, "r", encoding="utf-8") as f:
  207. data = json.load(f)
  208. assert data == []
  209. class TestGlossaryMatcher:
  210. """Test cases for GlossaryMatcher."""
  211. def test_find_single_term(self):
  212. """Test finding a single term in text."""
  213. glossary = Glossary()
  214. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  215. matcher = GlossaryMatcher(glossary)
  216. matches = matcher.find_matches("林风释放了火球术")
  217. assert len(matches) == 1
  218. assert matches[0].source == "林风"
  219. assert matches[0].target == "Lin Feng"
  220. assert matches[0].start == 0
  221. assert matches[0].end == 2
  222. def test_longest_term_priority(self):
  223. """Test that longer terms are matched first."""
  224. glossary = Glossary()
  225. glossary.add(GlossaryEntry("魔法", "Magic", TermCategory.OTHER))
  226. glossary.add(GlossaryEntry("魔法师", "Mage", TermCategory.CHARACTER))
  227. matcher = GlossaryMatcher(glossary)
  228. matches = matcher.find_matches("魔法师使用了魔法")
  229. # Should match "魔法师" but not the "魔法" within it
  230. assert len(matches) == 2
  231. assert matches[0].source == "魔法师"
  232. assert matches[1].source == "魔法"
  233. def test_placeholder_generation(self):
  234. """Test placeholder generation."""
  235. glossary = Glossary()
  236. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  237. matcher = GlossaryMatcher(glossary)
  238. processed, mapping = matcher.replace_with_placeholder("林风来了")
  239. assert processed == "__en__林风来了"
  240. assert mapping == {"__en__林风": "Lin Feng"}
  241. def test_non_overlapping_matches(self):
  242. """Test that matches don't overlap."""
  243. glossary = Glossary()
  244. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  245. glossary.add(GlossaryEntry("林", "Lin", TermCategory.CHARACTER))
  246. matcher = GlossaryMatcher(glossary)
  247. matches = matcher.find_matches("林风走了")
  248. # Should only match "林风", not "林" within it
  249. assert len(matches) == 1
  250. assert matches[0].source == "林风"
  251. class TestGlossaryPreprocessor:
  252. """Test cases for GlossaryPreprocessor."""
  253. def test_process_text_with_terms(self):
  254. """Test processing text with terminology."""
  255. glossary = Glossary()
  256. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  257. glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  258. preprocessor = GlossaryPreprocessor(glossary)
  259. result = preprocessor.process("林风释放了火球术")
  260. assert result.processed_text == "__en__林风释放了__en__火球术"
  261. assert result.terms_found["林风"] == 1
  262. assert result.terms_found["火球术"] == 1
  263. def test_batch_processing(self):
  264. """Test batch processing of multiple texts."""
  265. glossary = Glossary()
  266. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  267. preprocessor = GlossaryPreprocessor(glossary)
  268. texts = ["林风来了", "林风走了"]
  269. results = preprocessor.process_batch(texts)
  270. assert len(results) == 2
  271. assert "__en__林风" in results[0].processed_text
  272. assert "__en__林风" in results[1].processed_text
  273. class TestGlossaryPostprocessor:
  274. """Test cases for GlossaryPostprocessor."""
  275. def test_restore_from_placeholder(self):
  276. """Test restoring placeholders to translations."""
  277. postprocessor = GlossaryPostprocessor()
  278. mapping = {"__en__林风": "Lin Feng", "__en__火球术": "Fireball"}
  279. result = postprocessor.restore_from_placeholder("__en__林风 released __en__火球术", mapping)
  280. assert result == "Lin Feng released Fireball"
  281. def test_fix_punctuation(self):
  282. """Test punctuation fixing."""
  283. postprocessor = GlossaryPostprocessor()
  284. # Remove space before punctuation
  285. assert postprocessor.fix_punctuation("Lin Feng .") == "Lin Feng."
  286. # Fix Chinese comma after English
  287. assert postprocessor.fix_punctuation("Lin Feng,走了") == "Lin Feng, 走了"
  288. def test_validate_translation_success(self):
  289. """Test successful validation."""
  290. postprocessor = GlossaryPostprocessor()
  291. mapping = {"__en__林风": "Lin Feng"}
  292. result = postprocessor.validate_translation("林风来了", "Lin Feng came", mapping)
  293. assert result.is_valid is True
  294. assert len(result.missing_terms) == 0
  295. def test_validate_translation_missing_terms(self):
  296. """Test validation with missing terms."""
  297. postprocessor = GlossaryPostprocessor()
  298. mapping = {"__en__林风": "Lin Feng"}
  299. result = postprocessor.validate_translation("林风来了", "Lin came", mapping)
  300. assert result.is_valid is False
  301. class TestGlossaryIntegration:
  302. """Integration tests for the glossary module."""
  303. def test_full_pipeline(self):
  304. """Test complete preprocessing and postprocessing pipeline."""
  305. # Setup glossary
  306. glossary = Glossary()
  307. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  308. glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  309. # Preprocess
  310. preprocessor = GlossaryPreprocessor(glossary)
  311. original = "林风释放了火球术"
  312. pre_result = preprocessor.process(original)
  313. assert pre_result.processed_text == "__en__林风释放了__en__火球术"
  314. # Simulate translation
  315. mock_translated = "__en__林风 released __en__火球术"
  316. # Postprocess
  317. postprocessor = GlossaryPostprocessor()
  318. final = postprocessor.process(mock_translated, pre_result.placeholder_map)
  319. assert final == "Lin Feng released Fireball"
  320. def test_phase_0_validation_scenario(self):
  321. """Test the Phase 0 validation scenario."""
  322. # Without glossary (simulated by empty glossary)
  323. empty_glossary = Glossary()
  324. preprocessor = GlossaryPreprocessor(empty_glossary)
  325. result = preprocessor.process("林风释放了火球术")
  326. # No placeholders added
  327. assert result.placeholder_map == {}
  328. assert result.terms_found == {}
  329. # With glossary
  330. full_glossary = Glossary()
  331. full_glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  332. full_glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
  333. preprocessor = GlossaryPreprocessor(full_glossary)
  334. result = preprocessor.process("林风释放了火球术")
  335. # Placeholders added
  336. assert len(result.placeholder_map) == 2
  337. assert result.terms_found["林风"] == 1
  338. assert result.terms_found["火球术"] == 1
  339. def test_retention_rate_calculation(self):
  340. """Test retention rate calculation."""
  341. glossary = Glossary()
  342. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  343. preprocessor = GlossaryPreprocessor(glossary)
  344. original = "林风释放了火球术"
  345. result = preprocessor.process(original)
  346. # Retention rate should be calculated
  347. assert 0 <= result.retention_rate <= 100
  348. def test_empty_string_retention_rate(self):
  349. """Test retention rate with empty string."""
  350. glossary = Glossary()
  351. preprocessor = GlossaryPreprocessor(glossary)
  352. # Empty string should return 100% retention
  353. rate = preprocessor.calculate_retention_rate("", "")
  354. assert rate == 100.0
  355. def test_matcher_restore_from_placeholder(self):
  356. """Test GlossaryMatcher.restore_from_placeholder method."""
  357. glossary = Glossary()
  358. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  359. matcher = GlossaryMatcher(glossary)
  360. text = "__en__林风 came here"
  361. mapping = {"__en__林风": "Lin Feng"}
  362. result = matcher.restore_from_placeholder(text, mapping)
  363. assert result == "Lin Feng came here"
  364. def test_glossary_entry_validation(self):
  365. """Test GlossaryEntry validation."""
  366. with pytest.raises(ValueError):
  367. GlossaryEntry("", "Lin Feng", TermCategory.CHARACTER)
  368. with pytest.raises(ValueError):
  369. GlossaryEntry("林风", "", TermCategory.CHARACTER)
  370. def test_multiple_occurrences_same_term(self):
  371. """Test matching the same term multiple times."""
  372. glossary = Glossary()
  373. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  374. matcher = GlossaryMatcher(glossary)
  375. matches = matcher.find_matches("林风说,林风知道")
  376. # Should find both occurrences
  377. assert len(matches) == 2
  378. assert matches[0].source == "林风"
  379. assert matches[1].source == "林风"
  380. def test_postprocessor_clean_language_tags(self):
  381. """Test clean_language_tags method."""
  382. postprocessor = GlossaryPostprocessor()
  383. # Clean orphaned __en__ prefixes
  384. result = postprocessor.clean_language_tags("__en__ some text here")
  385. assert "__en__" not in result
  386. assert "some text here" in result
  387. def test_glossary_len_and_contains(self):
  388. """Test __len__ and __contains__ methods."""
  389. glossary = Glossary()
  390. assert len(glossary) == 0
  391. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  392. assert len(glossary) == 1
  393. assert "林风" in glossary
  394. assert "不存在" not in glossary