# test_pipeline.py
"""
Unit tests for GlossaryPipeline and PostProcessor.
"""
  4. import pytest
  5. from src.glossary.models import Glossary, GlossaryEntry, TermCategory
  6. from src.glossary.pipeline import GlossaryPipeline
  7. from src.glossary.post_process import PostProcessor
  8. class TestGlossaryPipeline:
  9. """Test cases for GlossaryPipeline."""
  10. def test_preprocess_single_text(self):
  11. """Test preprocessing a single text."""
  12. glossary = Glossary()
  13. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  14. pipeline = GlossaryPipeline(glossary)
  15. processed, terms = pipeline.preprocess("林风来了")
  16. assert "__en__林风" in processed
  17. assert "林风" in terms
  18. def test_preprocess_returns_terms_used(self):
  19. """Test that preprocess returns list of terms used."""
  20. glossary = Glossary()
  21. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  22. glossary.add(GlossaryEntry("青云宗", "Qingyun Sect", TermCategory.LOCATION))
  23. pipeline = GlossaryPipeline(glossary)
  24. processed, terms = pipeline.preprocess("林风在青云宗")
  25. assert len(terms) == 2
  26. assert "林风" in terms
  27. assert "青云宗" in terms
  28. def test_batch_preprocess(self):
  29. """Test batch preprocessing multiple texts."""
  30. glossary = Glossary()
  31. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  32. pipeline = GlossaryPipeline(glossary)
  33. texts = ["林风来了", "林风走了"]
  34. results = pipeline.batch_preprocess(texts)
  35. assert len(results) == 2
  36. for processed, terms in results:
  37. assert "__en__林风" in processed
  38. def test_get_statistics(self):
  39. """Test getting terminology statistics."""
  40. glossary = Glossary()
  41. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  42. pipeline = GlossaryPipeline(glossary)
  43. stats = pipeline.get_statistics("林风来了,林风走了")
  44. assert stats["林风"] == 2
  45. class TestPostProcessor:
  46. """Test cases for PostProcessor."""
  47. def test_remove_lang_tags(self):
  48. """Test removing language tag prefixes."""
  49. assert PostProcessor.remove_lang_tags("__en__Some text") == "Some text"
  50. assert PostProcessor.remove_lang_tags("__zh__Text here") == "Text here"
  51. assert PostProcessor.remove_lang_tags("Normal text") == "Normal text"
  52. def test_fix_punctuation_multiple_dots(self):
  53. """Test fixing multiple consecutive dots."""
  54. assert PostProcessor.fix_punctuation("Hello..") == "Hello."
  55. assert PostProcessor.fix_punctuation("Text...") == "Text."
  56. assert PostProcessor.fix_punctuation("End....") == "End."
  57. def test_fix_punctuation_space_before(self):
  58. """Test fixing space before punctuation."""
  59. assert PostProcessor.fix_punctuation("Word .") == "Word."
  60. assert PostProcessor.fix_punctuation("Hello , world") == "Hello, world"
  61. def test_process_with_placeholder_map(self):
  62. """Test processing with placeholder restoration."""
  63. mapping = {"__en__林风": "Lin Feng", "__en__青云宗": "Qingyun Sect"}
  64. text = "__en__Lin Feng is __en__Qingyun Sect"
  65. result = PostProcessor.process(text, mapping)
  66. assert "Lin Feng" in result
  67. assert "Qingyun Sect" in result
  68. assert "__en__" not in result
  69. def test_process_removes_lang_tags(self):
  70. """Test that process removes language tags."""
  71. result = PostProcessor.process("__en__Some text here")
  72. assert result == "Some text here"
  73. def test_process_fixes_punctuation(self):
  74. """Test that process fixes punctuation issues."""
  75. result = PostProcessor.process("Hello..")
  76. assert result == "Hello."
  77. def test_clean_whitespace(self):
  78. """Test cleaning whitespace."""
  79. assert PostProcessor.clean_whitespace("Hello world") == "Hello world"
  80. assert PostProcessor.clean_whitespace(" Text ") == "Text"
  81. class TestEndToEnd:
  82. """End-to-end integration tests."""
  83. def test_full_pipeline(self):
  84. """Test complete preprocessing and postprocessing."""
  85. glossary = Glossary()
  86. glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
  87. # Preprocess
  88. pipeline = GlossaryPipeline(glossary)
  89. processed, terms = pipeline.preprocess("林风来了")
  90. # Simulate translation
  91. mock_translated = "__en__Lin Feng came"
  92. # Postprocess
  93. final = PostProcessor.process(mock_translated)
  94. assert "Lin Feng came" == final
  95. assert "__en__" not in final