|
|
@@ -0,0 +1,129 @@
|
|
|
+"""
|
|
|
+Unit tests for GlossaryPipeline and PostProcessor.
|
|
|
+"""
|
|
|
+
|
|
|
+import pytest
|
|
|
+
|
|
|
+from src.glossary.models import Glossary, GlossaryEntry, TermCategory
|
|
|
+from src.glossary.pipeline import GlossaryPipeline
|
|
|
+from src.glossary.post_process import PostProcessor
|
|
|
+
|
|
|
+
|
|
|
class TestGlossaryPipeline:
    """Checks for GlossaryPipeline: preprocessing, batching and statistics."""

    @staticmethod
    def _pipeline_for(*entries):
        """Return a GlossaryPipeline over a new Glossary filled with ``entries``.

        Each item is a ``(source, target, category)`` triple that is passed
        positionally to ``GlossaryEntry``.
        """
        glossary = Glossary()
        for fields in entries:
            glossary.add(GlossaryEntry(*fields))
        return GlossaryPipeline(glossary)

    def test_preprocess_single_text(self):
        """A known term shows up with an ``__en__`` prefix and is reported."""
        pipeline = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        tagged_text, found_terms = pipeline.preprocess("林风来了")

        assert "__en__林风" in tagged_text
        assert "林风" in found_terms

    def test_preprocess_returns_terms_used(self):
        """Every glossary term present in the text is returned by preprocess."""
        pipeline = self._pipeline_for(
            ("林风", "Lin Feng", TermCategory.CHARACTER),
            ("青云宗", "Qingyun Sect", TermCategory.LOCATION),
        )

        # Only the reported terms matter here; the processed text is ignored.
        _, found_terms = pipeline.preprocess("林风在青云宗")

        assert len(found_terms) == 2
        for expected in ("林风", "青云宗"):
            assert expected in found_terms

    def test_batch_preprocess(self):
        """batch_preprocess yields one (text, terms) pair per input text."""
        pipeline = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        results = pipeline.batch_preprocess(["林风来了", "林风走了"])

        assert len(results) == 2
        for tagged_text, _terms in results:
            assert "__en__林风" in tagged_text

    def test_get_statistics(self):
        """get_statistics counts how often each term occurs in the text."""
        pipeline = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        counts = pipeline.get_statistics("林风来了,林风走了")

        assert counts["林风"] == 2
|
|
|
+
|
|
|
+
|
|
|
class TestPostProcessor:
    """Checks for the PostProcessor text clean-up helpers."""

    def test_remove_lang_tags(self):
        """Leading ``__xx__`` tags are dropped; untagged text is unchanged."""
        cases = [
            ("__en__Some text", "Some text"),
            ("__zh__Text here", "Text here"),
            ("Normal text", "Normal text"),
        ]
        for raw, expected in cases:
            assert PostProcessor.remove_lang_tags(raw) == expected

    def test_fix_punctuation_multiple_dots(self):
        """Runs of two or more trailing dots become a single dot."""
        for raw, expected in (
            ("Hello..", "Hello."),
            ("Text...", "Text."),
            ("End....", "End."),
        ):
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_fix_punctuation_space_before(self):
        """A space in front of a period or comma is removed."""
        for raw, expected in (
            ("Word .", "Word."),
            ("Hello , world", "Hello, world"),
        ):
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_process_with_placeholder_map(self):
        """process() accepts a placeholder map and leaves no ``__en__`` tags."""
        placeholder_map = {"__en__林风": "Lin Feng", "__en__青云宗": "Qingyun Sect"}
        # NOTE(review): none of the mapping keys occur in this input, so the
        # assertions below may be satisfied without any placeholder being
        # restored - confirm whether the input should contain the keys.
        translated = "__en__Lin Feng is __en__Qingyun Sect"

        cleaned = PostProcessor.process(translated, placeholder_map)

        for fragment in ("Lin Feng", "Qingyun Sect"):
            assert fragment in cleaned
        assert "__en__" not in cleaned

    def test_process_removes_lang_tags(self):
        """process() without a map strips the language tag."""
        assert PostProcessor.process("__en__Some text here") == "Some text here"

    def test_process_fixes_punctuation(self):
        """process() without a map collapses doubled dots."""
        assert PostProcessor.process("Hello..") == "Hello."

    def test_clean_whitespace(self):
        """clean_whitespace trims surrounding spaces."""
        # NOTE(review): the first input, as written here, contains only single
        # spaces - confirm whether repeated spaces were intended.
        for raw, expected in (
            ("Hello world", "Hello world"),
            (" Text ", "Text"),
        ):
            assert PostProcessor.clean_whitespace(raw) == expected
|
|
|
+
|
|
|
+
|
|
|
class TestEndToEnd:
    """End-to-end integration tests."""

    def test_full_pipeline(self):
        """Run preprocessing, a simulated translation, then postprocessing.

        The translation step is a fixed string rather than a real model call,
        so each stage's output is checked on its own.
        """
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        # Preprocess, and check this stage too so its results are not unused.
        pipeline = GlossaryPipeline(glossary)
        processed, terms = pipeline.preprocess("林风来了")
        assert "__en__林风" in processed
        assert "林风" in terms

        # Simulate translation (fixed output; not derived from ``processed``).
        mock_translated = "__en__Lin Feng came"

        # Postprocess
        final = PostProcessor.process(mock_translated)

        assert final == "Lin Feng came"
        assert "__en__" not in final
|