2
0
فهرست منبع

test(glossary): Add .gitignore and pipeline tests

- Add .gitignore for Python/__pycache__/venv files
- Add tests/test_pipeline.py for GlossaryPipeline and PostProcessor
- Test coverage: preprocess, batch_preprocess, get_statistics, PostProcessor

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
d8dfun 3 روز پیش
والد
کامیت
0cd97d219e
2فایلهای تغییر یافته به همراه204 افزوده شده و 0 حذف شده
  1. 75 0
      .gitignore
  2. 129 0
      tests/test_pipeline.py

+ 75 - 0
.gitignore

@@ -0,0 +1,75 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv
+test_env/
+m2m_translator_env/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Project specific
+models/
+output_folder/
+phase0-test/
+phase0_validation/
+*.log
+*.tmp
+_bmad/
+
+# Claude wrappers
+.claude-wrapper-*.sh
+.claude/
+.tmp-input-*.txt
+.dev-container-config.json

+ 129 - 0
tests/test_pipeline.py

@@ -0,0 +1,129 @@
+"""
+Unit tests for GlossaryPipeline and PostProcessor.
+"""
+
+import pytest
+
+from src.glossary.models import Glossary, GlossaryEntry, TermCategory
+from src.glossary.pipeline import GlossaryPipeline
+from src.glossary.post_process import PostProcessor
+
+
+class TestGlossaryPipeline:
+    """Test cases for GlossaryPipeline."""
+
+    def test_preprocess_single_text(self):
+        """Test preprocessing a single text."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        pipeline = GlossaryPipeline(glossary)
+        processed, terms = pipeline.preprocess("林风来了")
+
+        assert "__en__林风" in processed
+        assert "林风" in terms
+
+    def test_preprocess_returns_terms_used(self):
+        """Test that preprocess returns list of terms used."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("青云宗", "Qingyun Sect", TermCategory.LOCATION))
+
+        pipeline = GlossaryPipeline(glossary)
+        processed, terms = pipeline.preprocess("林风在青云宗")
+
+        assert len(terms) == 2
+        assert "林风" in terms
+        assert "青云宗" in terms
+
+    def test_batch_preprocess(self):
+        """Test batch preprocessing multiple texts."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        pipeline = GlossaryPipeline(glossary)
+        texts = ["林风来了", "林风走了"]
+        results = pipeline.batch_preprocess(texts)
+
+        assert len(results) == 2
+        for processed, terms in results:
+            assert "__en__林风" in processed
+
+    def test_get_statistics(self):
+        """Test getting terminology statistics."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        pipeline = GlossaryPipeline(glossary)
+        stats = pipeline.get_statistics("林风来了,林风走了")
+
+        assert stats["林风"] == 2
+
+
+class TestPostProcessor:
+    """Test cases for PostProcessor."""
+
+    def test_remove_lang_tags(self):
+        """Test removing language tag prefixes."""
+        assert PostProcessor.remove_lang_tags("__en__Some text") == "Some text"
+        assert PostProcessor.remove_lang_tags("__zh__Text here") == "Text here"
+        assert PostProcessor.remove_lang_tags("Normal text") == "Normal text"
+
+    def test_fix_punctuation_multiple_dots(self):
+        """Test fixing multiple consecutive dots."""
+        assert PostProcessor.fix_punctuation("Hello..") == "Hello."
+        assert PostProcessor.fix_punctuation("Text...") == "Text."
+        assert PostProcessor.fix_punctuation("End....") == "End."
+
+    def test_fix_punctuation_space_before(self):
+        """Test fixing space before punctuation."""
+        assert PostProcessor.fix_punctuation("Word .") == "Word."
+        assert PostProcessor.fix_punctuation("Hello , world") == "Hello, world"
+
+    def test_process_with_placeholder_map(self):
+        """Test processing with placeholder restoration."""
+        mapping = {"__en__林风": "Lin Feng", "__en__青云宗": "Qingyun Sect"}
+        text = "__en__Lin Feng is __en__Qingyun Sect"
+
+        result = PostProcessor.process(text, mapping)
+
+        assert "Lin Feng" in result
+        assert "Qingyun Sect" in result
+        assert "__en__" not in result
+
+    def test_process_removes_lang_tags(self):
+        """Test that process removes language tags."""
+        result = PostProcessor.process("__en__Some text here")
+        assert result == "Some text here"
+
+    def test_process_fixes_punctuation(self):
+        """Test that process fixes punctuation issues."""
+        result = PostProcessor.process("Hello..")
+        assert result == "Hello."
+
+    def test_clean_whitespace(self):
+        """Test cleaning whitespace."""
+        assert PostProcessor.clean_whitespace("Hello    world") == "Hello world"
+        assert PostProcessor.clean_whitespace("  Text  ") == "Text"
+
+
+class TestEndToEnd:
+    """End-to-end integration tests."""
+
+    def test_full_pipeline(self):
+        """Test complete preprocessing and postprocessing."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        # Preprocess
+        pipeline = GlossaryPipeline(glossary)
+        processed, terms = pipeline.preprocess("林风来了")
+
+        # Simulate translation
+        mock_translated = "__en__Lin Feng came"
+
+        # Postprocess
+        final = PostProcessor.process(mock_translated)
+
+        assert "Lin Feng came" == final
+        assert "__en__" not in final