1 ماه پیش · 0cd97d219e
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,75 @@
 
				+# Byte-compiled / optimized / DLL files
			
 
				+__pycache__/
			
 
				+*.py[cod]
			
 
				+*$py.class
			
 
				+
			
 
				+# C extensions
			
 
				+*.so
			
 
				+
			
 
				+# Distribution / packaging
			
 
				+.Python
			
 
				+build/
			
 
				+develop-eggs/
			
 
				+dist/
			
 
				+downloads/
			
 
				+eggs/
			
 
				+.eggs/
			
 
				+lib/
			
 
				+lib64/
			
 
				+parts/
			
 
				+sdist/
			
 
				+var/
			
 
				+wheels/
			
 
				+*.egg-info/
			
 
				+.installed.cfg
			
 
				+*.egg
			
 
				+
			
 
				+# PyInstaller
			
 
				+*.manifest
			
 
				+*.spec
			
 
				+
			
 
				+# Unit test / coverage reports
			
 
				+htmlcov/
			
 
				+.tox/
			
 
				+.coverage
			
 
				+.coverage.*
			
 
				+.cache
			
 
				+nosetests.xml
			
 
				+coverage.xml
			
 
				+*.cover
			
 
				+.hypothesis/
			
 
				+.pytest_cache/
			
 
				+
			
 
				+# Virtual environments
			
 
				+venv/
			
 
				+ENV/
			
 
				+env/
			
 
				+.venv
			
 
				+test_env/
			
 
				+m2m_translator_env/
			
 
				+
			
 
				+# IDEs
			
 
				+.vscode/
			
 
				+.idea/
			
 
				+*.swp
			
 
				+*.swo
			
 
				+*~
			
 
				+
			
 
				+# OS
			
 
				+.DS_Store
			
 
				+Thumbs.db
			
 
				+
			
 
				+# Project specific
			
 
				+models/
			
 
				+output_folder/
			
 
				+phase0-test/
			
 
				+phase0_validation/
			
 
				+*.log
			
 
				+*.tmp
			
 
				+_bmad/
			
 
				+
			
 
				+# Claude wrappers
			
 
				+.claude-wrapper-*.sh
			
 
				+.claude/
			
 
				+.tmp-input-*.txt
			
 
				+.dev-container-config.json
			
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -0,0 +1,129 @@
 
				+"""
			
 
				+Unit tests for GlossaryPipeline and PostProcessor.
			
 
				+"""
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+from src.glossary.models import Glossary, GlossaryEntry, TermCategory
			
 
				+from src.glossary.pipeline import GlossaryPipeline
			
 
				+from src.glossary.post_process import PostProcessor
			
 
				+
			
 
				+
			
 
				class TestGlossaryPipeline:
				    """Tests for GlossaryPipeline preprocessing and term statistics."""

				    @staticmethod
				    def _pipeline_for(*entry_fields):
				        """Return a GlossaryPipeline over a new Glossary.

				        Each positional argument is a tuple of GlossaryEntry constructor
				        arguments; entries are created and added in the order given.
				        """
				        glossary = Glossary()
				        for fields in entry_fields:
				            glossary.add(GlossaryEntry(*fields))
				        return GlossaryPipeline(glossary)

				    def test_preprocess_single_text(self):
				        """Preprocessing one text yields the prefixed term and reports it."""
				        pipeline = self._pipeline_for(
				            ("林风", "Lin Feng", TermCategory.CHARACTER),
				        )

				        tagged_text, used_terms = pipeline.preprocess("林风来了")

				        assert "__en__林风" in tagged_text
				        assert "林风" in used_terms

				    def test_preprocess_returns_terms_used(self):
				        """Both glossary terms present in the text are reported as used."""
				        pipeline = self._pipeline_for(
				            ("林风", "Lin Feng", TermCategory.CHARACTER),
				            ("青云宗", "Qingyun Sect", TermCategory.LOCATION),
				        )

				        # Only the reported terms matter here; the processed text is ignored.
				        _tagged_text, used_terms = pipeline.preprocess("林风在青云宗")

				        assert len(used_terms) == 2
				        assert "林风" in used_terms
				        assert "青云宗" in used_terms

				    def test_batch_preprocess(self):
				        """Batch preprocessing returns one (text, terms) pair per input."""
				        pipeline = self._pipeline_for(
				            ("林风", "Lin Feng", TermCategory.CHARACTER),
				        )
				        inputs = ["林风来了", "林风走了"]

				        outcomes = pipeline.batch_preprocess(inputs)

				        assert len(outcomes) == 2
				        for tagged_text, _used_terms in outcomes:
				            assert "__en__林风" in tagged_text

				    def test_get_statistics(self):
				        """A term occurring twice in the text is counted twice."""
				        pipeline = self._pipeline_for(
				            ("林风", "Lin Feng", TermCategory.CHARACTER),
				        )

				        counts = pipeline.get_statistics("林风来了，林风走了")

				        assert counts["林风"] == 2
			
 
				+
			
 
				+
			
 
				class TestPostProcessor:
				    """Tests for the PostProcessor text clean-up helpers."""

				    def test_remove_lang_tags(self):
				        """A leading language tag is removed; untagged text is unchanged."""
				        expectations = (
				            ("__en__Some text", "Some text"),
				            ("__zh__Text here", "Text here"),
				            ("Normal text", "Normal text"),
				        )
				        for raw, cleaned in expectations:
				            assert PostProcessor.remove_lang_tags(raw) == cleaned

				    def test_fix_punctuation_multiple_dots(self):
				        """Runs of two, three or four trailing dots become a single dot."""
				        expectations = (
				            ("Hello..", "Hello."),
				            ("Text...", "Text."),
				            ("End....", "End."),
				        )
				        for raw, fixed in expectations:
				            assert PostProcessor.fix_punctuation(raw) == fixed

				    def test_fix_punctuation_space_before(self):
				        """A space directly before a period or comma is removed."""
				        expectations = (
				            ("Word .", "Word."),
				            ("Hello , world", "Hello, world"),
				        )
				        for raw, fixed in expectations:
				            assert PostProcessor.fix_punctuation(raw) == fixed

				    def test_process_with_placeholder_map(self):
				        """process() with a placeholder map leaves names and no tags."""
				        # NOTE(review): neither map key ("__en__林风", "__en__青云宗") occurs
				        # verbatim in the input text below, so this test may pass without
				        # the map being used at all -- confirm against PostProcessor.process.
				        placeholder_map = {"__en__林风": "Lin Feng", "__en__青云宗": "Qingyun Sect"}
				        translated = "__en__Lin Feng is __en__Qingyun Sect"

				        restored = PostProcessor.process(translated, placeholder_map)

				        for expected_fragment in ("Lin Feng", "Qingyun Sect"):
				            assert expected_fragment in restored
				        assert "__en__" not in restored

				    def test_process_removes_lang_tags(self):
				        """process() without a map strips the leading language tag."""
				        cleaned = PostProcessor.process("__en__Some text here")
				        assert cleaned == "Some text here"

				    def test_process_fixes_punctuation(self):
				        """process() collapses a doubled trailing dot."""
				        cleaned = PostProcessor.process("Hello..")
				        assert cleaned == "Hello."

				    def test_clean_whitespace(self):
				        """Inner space runs collapse to one space; outer spaces are trimmed."""
				        expectations = (
				            ("Hello    world", "Hello world"),
				            ("  Text  ", "Text"),
				        )
				        for raw, cleaned in expectations:
				            assert PostProcessor.clean_whitespace(raw) == cleaned
			
 
				+
			
 
				+
			
 
				class TestEndToEnd:
				    """End-to-end integration tests."""

				    def test_full_pipeline(self):
				        """Run preprocessing, a stand-in translation, then postprocessing.

				        The translation step is simulated with a fixed string; no
				        translation model is called.
				        """
				        glossary = Glossary()
				        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

				        # Preprocess. The result is asserted on rather than discarded, so a
				        # regression in the preprocessing half also fails this test.
				        pipeline = GlossaryPipeline(glossary)
				        processed, terms = pipeline.preprocess("林风来了")
				        assert "__en__林风" in processed
				        assert "林风" in terms

				        # Simulate translation: hard-coded stand-in for translated output.
				        mock_translated = "__en__Lin Feng came"

				        # Postprocess (no placeholder map is passed here).
				        final = PostProcessor.process(mock_translated)

				        assert final == "Lin Feng came"
				        assert "__en__" not in final