""" Unit tests for GlossaryPipeline and PostProcessor. """ import pytest from src.glossary.models import Glossary, GlossaryEntry, TermCategory from src.glossary.pipeline import GlossaryPipeline from src.glossary.post_process import PostProcessor class TestGlossaryPipeline: """Test cases for GlossaryPipeline.""" def test_preprocess_single_text(self): """Test preprocessing a single text.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) pipeline = GlossaryPipeline(glossary) processed, terms = pipeline.preprocess("林风来了") assert "__en__林风" in processed assert "林风" in terms def test_preprocess_returns_terms_used(self): """Test that preprocess returns list of terms used.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("青云宗", "Qingyun Sect", TermCategory.LOCATION)) pipeline = GlossaryPipeline(glossary) processed, terms = pipeline.preprocess("林风在青云宗") assert len(terms) == 2 assert "林风" in terms assert "青云宗" in terms def test_batch_preprocess(self): """Test batch preprocessing multiple texts.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) pipeline = GlossaryPipeline(glossary) texts = ["林风来了", "林风走了"] results = pipeline.batch_preprocess(texts) assert len(results) == 2 for processed, terms in results: assert "__en__林风" in processed def test_get_statistics(self): """Test getting terminology statistics.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) pipeline = GlossaryPipeline(glossary) stats = pipeline.get_statistics("林风来了,林风走了") assert stats["林风"] == 2 class TestPostProcessor: """Test cases for PostProcessor.""" def test_remove_lang_tags(self): """Test removing language tag prefixes.""" assert PostProcessor.remove_lang_tags("__en__Some text") == "Some text" assert PostProcessor.remove_lang_tags("__zh__Text here") == "Text here" assert PostProcessor.remove_lang_tags("Normal text") == "Normal text" def test_fix_punctuation_multiple_dots(self): """Test fixing multiple consecutive dots.""" assert PostProcessor.fix_punctuation("Hello..") == "Hello." assert PostProcessor.fix_punctuation("Text...") == "Text." assert PostProcessor.fix_punctuation("End....") == "End." def test_fix_punctuation_space_before(self): """Test fixing space before punctuation.""" assert PostProcessor.fix_punctuation("Word .") == "Word." assert PostProcessor.fix_punctuation("Hello , world") == "Hello, world" def test_process_with_placeholder_map(self): """Test processing with placeholder restoration.""" mapping = {"__en__林风": "Lin Feng", "__en__青云宗": "Qingyun Sect"} text = "__en__Lin Feng is __en__Qingyun Sect" result = PostProcessor.process(text, mapping) assert "Lin Feng" in result assert "Qingyun Sect" in result assert "__en__" not in result def test_process_removes_lang_tags(self): """Test that process removes language tags.""" result = PostProcessor.process("__en__Some text here") assert result == "Some text here" def test_process_fixes_punctuation(self): """Test that process fixes punctuation issues.""" result = PostProcessor.process("Hello..") assert result == "Hello." def test_clean_whitespace(self): """Test cleaning whitespace.""" assert PostProcessor.clean_whitespace("Hello world") == "Hello world" assert PostProcessor.clean_whitespace(" Text ") == "Text" class TestEndToEnd: """End-to-end integration tests.""" def test_full_pipeline(self): """Test complete preprocessing and postprocessing.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) # Preprocess pipeline = GlossaryPipeline(glossary) processed, terms = pipeline.preprocess("林风来了") # Simulate translation mock_translated = "__en__Lin Feng came" # Postprocess final = PostProcessor.process(mock_translated) assert "Lin Feng came" == final assert "__en__" not in final