| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- """
- Unit tests for GlossaryPipeline and PostProcessor.
- """
- import pytest
- from src.glossary.models import Glossary, GlossaryEntry, TermCategory
- from src.glossary.pipeline import GlossaryPipeline
- from src.glossary.post_process import PostProcessor
class TestGlossaryPipeline:
    """Exercise GlossaryPipeline: single/batch preprocessing and statistics."""

    # (source term, translation, category) used by most tests below.
    _LIN_FENG = ("林风", "Lin Feng", TermCategory.CHARACTER)

    @staticmethod
    def _pipeline_for(*term_specs):
        """Return a GlossaryPipeline over a fresh Glossary.

        Each item of ``term_specs`` is the positional-argument tuple passed
        to ``GlossaryEntry``; entries are added in the order given.
        """
        glossary = Glossary()
        for spec in term_specs:
            glossary.add(GlossaryEntry(*spec))
        return GlossaryPipeline(glossary)

    def test_preprocess_single_text(self):
        """preprocess() tags the known term and reports it as used."""
        pipeline = self._pipeline_for(self._LIN_FENG)

        tagged_text, used_terms = pipeline.preprocess("林风来了")

        assert "__en__林风" in tagged_text
        assert "林风" in used_terms

    def test_preprocess_returns_terms_used(self):
        """preprocess() reports every glossary term found in the text."""
        pipeline = self._pipeline_for(
            self._LIN_FENG,
            ("青云宗", "Qingyun Sect", TermCategory.LOCATION),
        )

        _tagged_text, used_terms = pipeline.preprocess("林风在青云宗")

        assert len(used_terms) == 2
        for expected_term in ("林风", "青云宗"):
            assert expected_term in used_terms

    def test_batch_preprocess(self):
        """batch_preprocess() yields one (text, terms) pair per input text."""
        pipeline = self._pipeline_for(self._LIN_FENG)

        outcomes = pipeline.batch_preprocess(["林风来了", "林风走了"])

        assert len(outcomes) == 2
        for tagged_text, _used_terms in outcomes:
            assert "__en__林风" in tagged_text

    def test_get_statistics(self):
        """get_statistics() counts each occurrence of a glossary term."""
        pipeline = self._pipeline_for(self._LIN_FENG)

        counts = pipeline.get_statistics("林风来了,林风走了")

        assert counts["林风"] == 2
class TestPostProcessor:
    """Check PostProcessor's individual helpers and the combined process()."""

    def test_remove_lang_tags(self):
        """``__en__``/``__zh__`` prefixes are stripped; untagged text is kept."""
        cases = (
            ("__en__Some text", "Some text"),
            ("__zh__Text here", "Text here"),
            ("Normal text", "Normal text"),
        )
        for raw, expected in cases:
            assert PostProcessor.remove_lang_tags(raw) == expected

    def test_fix_punctuation_multiple_dots(self):
        """Runs of two or more trailing dots collapse to a single dot."""
        cases = (
            ("Hello..", "Hello."),
            ("Text...", "Text."),
            ("End....", "End."),
        )
        for raw, expected in cases:
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_fix_punctuation_space_before(self):
        """A space directly before ``.`` or ``,`` is removed."""
        cases = (
            ("Word .", "Word."),
            ("Hello , world", "Hello, world"),
        )
        for raw, expected in cases:
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_process_with_placeholder_map(self):
        """process() accepts a placeholder mapping and leaves no tag behind."""
        # NOTE(review): none of the mapping keys occur in the input text, so
        # this may pass on tag stripping alone - confirm that is the intent.
        placeholder_map = {
            "__en__林风": "Lin Feng",
            "__en__青云宗": "Qingyun Sect",
        }

        cleaned = PostProcessor.process(
            "__en__Lin Feng is __en__Qingyun Sect", placeholder_map
        )

        for expected_name in ("Lin Feng", "Qingyun Sect"):
            assert expected_name in cleaned
        assert "__en__" not in cleaned

    def test_process_removes_lang_tags(self):
        """process() without a mapping still strips the language tag."""
        cleaned = PostProcessor.process("__en__Some text here")
        assert cleaned == "Some text here"

    def test_process_fixes_punctuation(self):
        """process() also applies the punctuation fix."""
        cleaned = PostProcessor.process("Hello..")
        assert cleaned == "Hello."

    def test_clean_whitespace(self):
        """clean_whitespace() trims surrounding whitespace."""
        cases = (
            ("Hello world", "Hello world"),
            (" Text ", "Text"),
        )
        for raw, expected in cases:
            assert PostProcessor.clean_whitespace(raw) == expected
class TestEndToEnd:
    """End-to-end integration tests chaining pre- and post-processing."""

    def test_full_pipeline(self):
        """Run preprocess, a simulated translation, then postprocess.

        The translation step is mocked with a fixed string; only the
        glossary pipeline and the post-processor are exercised for real.
        """
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        # Preprocess: verify the output instead of discarding it, so that a
        # regression in the first stage also fails this end-to-end test.
        pipeline = GlossaryPipeline(glossary)
        processed, terms = pipeline.preprocess("林风来了")
        assert "__en__林风" in processed
        assert "林风" in terms

        # Simulate translation (no real translator is invoked here).
        mock_translated = "__en__Lin Feng came"

        # Postprocess: the language tag must be gone from the final text.
        final = PostProcessor.process(mock_translated)
        assert final == "Lin Feng came"
        assert "__en__" not in final
|