# 223-template-236 / blank
# forked from 137-template-113/blank

"""
Unit tests for GlossaryPipeline and PostProcessor.
"""

import pytest

from src.glossary.models import Glossary, GlossaryEntry, TermCategory
from src.glossary.pipeline import GlossaryPipeline
from src.glossary.post_process import PostProcessor


class TestGlossaryPipeline:
    """Checks for GlossaryPipeline preprocessing, batching and statistics."""

    @staticmethod
    def _pipeline_for(*entry_specs):
        """Build a GlossaryPipeline over a fresh Glossary.

        Each item of ``entry_specs`` is the positional-argument tuple handed
        to ``GlossaryEntry``; entries are added in the order given.
        """
        vocab = Glossary()
        for spec in entry_specs:
            vocab.add(GlossaryEntry(*spec))
        return GlossaryPipeline(vocab)

    def test_preprocess_single_text(self):
        """A known term is tagged in the output and reported as used."""
        pipe = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        tagged_text, used_terms = pipe.preprocess("林风来了")

        assert "__en__林风" in tagged_text
        assert "林风" in used_terms

    def test_preprocess_returns_terms_used(self):
        """Every glossary term present in the text is reported exactly once."""
        pipe = self._pipeline_for(
            ("林风", "Lin Feng", TermCategory.CHARACTER),
            ("青云宗", "Qingyun Sect", TermCategory.LOCATION),
        )

        # Only the reported terms matter here; the tagged text is ignored.
        _tagged_text, used_terms = pipe.preprocess("林风在青云宗")

        assert len(used_terms) == 2
        for expected_term in ("林风", "青云宗"):
            assert expected_term in used_terms

    def test_batch_preprocess(self):
        """Batch preprocessing yields one (text, terms) pair per input."""
        pipe = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        outcomes = pipe.batch_preprocess(["林风来了", "林风走了"])

        assert len(outcomes) == 2
        assert all("__en__林风" in tagged_text for tagged_text, _terms in outcomes)

    def test_get_statistics(self):
        """Statistics count each occurrence of a term in the text."""
        pipe = self._pipeline_for(("林风", "Lin Feng", TermCategory.CHARACTER))

        counts = pipe.get_statistics("林风来了，林风走了")

        assert counts["林风"] == 2


class TestPostProcessor:
    """Checks for the PostProcessor text clean-up helpers."""

    def test_remove_lang_tags(self):
        """Language-tag prefixes are stripped; untagged text is unchanged."""
        cases = [
            ("__en__Some text", "Some text"),
            ("__zh__Text here", "Text here"),
            ("Normal text", "Normal text"),
        ]
        for raw, expected in cases:
            assert PostProcessor.remove_lang_tags(raw) == expected

    def test_fix_punctuation_multiple_dots(self):
        """Runs of two or more dots collapse to a single dot."""
        cases = [
            ("Hello..", "Hello."),
            ("Text...", "Text."),
            ("End....", "End."),
        ]
        for raw, expected in cases:
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_fix_punctuation_space_before(self):
        """A space preceding '.' or ',' is removed."""
        cases = [
            ("Word .", "Word."),
            ("Hello , world", "Hello, world"),
        ]
        for raw, expected in cases:
            assert PostProcessor.fix_punctuation(raw) == expected

    def test_process_with_placeholder_map(self):
        """process() accepts a placeholder map and leaves no language tags."""
        placeholder_map = {
            "__en__林风": "Lin Feng",
            "__en__青云宗": "Qingyun Sect",
        }
        # NOTE(review): neither map key occurs verbatim in this input, so the
        # test may pass on tag removal alone -- confirm whether it is meant to
        # exercise placeholder restoration.
        translated = "__en__Lin Feng is __en__Qingyun Sect"

        cleaned = PostProcessor.process(translated, placeholder_map)

        for fragment in ("Lin Feng", "Qingyun Sect"):
            assert fragment in cleaned
        assert "__en__" not in cleaned

    def test_process_removes_lang_tags(self):
        """process() without a map strips the language tag."""
        cleaned = PostProcessor.process("__en__Some text here")
        assert cleaned == "Some text here"

    def test_process_fixes_punctuation(self):
        """process() without a map collapses doubled dots."""
        cleaned = PostProcessor.process("Hello..")
        assert cleaned == "Hello."

    def test_clean_whitespace(self):
        """Inner whitespace runs collapse and outer whitespace is trimmed."""
        cases = [
            ("Hello    world", "Hello world"),
            ("  Text  ", "Text"),
        ]
        for raw, expected in cases:
            assert PostProcessor.clean_whitespace(raw) == expected


class TestEndToEnd:
    """End-to-end integration tests."""

    def test_full_pipeline(self):
        """Run preprocessing, a simulated translation, then postprocessing.

        Both halves are verified: the preprocess output must carry the tagged
        term and report it as used, and the postprocessed translation must be
        free of language tags.
        """
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        # Preprocess
        pipeline = GlossaryPipeline(glossary)
        processed, terms = pipeline.preprocess("林风来了")

        # Check the preprocessing half instead of discarding its output.
        assert "__en__林风" in processed
        assert "林风" in terms

        # Simulate translation: a fixed string stands in for the translator's
        # output; it is not derived from `processed`.
        mock_translated = "__en__Lin Feng came"

        # Postprocess
        final = PostProcessor.process(mock_translated)

        assert final == "Lin Feng came"
        assert "__en__" not in final