# Repository: 223-template-236/blank (derived from 137-template-113/blank)
"""
Unit tests for the glossary module.

Tests cover terminology matching, preprocessing, postprocessing,
and integration scenarios.
"""

import pytest
from pathlib import Path
import tempfile
import json
import os

from src.glossary.models import Glossary, GlossaryEntry, TermCategory
from src.glossary.matcher import GlossaryMatcher, TermMatch
from src.glossary.preprocessor import GlossaryPreprocessor
from src.glossary.postprocessor import GlossaryPostprocessor


class TestGlossary:
    """Tests for the ``Glossary`` container and its JSON persistence.

    Exercises add / get / remove, length-descending ordering, the ``len``
    and ``in`` protocols, and ``save_to_file`` / ``load_from_file`` including
    their error paths and default handling. Every file-based test works
    inside a throwaway temporary directory and reads/writes UTF-8 explicitly
    so the results do not depend on the platform's default encoding.
    """

    def test_add_and_retrieve_term(self):
        """An entry added to the glossary is returned by ``get``."""
        glossary = Glossary()
        entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)

        glossary.add(entry)
        retrieved = glossary.get("林风")

        assert retrieved is not None
        assert retrieved.source == "林风"
        assert retrieved.target == "Lin Feng"
        assert retrieved.category == TermCategory.CHARACTER

    def test_remove_term(self):
        """``remove`` reports True once, then False for an absent term."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        assert glossary.remove("林风") is True
        assert glossary.get("林风") is None
        # A second removal has nothing left to delete.
        assert glossary.remove("林风") is False

    def test_sort_by_length_desc(self):
        """Source terms are ordered longest first."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
        glossary.add(GlossaryEntry("三阶魔法师", "Tier 3 Mage", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        sorted_terms = glossary.sort_by_length_desc()

        assert sorted_terms[0] == "三阶魔法师"  # 5 chars
        assert sorted_terms[1] == "火球术"  # 3 chars
        assert sorted_terms[2] == "林风"  # 2 chars

    def test_get_all(self):
        """``get_all`` returns one item per stored entry."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        all_terms = glossary.get_all()

        assert len(all_terms) == 2

    def test_contains_operator(self):
        """The ``in`` operator reflects whether a source term is stored."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        assert "林风" in glossary
        assert "火球术" not in glossary

    def test_save_to_file(self):
        """``save_to_file`` writes every entry to a JSON list."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            glossary.save_to_file(filepath)

            # Verify the file exists and holds the expected records.
            assert filepath.exists()
            data = json.loads(filepath.read_text(encoding="utf-8"))

            assert len(data) == 2
            assert data[0]["source"] == "林风"
            assert data[0]["target"] == "Lin Feng"
            assert data[0]["category"] == "character"
            # The second record must be serialised too, not just counted.
            assert data[1]["source"] == "火球术"
            assert data[1]["target"] == "Fireball"

    def test_load_from_file(self):
        """``load_from_file`` rebuilds entries from a JSON list."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"

            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    "category": "character",
                    "context": "Main protagonist",
                },
                {
                    "source": "火球术",
                    "target": "Fireball",
                    "category": "skill",
                    "context": "",
                },
            ]
            # ensure_ascii=False keeps the raw CJK characters in the file, so
            # the loader is exercised on real UTF-8 rather than \u escapes.
            filepath.write_text(
                json.dumps(test_data, ensure_ascii=False), encoding="utf-8"
            )

            glossary = Glossary()
            glossary.load_from_file(filepath)

            assert len(glossary) == 2
            assert "林风" in glossary
            assert glossary.get("林风").target == "Lin Feng"
            assert glossary.get("林风").context == "Main protagonist"
            assert glossary.get("火球术").category == TermCategory.SKILL

    def test_load_from_file_clears_existing_entries(self):
        """Loading replaces, rather than merges with, existing entries."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "火球术",
                    "target": "Fireball",
                    "category": "skill",
                    "context": "",
                }
            ]
            filepath.write_text(json.dumps(test_data), encoding="utf-8")

            glossary.load_from_file(filepath)

            # Old entry should be gone
            assert "林风" not in glossary
            # New entry should be present
            assert "火球术" in glossary

    def test_save_and_load_roundtrip(self):
        """Saving then loading preserves every field of every entry."""
        original = Glossary()
        original.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER, "Protagonist"))
        original.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
        original.add(GlossaryEntry("东方大陆", "Eastern Continent", TermCategory.LOCATION))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            original.save_to_file(filepath)

            loaded = Glossary()
            loaded.load_from_file(filepath)

            assert len(loaded) == len(original)
            assert loaded.get("林风").target == "Lin Feng"
            assert loaded.get("林风").context == "Protagonist"
            assert loaded.get("火球术").category == TermCategory.SKILL
            assert loaded.get("东方大陆").target == "Eastern Continent"

    def test_load_from_file_creates_parent_directories(self):
        """``save_to_file`` creates missing parent directories.

        NOTE: the method name says ``load_from_file`` but the behaviour under
        test is ``save_to_file``. The name is left unchanged so existing test
        selections (node ids, ``-k`` filters) keep working.
        """
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        with tempfile.TemporaryDirectory() as tmpdir:
            # Neither "nested" nor "dir" exists before the call.
            filepath = Path(tmpdir) / "nested" / "dir" / "glossary.json"

            glossary.save_to_file(filepath)

            assert filepath.exists()

    def test_load_from_file_not_found(self):
        """Loading a non-existent file raises ``FileNotFoundError``."""
        glossary = Glossary()

        # A path inside a fresh temporary directory is guaranteed to be absent
        # on every platform, unlike a hard-coded absolute path.
        with tempfile.TemporaryDirectory() as tmpdir:
            missing = Path(tmpdir) / "missing" / "glossary.json"

            with pytest.raises(FileNotFoundError):
                glossary.load_from_file(missing)

    def test_load_from_file_invalid_json(self):
        """Malformed JSON surfaces as ``json.JSONDecodeError``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "invalid.json"
            # Explicit encoding, consistent with the other file-based tests.
            filepath.write_text("not valid json {]", encoding="utf-8")

            glossary = Glossary()
            with pytest.raises(json.JSONDecodeError):
                glossary.load_from_file(filepath)

    def test_load_from_file_invalid_category(self):
        """An unrecognised category string falls back to ``OTHER``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    "category": "invalid_category",
                    "context": "",
                }
            ]
            filepath.write_text(json.dumps(test_data), encoding="utf-8")

            glossary = Glossary()
            glossary.load_from_file(filepath)

            # Should default to OTHER
            assert glossary.get("林风").category == TermCategory.OTHER

    def test_load_from_file_missing_optional_fields(self):
        """Entries lacking ``category`` / ``context`` load with defaults."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    # category and context deliberately omitted
                }
            ]
            filepath.write_text(json.dumps(test_data), encoding="utf-8")

            glossary = Glossary()
            glossary.load_from_file(filepath)

            # Should use defaults
            assert glossary.get("林风").category == TermCategory.OTHER
            assert glossary.get("林风").context == ""

    def test_save_to_file_empty_glossary(self):
        """An empty glossary is saved as an empty JSON list."""
        glossary = Glossary()

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "empty.json"
            glossary.save_to_file(filepath)

            data = json.loads(filepath.read_text(encoding="utf-8"))

            assert data == []


class TestGlossaryMatcher:
    """Tests for ``GlossaryMatcher`` term discovery and placeholder insertion.

    Match spans are checked as character offsets with an exclusive end, the
    convention pinned by ``test_find_single_term``.
    """

    def test_find_single_term(self):
        """A single glossary term is located with its span and translation."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        matches = matcher.find_matches("林风释放了火球术")

        assert len(matches) == 1
        assert matches[0].source == "林风"
        assert matches[0].target == "Lin Feng"
        assert matches[0].start == 0
        assert matches[0].end == 2

    def test_longest_term_priority(self):
        """A longer term wins over a shorter term embedded inside it."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("魔法", "Magic", TermCategory.OTHER))
        glossary.add(GlossaryEntry("魔法师", "Mage", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        matches = matcher.find_matches("魔法师使用了魔法")

        # Two matches are expected: "魔法师" at the start and the standalone
        # "魔法" at the end. The "魔法" inside "魔法师" must not match.
        assert len(matches) == 2
        assert matches[0].source == "魔法师"
        assert matches[1].source == "魔法"
        # Pin the spans so that a second match overlapping the first one
        # (i.e. "魔法" at offset 0) cannot satisfy the test.
        assert (matches[0].start, matches[0].end) == (0, 3)
        assert (matches[1].start, matches[1].end) == (6, 8)

    def test_placeholder_generation(self):
        """Matched terms are prefixed and recorded in the placeholder map."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        processed, mapping = matcher.replace_with_placeholder("林风来了")

        assert processed == "__en__林风来了"
        assert mapping == {"__en__林风": "Lin Feng"}

    def test_non_overlapping_matches(self):
        """A term that is a prefix of a longer matched term is suppressed."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("林", "Lin", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        matches = matcher.find_matches("林风走了")

        # Should only match "林风", not "林" within it
        assert len(matches) == 1
        assert matches[0].source == "林风"
        assert (matches[0].start, matches[0].end) == (0, 2)


class TestGlossaryPreprocessor:
    """Checks on ``GlossaryPreprocessor`` for single and batched input."""

    def test_process_text_with_terms(self):
        """Each glossary term gets a placeholder prefix and is counted once."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        terms.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        outcome = GlossaryPreprocessor(terms).process("林风释放了火球术")

        assert outcome.processed_text == "__en__林风释放了__en__火球术"
        for term in ("林风", "火球术"):
            assert outcome.terms_found[term] == 1

    def test_batch_processing(self):
        """``process_batch`` yields one processed result per input text."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        sentences = ["林风来了", "林风走了"]
        outcomes = GlossaryPreprocessor(terms).process_batch(sentences)

        assert len(outcomes) == 2
        # Both sentences contain the term, so both must carry the placeholder.
        for index in (0, 1):
            assert "__en__林风" in outcomes[index].processed_text


class TestGlossaryPostprocessor:
    """Checks on ``GlossaryPostprocessor`` restoration, cleanup and validation."""

    def test_restore_from_placeholder(self):
        """Placeholders in translated text are swapped for their targets."""
        placeholder_map = {"__en__林风": "Lin Feng", "__en__火球术": "Fireball"}
        translated = "__en__林风 released __en__火球术"

        restored = GlossaryPostprocessor().restore_from_placeholder(
            translated, placeholder_map
        )

        assert restored == "Lin Feng released Fireball"

    def test_fix_punctuation(self):
        """Punctuation spacing and full-width commas are normalised."""
        fixer = GlossaryPostprocessor()
        expectations = (
            # Space before a full stop is removed.
            ("Lin Feng .", "Lin Feng."),
            # A Chinese comma following English text becomes ", ".
            ("Lin Feng，走了", "Lin Feng, 走了"),
        )

        for raw, expected in expectations:
            assert fixer.fix_punctuation(raw) == expected

    def test_validate_translation_success(self):
        """Validation passes when the translated text contains the target."""
        placeholder_map = {"__en__林风": "Lin Feng"}
        checker = GlossaryPostprocessor()

        verdict = checker.validate_translation("林风来了", "Lin Feng came", placeholder_map)

        assert verdict.is_valid is True
        assert len(verdict.missing_terms) == 0

    def test_validate_translation_missing_terms(self):
        """Validation fails when the target term is absent from the output."""
        placeholder_map = {"__en__林风": "Lin Feng"}
        checker = GlossaryPostprocessor()

        verdict = checker.validate_translation("林风来了", "Lin came", placeholder_map)

        assert verdict.is_valid is False


class TestGlossaryIntegration:
    """End-to-end and cross-component tests for the glossary module.

    Combines the preprocessor, matcher and postprocessor, and also holds a
    few small checks (entry validation, ``len`` / ``in``) that span more than
    one class.
    """

    def test_full_pipeline(self):
        """Preprocess, simulate translation, then postprocess a sentence."""
        # Setup glossary
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        # Preprocess
        preprocessor = GlossaryPreprocessor(glossary)
        original = "林风释放了火球术"
        pre_result = preprocessor.process(original)

        assert pre_result.processed_text == "__en__林风释放了__en__火球术"

        # Stand-in for the translation engine's output: placeholders are
        # left intact and only the surrounding text is translated.
        mock_translated = "__en__林风 released __en__火球术"

        # Postprocess
        postprocessor = GlossaryPostprocessor()
        final = postprocessor.process(mock_translated, pre_result.placeholder_map)

        assert final == "Lin Feng released Fireball"

    def test_phase_0_validation_scenario(self):
        """Compare preprocessing with an empty and a populated glossary."""
        sentence = "林风释放了火球术"

        # Without glossary (simulated by empty glossary)
        empty_glossary = Glossary()
        bare_result = GlossaryPreprocessor(empty_glossary).process(sentence)

        # No placeholders added
        assert bare_result.placeholder_map == {}
        assert bare_result.terms_found == {}

        # With glossary
        full_glossary = Glossary()
        full_glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        full_glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        full_result = GlossaryPreprocessor(full_glossary).process(sentence)

        # Placeholders added
        assert len(full_result.placeholder_map) == 2
        assert full_result.terms_found["林风"] == 1
        assert full_result.terms_found["火球术"] == 1

    def test_retention_rate_calculation(self):
        """The reported retention rate lies within 0..100."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        preprocessor = GlossaryPreprocessor(glossary)
        result = preprocessor.process("林风释放了火球术")

        # Only the range is checked here, not a specific value.
        assert 0 <= result.retention_rate <= 100

    def test_empty_string_retention_rate(self):
        """Two empty strings yield a retention rate of 100.0."""
        glossary = Glossary()
        preprocessor = GlossaryPreprocessor(glossary)

        # Empty string should return 100% retention
        rate = preprocessor.calculate_retention_rate("", "")

        assert rate == 100.0

    def test_matcher_restore_from_placeholder(self):
        """``GlossaryMatcher.restore_from_placeholder`` substitutes targets."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        text = "__en__林风 came here"
        mapping = {"__en__林风": "Lin Feng"}

        result = matcher.restore_from_placeholder(text, mapping)

        assert result == "Lin Feng came here"

    def test_glossary_entry_validation(self):
        """An empty source or empty target is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            GlossaryEntry("", "Lin Feng", TermCategory.CHARACTER)

        with pytest.raises(ValueError):
            GlossaryEntry("林风", "", TermCategory.CHARACTER)

    def test_multiple_occurrences_same_term(self):
        """Every occurrence of a repeated term produces its own match."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        matcher = GlossaryMatcher(glossary)
        matches = matcher.find_matches("林风说，林风知道")

        # Should find both occurrences
        assert len(matches) == 2
        assert matches[0].source == "林风"
        assert matches[1].source == "林风"
        # The two matches must be distinct occurrences (offsets 0 and 4), not
        # the same occurrence reported twice. Sorting keeps the check
        # independent of the order in which matches are returned.
        assert sorted(match.start for match in matches) == [0, 4]

    def test_postprocessor_clean_language_tags(self):
        """Orphaned ``__en__`` prefixes are stripped; other text is kept."""
        postprocessor = GlossaryPostprocessor()

        # Clean orphaned __en__ prefixes
        result = postprocessor.clean_language_tags("__en__  some text here")

        assert "__en__" not in result
        assert "some text here" in result

    def test_glossary_len_and_contains(self):
        """``len`` and ``in`` track entries as they are added."""
        glossary = Glossary()
        assert len(glossary) == 0

        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        assert len(glossary) == 1
        assert "林风" in glossary
        assert "不存在" not in glossary