| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515 |
- """
- Unit tests for the glossary module.
- Tests cover terminology matching, preprocessing, postprocessing,
- and integration scenarios.
- """
- import pytest
- from pathlib import Path
- import tempfile
- import json
- import os
- from src.glossary.models import Glossary, GlossaryEntry, TermCategory
- from src.glossary.matcher import GlossaryMatcher, TermMatch
- from src.glossary.preprocessor import GlossaryPreprocessor
- from src.glossary.postprocessor import GlossaryPostprocessor
class TestGlossary:
    """Test cases for the Glossary class: term management and JSON save/load."""

    def test_add_and_retrieve_term(self):
        """An added entry can be fetched back by its source term."""
        glossary = Glossary()
        entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
        glossary.add(entry)

        retrieved = glossary.get("林风")
        assert retrieved is not None
        assert retrieved.source == "林风"
        assert retrieved.target == "Lin Feng"
        assert retrieved.category == TermCategory.CHARACTER

    def test_remove_term(self):
        """remove() reports True once, then False when the term is already gone."""
        glossary = Glossary()
        entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
        glossary.add(entry)

        assert glossary.remove("林风") is True
        assert glossary.get("林风") is None
        assert glossary.remove("林风") is False

    def test_sort_by_length_desc(self):
        """Source terms come back ordered longest first."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
        glossary.add(GlossaryEntry("三阶魔法师", "Tier 3 Mage", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        sorted_terms = glossary.sort_by_length_desc()
        assert sorted_terms[0] == "三阶魔法师"  # 5 chars
        assert sorted_terms[1] == "火球术"  # 3 chars
        assert sorted_terms[2] == "林风"  # 2 chars

    def test_get_all(self):
        """get_all() returns one item per added entry."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        all_terms = glossary.get_all()
        assert len(all_terms) == 2

    def test_contains_operator(self):
        """The ``in`` operator checks membership by source term."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        assert "林风" in glossary
        assert "火球术" not in glossary

    def test_save_to_file(self):
        """save_to_file() writes the entries as a JSON list of objects."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            glossary.save_to_file(filepath)

            # Verify file exists and contains correct data
            assert filepath.exists()
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            assert len(data) == 2
            assert data[0]["source"] == "林风"
            assert data[0]["target"] == "Lin Feng"
            assert data[0]["category"] == "character"

    def test_load_from_file(self):
        """load_from_file() populates the glossary from a JSON list."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"

            # Create test JSON file
            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    "category": "character",
                    "context": "Main protagonist",
                },
                {
                    "source": "火球术",
                    "target": "Fireball",
                    "category": "skill",
                    "context": "",
                },
            ]
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(test_data, f, ensure_ascii=False)

            # Load and verify
            glossary = Glossary()
            glossary.load_from_file(filepath)

            assert len(glossary) == 2
            assert "林风" in glossary
            assert glossary.get("林风").target == "Lin Feng"
            assert glossary.get("林风").context == "Main protagonist"
            assert glossary.get("火球术").category == TermCategory.SKILL

    def test_load_from_file_clears_existing_entries(self):
        """Loading replaces whatever entries the glossary held before."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "火球术",
                    "target": "Fireball",
                    "category": "skill",
                    "context": "",
                }
            ]
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(test_data, f)

            glossary.load_from_file(filepath)

            # Old entry should be gone
            assert "林风" not in glossary
            # New entry should be present
            assert "火球术" in glossary

    def test_save_and_load_roundtrip(self):
        """Saving and then loading preserves every entry's data."""
        original = Glossary()
        original.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER, "Protagonist"))
        original.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
        original.add(GlossaryEntry("东方大陆", "Eastern Continent", TermCategory.LOCATION))

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            original.save_to_file(filepath)

            loaded = Glossary()
            loaded.load_from_file(filepath)

            # Verify all entries preserved
            assert len(loaded) == len(original)
            assert loaded.get("林风").target == "Lin Feng"
            assert loaded.get("林风").context == "Protagonist"
            assert loaded.get("火球术").category == TermCategory.SKILL
            assert loaded.get("东方大陆").target == "Eastern Continent"

    def test_save_to_file_creates_parent_directories(self):
        """save_to_file() creates missing parent directories of the target path."""
        glossary = Glossary()
        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        with tempfile.TemporaryDirectory() as tmpdir:
            # Create a nested path that doesn't exist
            filepath = Path(tmpdir) / "nested" / "dir" / "glossary.json"
            glossary.save_to_file(filepath)

            assert filepath.exists()

    def test_load_from_file_not_found(self):
        """Loading a path that does not exist raises FileNotFoundError."""
        glossary = Glossary()
        with tempfile.TemporaryDirectory() as tmpdir:
            # A name inside a freshly created directory is guaranteed to be
            # absent, independent of the host's filesystem layout.
            missing = Path(tmpdir) / "glossary.json"
            with pytest.raises(FileNotFoundError):
                glossary.load_from_file(missing)

    def test_load_from_file_invalid_json(self):
        """Loading a file that is not valid JSON raises json.JSONDecodeError."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "invalid.json"
            # Explicit encoding keeps this fixture consistent with the other
            # file writes in this class instead of using the platform default.
            with open(filepath, "w", encoding="utf-8") as f:
                f.write("not valid json {]")

            glossary = Glossary()
            with pytest.raises(json.JSONDecodeError):
                glossary.load_from_file(filepath)

    def test_load_from_file_invalid_category(self):
        """An unrecognised category string falls back to TermCategory.OTHER."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    "category": "invalid_category",
                    "context": "",
                }
            ]
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(test_data, f)

            glossary = Glossary()
            glossary.load_from_file(filepath)

            # Should default to OTHER
            assert glossary.get("林风").category == TermCategory.OTHER

    def test_load_from_file_missing_optional_fields(self):
        """Entries without category/context load with default values."""
        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "glossary.json"
            test_data = [
                {
                    "source": "林风",
                    "target": "Lin Feng",
                    # Missing category and context
                }
            ]
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(test_data, f)

            glossary = Glossary()
            glossary.load_from_file(filepath)

            # Should use defaults
            assert glossary.get("林风").category == TermCategory.OTHER
            assert glossary.get("林风").context == ""

    def test_save_to_file_empty_glossary(self):
        """An empty glossary is saved as an empty JSON list."""
        glossary = Glossary()

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "empty.json"
            glossary.save_to_file(filepath)

            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            assert data == []
class TestGlossaryMatcher:
    """Checks for GlossaryMatcher: match discovery, priority and placeholders."""

    def test_find_single_term(self):
        """A single glossary term is reported with its translation and span."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        found = GlossaryMatcher(terms).find_matches("林风释放了火球术")

        assert len(found) == 1
        hit = found[0]
        assert (hit.source, hit.target) == ("林风", "Lin Feng")
        assert (hit.start, hit.end) == (0, 2)

    def test_longest_term_priority(self):
        """The longer term wins where a shorter term is its prefix."""
        terms = Glossary()
        for source, target, category in (
            ("魔法", "Magic", TermCategory.OTHER),
            ("魔法师", "Mage", TermCategory.CHARACTER),
        ):
            terms.add(GlossaryEntry(source, target, category))

        found = GlossaryMatcher(terms).find_matches("魔法师使用了魔法")

        # "魔法师" is matched as a whole; the trailing standalone "魔法"
        # is the only place the shorter term is reported.
        assert len(found) == 2
        assert found[0].source == "魔法师"
        assert found[1].source == "魔法"

    def test_placeholder_generation(self):
        """Matched terms are tagged and recorded in the placeholder mapping."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        engine = GlossaryMatcher(terms)

        tagged_text, placeholder_map = engine.replace_with_placeholder("林风来了")

        assert tagged_text == "__en__林风来了"
        assert placeholder_map == {"__en__林风": "Lin Feng"}

    def test_non_overlapping_matches(self):
        """A term nested inside a longer match is not reported separately."""
        terms = Glossary()
        for source, target in (("林风", "Lin Feng"), ("林", "Lin")):
            terms.add(GlossaryEntry(source, target, TermCategory.CHARACTER))

        found = GlossaryMatcher(terms).find_matches("林风走了")

        # Only "林风" is expected, not the "林" contained in it.
        assert len(found) == 1
        assert found[0].source == "林风"
class TestGlossaryPreprocessor:
    """Checks for GlossaryPreprocessor on single texts and batches."""

    def test_process_text_with_terms(self):
        """Each glossary term in the text is tagged and counted once."""
        terms = Glossary()
        for source, target, category in (
            ("林风", "Lin Feng", TermCategory.CHARACTER),
            ("火球术", "Fireball", TermCategory.SKILL),
        ):
            terms.add(GlossaryEntry(source, target, category))

        outcome = GlossaryPreprocessor(terms).process("林风释放了火球术")

        assert outcome.processed_text == "__en__林风释放了__en__火球术"
        assert outcome.terms_found["林风"] == 1
        assert outcome.terms_found["火球术"] == 1

    def test_batch_processing(self):
        """process_batch() yields one tagged result per input text."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        pipeline = GlossaryPreprocessor(terms)

        batch = pipeline.process_batch(["林风来了", "林风走了"])

        assert len(batch) == 2
        for position in (0, 1):
            assert "__en__林风" in batch[position].processed_text
class TestGlossaryPostprocessor:
    """Checks for GlossaryPostprocessor: restoring, tidying and validating."""

    def test_restore_from_placeholder(self):
        """Placeholders in translated text are swapped for their targets."""
        placeholder_map = {"__en__林风": "Lin Feng", "__en__火球术": "Fireball"}
        translated = "__en__林风 released __en__火球术"

        restored = GlossaryPostprocessor().restore_from_placeholder(
            translated, placeholder_map
        )

        assert restored == "Lin Feng released Fireball"

    def test_fix_punctuation(self):
        """Spacing around punctuation is normalised."""
        fixer = GlossaryPostprocessor()

        # A space in front of a full stop is dropped.
        assert fixer.fix_punctuation("Lin Feng .") == "Lin Feng."
        # A comma directly followed by Chinese text gains a trailing space.
        assert fixer.fix_punctuation("Lin Feng,走了") == "Lin Feng, 走了"

    def test_validate_translation_success(self):
        """Validation passes when the expected target appears in the output."""
        placeholder_map = {"__en__林风": "Lin Feng"}

        verdict = GlossaryPostprocessor().validate_translation(
            "林风来了", "Lin Feng came", placeholder_map
        )

        assert verdict.is_valid is True
        assert len(verdict.missing_terms) == 0

    def test_validate_translation_missing_terms(self):
        """Validation fails when the expected target is absent from the output."""
        placeholder_map = {"__en__林风": "Lin Feng"}

        verdict = GlossaryPostprocessor().validate_translation(
            "林风来了", "Lin came", placeholder_map
        )

        assert verdict.is_valid is False
class TestGlossaryIntegration:
    """End-to-end and cross-component checks for the glossary module."""

    def test_full_pipeline(self):
        """Preprocess, simulate a translation, then postprocess the result."""
        # Glossary shared by both stages
        terms = Glossary()
        for source, target, category in (
            ("林风", "Lin Feng", TermCategory.CHARACTER),
            ("火球术", "Fireball", TermCategory.SKILL),
        ):
            terms.add(GlossaryEntry(source, target, category))

        # Stage 1: tag the terms in the source sentence
        source_text = "林风释放了火球术"
        tagged = GlossaryPreprocessor(terms).process(source_text)
        assert tagged.processed_text == "__en__林风释放了__en__火球术"

        # Stand-in for the translation engine's output
        fake_translation = "__en__林风 released __en__火球术"

        # Stage 2: swap placeholders for their glossary targets
        restored = GlossaryPostprocessor().process(
            fake_translation, tagged.placeholder_map
        )
        assert restored == "Lin Feng released Fireball"

    def test_phase_0_validation_scenario(self):
        """Compare preprocessing with an empty glossary and a populated one."""
        sentence = "林风释放了火球术"

        # Baseline: an empty glossary stands in for "no glossary"
        baseline = GlossaryPreprocessor(Glossary()).process(sentence)
        assert baseline.placeholder_map == {}
        assert baseline.terms_found == {}

        # Populated glossary: both terms should be picked up
        populated = Glossary()
        populated.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        populated.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
        enriched = GlossaryPreprocessor(populated).process(sentence)

        assert len(enriched.placeholder_map) == 2
        assert enriched.terms_found["林风"] == 1
        assert enriched.terms_found["火球术"] == 1

    def test_retention_rate_calculation(self):
        """The reported retention rate lies within 0..100."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        pipeline = GlossaryPreprocessor(terms)

        source_text = "林风释放了火球术"
        outcome = pipeline.process(source_text)

        assert 0 <= outcome.retention_rate <= 100

    def test_empty_string_retention_rate(self):
        """Two empty strings give a retention rate of 100.0."""
        pipeline = GlossaryPreprocessor(Glossary())

        assert pipeline.calculate_retention_rate("", "") == 100.0

    def test_matcher_restore_from_placeholder(self):
        """GlossaryMatcher.restore_from_placeholder swaps in the target text."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
        engine = GlossaryMatcher(terms)

        placeholder_map = {"__en__林风": "Lin Feng"}
        restored = engine.restore_from_placeholder(
            "__en__林风 came here", placeholder_map
        )

        assert restored == "Lin Feng came here"

    def test_glossary_entry_validation(self):
        """An empty source or an empty target is rejected with ValueError."""
        for source, target in (("", "Lin Feng"), ("林风", "")):
            with pytest.raises(ValueError):
                GlossaryEntry(source, target, TermCategory.CHARACTER)

    def test_multiple_occurrences_same_term(self):
        """Every occurrence of a repeated term produces its own match."""
        terms = Glossary()
        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        found = GlossaryMatcher(terms).find_matches("林风说,林风知道")

        # Both occurrences are expected, in order.
        assert len(found) == 2
        assert found[0].source == "林风"
        assert found[1].source == "林风"

    def test_postprocessor_clean_language_tags(self):
        """clean_language_tags() strips a leftover ``__en__`` prefix."""
        cleaned = GlossaryPostprocessor().clean_language_tags(
            "__en__ some text here"
        )

        # The tag is gone while the surrounding text survives.
        assert "__en__" not in cleaned
        assert "some text here" in cleaned

    def test_glossary_len_and_contains(self):
        """len() and ``in`` reflect entries as they are added."""
        terms = Glossary()
        assert len(terms) == 0

        terms.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))

        assert len(terms) == 1
        assert "林风" in terms
        assert "不存在" not in terms
|