""" Unit tests for the glossary module. Tests cover terminology matching, preprocessing, postprocessing, and integration scenarios. """ import pytest from pathlib import Path import tempfile import json import os from src.glossary.models import Glossary, GlossaryEntry, TermCategory from src.glossary.matcher import GlossaryMatcher, TermMatch from src.glossary.preprocessor import GlossaryPreprocessor from src.glossary.postprocessor import GlossaryPostprocessor class TestGlossary: """Test cases for Glossary class.""" def test_add_and_retrieve_term(self): """Test adding and retrieving a term.""" glossary = Glossary() entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER) glossary.add(entry) retrieved = glossary.get("林风") assert retrieved is not None assert retrieved.source == "林风" assert retrieved.target == "Lin Feng" assert retrieved.category == TermCategory.CHARACTER def test_remove_term(self): """Test removing a term.""" glossary = Glossary() entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER) glossary.add(entry) assert glossary.remove("林风") is True assert glossary.get("林风") is None assert glossary.remove("林风") is False def test_sort_by_length_desc(self): """Test sorting terms by length (longest first).""" glossary = Glossary() glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) glossary.add(GlossaryEntry("三阶魔法师", "Tier 3 Mage", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) sorted_terms = glossary.sort_by_length_desc() assert sorted_terms[0] == "三阶魔法师" # 5 chars assert sorted_terms[1] == "火球术" # 3 chars assert sorted_terms[2] == "林风" # 2 chars def test_get_all(self): """Test getting all terms.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) all_terms = glossary.get_all() assert len(all_terms) == 2 def test_contains_operator(self): """Test the 'in' operator.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) assert "林风" in glossary assert "火球术" not in glossary def test_save_to_file(self): """Test saving glossary to a JSON file.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" glossary.save_to_file(filepath) # Verify file exists and contains correct data assert filepath.exists() with open(filepath, "r", encoding="utf-8") as f: data = json.load(f) assert len(data) == 2 assert data[0]["source"] == "林风" assert data[0]["target"] == "Lin Feng" assert data[0]["category"] == "character" def test_load_from_file(self): """Test loading glossary from a JSON file.""" with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" # Create test JSON file test_data = [ { "source": "林风", "target": "Lin Feng", "category": "character", "context": "Main protagonist" }, { "source": "火球术", "target": "Fireball", "category": "skill", "context": "" } ] with open(filepath, "w", encoding="utf-8") as f: json.dump(test_data, f, ensure_ascii=False) # Load and verify glossary = Glossary() glossary.load_from_file(filepath) assert len(glossary) == 2 assert "林风" in glossary assert glossary.get("林风").target == "Lin Feng" assert glossary.get("林风").context == "Main protagonist" assert glossary.get("火球术").category == TermCategory.SKILL def test_load_from_file_clears_existing_entries(self): """Test that loading from file clears existing entries.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" test_data = [ { "source": "火球术", "target": "Fireball", "category": "skill", "context": "" } ] with open(filepath, "w", encoding="utf-8") as f: json.dump(test_data, f) glossary.load_from_file(filepath) # Old entry should be gone assert "林风" not in glossary # New entry should be present assert "火球术" in glossary def test_save_and_load_roundtrip(self): """Test that save and load preserves all data.""" original = Glossary() original.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER, "Protagonist")) original.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) original.add(GlossaryEntry("东方大陆", "Eastern Continent", TermCategory.LOCATION)) with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" original.save_to_file(filepath) loaded = Glossary() loaded.load_from_file(filepath) # Verify all entries preserved assert len(loaded) == len(original) assert loaded.get("林风").target == "Lin Feng" assert loaded.get("林风").context == "Protagonist" assert loaded.get("火球术").category == TermCategory.SKILL assert loaded.get("东方大陆").target == "Eastern Continent" def test_load_from_file_creates_parent_directories(self): """Test that save_to_file creates parent directories.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) with tempfile.TemporaryDirectory() as tmpdir: # Create a nested path that doesn't exist filepath = Path(tmpdir) / "nested" / "dir" / "glossary.json" glossary.save_to_file(filepath) assert filepath.exists() def test_load_from_file_not_found(self): """Test loading from non-existent file raises error.""" glossary = Glossary() with pytest.raises(FileNotFoundError): glossary.load_from_file(Path("/nonexistent/path/glossary.json")) def test_load_from_file_invalid_json(self): """Test loading from file with invalid JSON raises error.""" with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "invalid.json" with open(filepath, "w") as f: f.write("not valid json {]") glossary = Glossary() with pytest.raises(json.JSONDecodeError): glossary.load_from_file(filepath) def test_load_from_file_invalid_category(self): """Test that invalid category defaults to OTHER.""" with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" test_data = [ { "source": "林风", "target": "Lin Feng", "category": "invalid_category", "context": "" } ] with open(filepath, "w", encoding="utf-8") as f: json.dump(test_data, f) glossary = Glossary() glossary.load_from_file(filepath) # Should default to OTHER assert glossary.get("林风").category == TermCategory.OTHER def test_load_from_file_missing_optional_fields(self): """Test loading entries with missing optional fields.""" with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "glossary.json" test_data = [ { "source": "林风", "target": "Lin Feng" # Missing category and context } ] with open(filepath, "w", encoding="utf-8") as f: json.dump(test_data, f) glossary = Glossary() glossary.load_from_file(filepath) # Should use defaults assert glossary.get("林风").category == TermCategory.OTHER assert glossary.get("林风").context == "" def test_save_to_file_empty_glossary(self): """Test saving an empty glossary.""" glossary = Glossary() with tempfile.TemporaryDirectory() as tmpdir: filepath = Path(tmpdir) / "empty.json" glossary.save_to_file(filepath) with open(filepath, "r", encoding="utf-8") as f: data = json.load(f) assert data == [] class TestGlossaryMatcher: """Test cases for GlossaryMatcher.""" def test_find_single_term(self): """Test finding a single term in text.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) matches = matcher.find_matches("林风释放了火球术") assert len(matches) == 1 assert matches[0].source == "林风" assert matches[0].target == "Lin Feng" assert matches[0].start == 0 assert matches[0].end == 2 def test_longest_term_priority(self): """Test that longer terms are matched first.""" glossary = Glossary() glossary.add(GlossaryEntry("魔法", "Magic", TermCategory.OTHER)) glossary.add(GlossaryEntry("魔法师", "Mage", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) matches = matcher.find_matches("魔法师使用了魔法") # Should match "魔法师" but not the "魔法" within it assert len(matches) == 2 assert matches[0].source == "魔法师" assert matches[1].source == "魔法" def test_placeholder_generation(self): """Test placeholder generation.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) processed, mapping = matcher.replace_with_placeholder("林风来了") assert processed == "__en__林风来了" assert mapping == {"__en__林风": "Lin Feng"} def test_non_overlapping_matches(self): """Test that matches don't overlap.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("林", "Lin", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) matches = matcher.find_matches("林风走了") # Should only match "林风", not "林" within it assert len(matches) == 1 assert matches[0].source == "林风" class TestGlossaryPreprocessor: """Test cases for GlossaryPreprocessor.""" def test_process_text_with_terms(self): """Test processing text with terminology.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) preprocessor = GlossaryPreprocessor(glossary) result = preprocessor.process("林风释放了火球术") assert result.processed_text == "__en__林风释放了__en__火球术" assert result.terms_found["林风"] == 1 assert result.terms_found["火球术"] == 1 def test_batch_processing(self): """Test batch processing of multiple texts.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) preprocessor = GlossaryPreprocessor(glossary) texts = ["林风来了", "林风走了"] results = preprocessor.process_batch(texts) assert len(results) == 2 assert "__en__林风" in results[0].processed_text assert "__en__林风" in results[1].processed_text class TestGlossaryPostprocessor: """Test cases for GlossaryPostprocessor.""" def test_restore_from_placeholder(self): """Test restoring placeholders to translations.""" postprocessor = GlossaryPostprocessor() mapping = {"__en__林风": "Lin Feng", "__en__火球术": "Fireball"} result = postprocessor.restore_from_placeholder("__en__林风 released __en__火球术", mapping) assert result == "Lin Feng released Fireball" def test_fix_punctuation(self): """Test punctuation fixing.""" postprocessor = GlossaryPostprocessor() # Remove space before punctuation assert postprocessor.fix_punctuation("Lin Feng .") == "Lin Feng." # Fix Chinese comma after English assert postprocessor.fix_punctuation("Lin Feng,走了") == "Lin Feng, 走了" def test_validate_translation_success(self): """Test successful validation.""" postprocessor = GlossaryPostprocessor() mapping = {"__en__林风": "Lin Feng"} result = postprocessor.validate_translation("林风来了", "Lin Feng came", mapping) assert result.is_valid is True assert len(result.missing_terms) == 0 def test_validate_translation_missing_terms(self): """Test validation with missing terms.""" postprocessor = GlossaryPostprocessor() mapping = {"__en__林风": "Lin Feng"} result = postprocessor.validate_translation("林风来了", "Lin came", mapping) assert result.is_valid is False class TestGlossaryIntegration: """Integration tests for the glossary module.""" def test_full_pipeline(self): """Test complete preprocessing and postprocessing pipeline.""" # Setup glossary glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) # Preprocess preprocessor = GlossaryPreprocessor(glossary) original = "林风释放了火球术" pre_result = preprocessor.process(original) assert pre_result.processed_text == "__en__林风释放了__en__火球术" # Simulate translation mock_translated = "__en__林风 released __en__火球术" # Postprocess postprocessor = GlossaryPostprocessor() final = postprocessor.process(mock_translated, pre_result.placeholder_map) assert final == "Lin Feng released Fireball" def test_phase_0_validation_scenario(self): """Test the Phase 0 validation scenario.""" # Without glossary (simulated by empty glossary) empty_glossary = Glossary() preprocessor = GlossaryPreprocessor(empty_glossary) result = preprocessor.process("林风释放了火球术") # No placeholders added assert result.placeholder_map == {} assert result.terms_found == {} # With glossary full_glossary = Glossary() full_glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) full_glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL)) preprocessor = GlossaryPreprocessor(full_glossary) result = preprocessor.process("林风释放了火球术") # Placeholders added assert len(result.placeholder_map) == 2 assert result.terms_found["林风"] == 1 assert result.terms_found["火球术"] == 1 def test_retention_rate_calculation(self): """Test retention rate calculation.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) preprocessor = GlossaryPreprocessor(glossary) original = "林风释放了火球术" result = preprocessor.process(original) # Retention rate should be calculated assert 0 <= result.retention_rate <= 100 def test_empty_string_retention_rate(self): """Test retention rate with empty string.""" glossary = Glossary() preprocessor = GlossaryPreprocessor(glossary) # Empty string should return 100% retention rate = preprocessor.calculate_retention_rate("", "") assert rate == 100.0 def test_matcher_restore_from_placeholder(self): """Test GlossaryMatcher.restore_from_placeholder method.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) text = "__en__林风 came here" mapping = {"__en__林风": "Lin Feng"} result = matcher.restore_from_placeholder(text, mapping) assert result == "Lin Feng came here" def test_glossary_entry_validation(self): """Test GlossaryEntry validation.""" with pytest.raises(ValueError): GlossaryEntry("", "Lin Feng", TermCategory.CHARACTER) with pytest.raises(ValueError): GlossaryEntry("林风", "", TermCategory.CHARACTER) def test_multiple_occurrences_same_term(self): """Test matching the same term multiple times.""" glossary = Glossary() glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) matcher = GlossaryMatcher(glossary) matches = matcher.find_matches("林风说,林风知道") # Should find both occurrences assert len(matches) == 2 assert matches[0].source == "林风" assert matches[1].source == "林风" def test_postprocessor_clean_language_tags(self): """Test clean_language_tags method.""" postprocessor = GlossaryPostprocessor() # Clean orphaned __en__ prefixes result = postprocessor.clean_language_tags("__en__ some text here") assert "__en__" not in result assert "some text here" in result def test_glossary_len_and_contains(self): """Test __len__ and __contains__ methods.""" glossary = Glossary() assert len(glossary) == 0 glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)) assert len(glossary) == 1 assert "林风" in glossary assert "不存在" not in glossary