Răsfoiți Sursa

feat(glossary): Fix preprocessor and postprocessor bugs (Story 4.3 + 4.4)

- Fix preprocessor.py: Correct retention_rate calculation for GlossaryEntry list
- Fix postprocessor.py: Add clean_language_tags() to remove orphaned __en__ prefixes
  * Handles m2m100 translating content within placeholders
  * e.g., "__en__林风" → "__en__Lin" → "Lin"

Tests:
- End-to-end pipeline test passes
- Language tag cleaning works correctly
- Punctuation fixing verified

Part of Epic 4 (P0): Glossary for translation quality
d8dfun 2 zile în urmă
părinte
comite
4672fcf861
2 a modificat fișierele cu 28 adăugiri și 3 ștergeri
  1. 24 0
      src/glossary/postprocessor.py
  2. 4 3
      src/glossary/preprocessor.py

+ 24 - 0
src/glossary/postprocessor.py

@@ -51,7 +51,11 @@ class GlossaryPostprocessor:
         Returns:
             Text with placeholders replaced by translations
         """
+        # First, restore placeholders that survived translation intact
         result = self.restore_from_placeholder(translated_text, placeholder_map)
+        # Then, fix any remaining __en__ prefixes (from translated content)
+        result = self.clean_language_tags(result)
+        # Finally, fix punctuation
         result = self.fix_punctuation(result)
         return result
 
@@ -103,6 +107,26 @@ class GlossaryPostprocessor:
 
         return text
 
def clean_language_tags(self, text: str) -> str:
    """Strip orphaned language-tag prefixes (e.g. ``__en__``) from *text*.

    m2m100 sometimes translates the content *inside* a placeholder,
    leaving the bare language tag behind (e.g. ``__en__Lin``); this
    removes those leftover tags along with any whitespace that follows.

    Args:
        text: The text to clean.

    Returns:
        Text with language-tag prefixes removed.
    """
    # Apply the specific English tag first, then the generic __xx__ form,
    # preserving the original two-pass order.
    cleaned = text
    for tag_pattern in (r"__en__\s*", r"__\w+__\s*"):
        cleaned = re.sub(tag_pattern, "", cleaned)
    return cleaned
+
     def validate_translation(
         self, original: str, translated: str, placeholder_map: Dict[str, str]
     ) -> ValidationResult:

+ 4 - 3
src/glossary/preprocessor.py

@@ -115,9 +115,10 @@ class GlossaryPreprocessor:
         # Count placeholder characters added
         placeholder_chars = processed.count(placeholder_prefix) * len(placeholder_prefix)
 
-        # Retention = (original length - replaced length) / original length * 100
-        # But we want to show how much of the original meaning is preserved
-        preserved_chars = len(original) - sum(len(term) for term in self.glossary.get_all())
+        # Calculate how many characters were replaced by placeholders
+        all_entries = self.glossary.get_all()
+        replaced_chars = sum(len(entry.source) for entry in all_entries)
+        preserved_chars = len(original) - replaced_chars
         total_chars = len(original)
 
         if total_chars == 0: