|
|
@@ -51,7 +51,11 @@ class GlossaryPostprocessor:
|
|
|
Returns:
|
|
|
Text with placeholders replaced by translations
|
|
|
"""
|
|
|
+ # First, restore placeholders that survived translation intact
|
|
|
result = self.restore_from_placeholder(translated_text, placeholder_map)
|
|
|
+ # Then, fix any remaining __en__ prefixes (from translated content)
|
|
|
+ result = self.clean_language_tags(result)
|
|
|
+ # Finally, fix punctuation
|
|
|
result = self.fix_punctuation(result)
|
|
|
return result
|
|
|
|
|
|
@@ -103,6 +107,26 @@ class GlossaryPostprocessor:
|
|
|
|
|
|
return text
|
|
|
|
|
|
def clean_language_tags(self, text: str) -> str:
    """
    Strip leftover language-tag prefixes (e.g. ``__en__``) from text.

    m2m100 sometimes translates the content inside a placeholder and
    leaves an orphaned language tag behind (``__en__Lin``,
    ``__en__Qingyun``); this removes those markers.

    Args:
        text: The text to clean.

    Returns:
        Text with language-tag prefixes removed.
    """
    # Drop the common English tag (and any whitespace right after it)
    # first, then sweep up any other __xx__-style tag that remains.
    # NOTE(review): the generic pattern also matches dunder tokens such
    # as __init__ if they ever occur in translated text — confirm that
    # is acceptable for this corpus.
    without_en = re.sub(r"__en__\s*", "", text)
    return re.sub(r"__\w+__\s*", "", without_en)
|
|
|
def validate_translation(
|
|
|
self, original: str, translated: str, placeholder_map: Dict[str, str]
|
|
|
) -> ValidationResult:
|