|
|
@@ -51,7 +51,11 @@ class GlossaryPostprocessor:
|
|
|
Returns:
|
|
|
Text with placeholders replaced by translations
|
|
|
"""
|
|
|
+ # First, restore placeholders that survived translation intact
|
|
|
result = self.restore_from_placeholder(translated_text, placeholder_map)
|
|
|
+ # Then, fix any remaining __en__ prefixes (from translated content)
|
|
|
+ result = self.clean_language_tags(result)
|
|
|
+ # Finally, fix punctuation
|
|
|
result = self.fix_punctuation(result)
|
|
|
return result
|
|
|
|
|
|
@@ -103,6 +107,26 @@ class GlossaryPostprocessor:
|
|
|
|
|
|
return text
|
|
|
|
|
|
def clean_language_tags(self, text: str) -> str:
    """
    Strip leftover language-tag prefixes (e.g. ``__en__``) from text.

    m2m100 sometimes translates the content inside a placeholder and
    leaves an orphaned language tag behind (``__en__Lin``,
    ``__en__Qingyun``); this removes those markers.

    Args:
        text: The text to clean.

    Returns:
        Text with language-tag prefixes removed.
    """
    # Drop the common English tag (and any whitespace right after it)
    # first, then sweep up any other __xx__-style tag that remains.
    # NOTE(review): the generic pattern also matches dunder tokens such
    # as __init__ if they ever occur in translated text — confirm that
    # is acceptable for this corpus.
    without_en = re.sub(r"__en__\s*", "", text)
    return re.sub(r"__\w+__\s*", "", without_en)
|
|
|
def validate_translation(
|
|
|
self, original: str, translated: str, placeholder_map: Dict[str, str]
|
|
|
) -> ValidationResult:
|