|
|
@@ -0,0 +1,121 @@
|
|
|
+"""
|
|
|
+Post-processing utilities for cleaning translated text.
|
|
|
+
|
|
|
+This module provides simple post-processing functions to fix common
|
|
|
+issues in machine translation output, such as language tag prefixes
|
|
|
+and punctuation problems.
|
|
|
+"""
|
|
|
+
|
|
|
import re
from typing import Dict, Optional
|
|
|
+
|
|
|
+
|
|
|
class PostProcessor:
    """
    Post-processor for cleaning translated text.

    Groups stateless helpers (all static methods) that repair common
    artifacts in machine translation output: leftover language tags,
    punctuation glitches and irregular whitespace.
    """

    # Language tags such as __en__ or __zh__, plus any whitespace that
    # follows them. The code is restricted to 2-3 ASCII letters so that
    # other double-underscore tokens (e.g. __init__ or __PLACEHOLDER__)
    # are not deleted by accident.
    LANG_TAG_PATTERN = re.compile(r'__[A-Za-z]{2,3}__\s*')

    # Two or more consecutive dots.
    MULTI_DOT_PATTERN = re.compile(r'\.{2,}')

    # Whitespace directly before closing punctuation; group 1 is the
    # punctuation character that must be kept.
    SPACE_BEFORE_PUNCT_PATTERN = re.compile(r'\s+([.,!?;:)\]])')

    # Any run of whitespace (spaces, tabs, newlines).
    WHITESPACE_PATTERN = re.compile(r'\s+')

    @staticmethod
    def remove_lang_tags(text: str) -> str:
        """
        Remove language tags like __en__ or __zh__ from text.

        These tags are sometimes left behind by the m2m100 model after
        translation. Every occurrence is removed, not only a leading
        one, together with the whitespace that follows it.

        Args:
            text: The text to clean

        Returns:
            Text with language tags removed and outer whitespace stripped
        """
        without_tags = PostProcessor.LANG_TAG_PATTERN.sub('', text)
        return without_tags.strip()

    @staticmethod
    def fix_punctuation(text: str) -> str:
        """
        Fix punctuation issues in translated text.

        Fixes:
            - Multiple consecutive dots (e.g., "..." -> ".")
            - Spaces before punctuation (e.g., "Lin Feng ." -> "Lin Feng.")

        Args:
            text: The text to fix

        Returns:
            Text with fixed punctuation and outer whitespace stripped
        """
        # Collapse dot runs first so that "word ..." ends up as "word."
        # once the space before the remaining dot is removed.
        text = PostProcessor.MULTI_DOT_PATTERN.sub('.', text)
        text = PostProcessor.SPACE_BEFORE_PUNCT_PATTERN.sub(r'\1', text)
        return text.strip()

    @staticmethod
    def process(
        text: str,
        placeholder_map: Optional[Dict[str, str]] = None,
    ) -> str:
        """
        Execute the standard post-processing steps on translated text.

        Steps, in order:
            1. Restore placeholders (if a non-empty map is provided)
            2. Remove language tags
            3. Fix punctuation

        Placeholders are restored in a single pass, preferring the
        longest key at each position. A key that is contained in another
        key (e.g. "PH1" and "PH10") therefore cannot corrupt it, and
        restored text is never substituted a second time. Empty keys are
        ignored.

        Args:
            text: The translated text to process
            placeholder_map: Optional mapping of placeholders to translations

        Returns:
            Cleaned text
        """
        result = text

        if placeholder_map:
            # Longest first: regex alternation takes the first alternative
            # that matches, so longer keys must precede their prefixes.
            keys = sorted(
                (key for key in placeholder_map if key),
                key=len,
                reverse=True,
            )
            if keys:
                restore = re.compile('|'.join(re.escape(key) for key in keys))
                result = restore.sub(
                    lambda match: placeholder_map[match.group(0)],
                    result,
                )

        result = PostProcessor.remove_lang_tags(result)
        return PostProcessor.fix_punctuation(result)

    @staticmethod
    def clean_whitespace(text: str) -> str:
        """
        Clean up whitespace issues in translated text.

        Fixes:
            - Any run of whitespace (spaces, tabs, newlines) -> single space
            - Leading/trailing whitespace

        Args:
            text: The text to clean

        Returns:
            Text with cleaned whitespace
        """
        return PostProcessor.WHITESPACE_PATTERN.sub(' ', text).strip()
|