2
0
فهرست منبع

feat(glossary): Add pipeline and PostProcessor utilities (Story 4.3 + 4.4)

- Add GlossaryPipeline: unified preprocessing interface
- Add PostProcessor: simple static-method utilities for post-processing
- Update __init__.py to export new modules
- Fix punctuation: handle multiple consecutive dots

These provide cleaner alternatives to GlossaryPreprocessor/GlossaryPostprocessor
for simpler use cases.

Part of Epic 4 (P0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
d8dfun 2 روز پیش
والد
کامیت
7e18baa8f2

+ 4 - 0
src/glossary/__init__.py

@@ -7,7 +7,9 @@ to ensure consistent translations of proper nouns and character names.
 
 from .models import Glossary, GlossaryEntry
 from .matcher import GlossaryMatcher, TermMatch
+from .pipeline import GlossaryPipeline
 from .preprocessor import GlossaryPreprocessor, PreprocessingResult
+from .post_process import PostProcessor
 from .postprocessor import GlossaryPostprocessor
 
 __all__ = [
@@ -15,7 +17,9 @@ __all__ = [
     "GlossaryEntry",
     "GlossaryMatcher",
     "TermMatch",
+    "GlossaryPipeline",
     "GlossaryPreprocessor",
     "PreprocessingResult",
+    "PostProcessor",
     "GlossaryPostprocessor",
 ]

BIN
src/glossary/__pycache__/__init__.cpython-311.pyc


BIN
src/glossary/__pycache__/matcher.cpython-311.pyc


BIN
src/glossary/__pycache__/models.cpython-311.pyc


BIN
src/glossary/__pycache__/pipeline.cpython-311.pyc


BIN
src/glossary/__pycache__/post_process.cpython-311.pyc


BIN
src/glossary/__pycache__/postprocessor.cpython-311.pyc


BIN
src/glossary/__pycache__/preprocessor.cpython-311.pyc


+ 75 - 0
src/glossary/pipeline.py

@@ -0,0 +1,75 @@
+"""
+Glossary preprocessing pipeline.
+
+This module provides a unified pipeline for terminology preprocessing
+that integrates matching, preprocessing, and batch processing.
+"""
+
+from typing import List, Tuple, Dict
+
+from .models import Glossary
+from .matcher import GlossaryMatcher, TermMatch
+
+
class GlossaryPipeline:
    """
    Unified preprocessing pipeline for terminology management.

    Wires a GlossaryMatcher around a Glossary to provide a simple
    interface for text preprocessing before translation: glossary terms
    found in the text are replaced with placeholders so the translator
    leaves them intact.
    """

    def __init__(self, glossary: Glossary):
        """
        Initialize the pipeline with a glossary.

        Args:
            glossary: The Glossary to use for preprocessing
        """
        self.glossary = glossary
        self.matcher = GlossaryMatcher(glossary)

    def preprocess(self, text: str) -> Tuple[str, List[str]]:
        """
        Preprocess text by replacing terms with placeholders.

        Args:
            text: The text to preprocess

        Returns:
            Tuple of (processed_text, list_of_terms_used). Terms are
            deduplicated and listed in first-occurrence order.
        """
        matches = self.matcher.find_matches(text)
        # dict.fromkeys deduplicates while preserving insertion order.
        # The previous list(set(...)) made the order of terms_used
        # nondeterministic across runs, which breaks reproducibility
        # and stable test output.
        terms_used = list(dict.fromkeys(match.source for match in matches))

        processed_text, _ = self.matcher.replace_with_placeholder(text)

        return processed_text, terms_used

    def batch_preprocess(self, texts: List[str]) -> List[Tuple[str, List[str]]]:
        """
        Preprocess multiple texts in batch.

        Args:
            texts: List of texts to preprocess

        Returns:
            List of tuples (processed_text, list_of_terms_used), one per
            input text, in the same order as the input.
        """
        return [self.preprocess(text) for text in texts]

    def get_statistics(self, text: str) -> Dict[str, int]:
        """
        Get statistics about terminology usage in text.

        Args:
            text: The text to analyze

        Returns:
            Dictionary mapping term source strings to occurrence counts
        """
        stats: Dict[str, int] = {}
        for match in self.matcher.find_matches(text):
            stats[match.source] = stats.get(match.source, 0) + 1
        return stats

+ 121 - 0
src/glossary/post_process.py

@@ -0,0 +1,121 @@
+"""
+Post-processing utilities for cleaning translated text.
+
+This module provides simple post-processing functions to fix common
+issues in machine translation output, such as language tag prefixes
+and punctuation problems.
+"""
+
+import re
+from typing import Dict
+
+
class PostProcessor:
    """
    Post-processor for cleaning translated text.

    This class provides static methods to fix common issues in
    machine translation output, including leftover language tag
    markers and punctuation problems.
    """

    # Language tag markers like "__en__", "__zh__" (plus any trailing
    # whitespace) that m2m100-style models sometimes leave in output.
    LANG_TAG_PATTERN = re.compile(r'__\w+__\s*')

    # Two or more consecutive dots (e.g. "..", "....").
    MULTI_DOT_PATTERN = re.compile(r'\.{2,}')

    # Whitespace immediately before common closing punctuation.
    SPACE_BEFORE_PUNCT_PATTERN = re.compile(r'\s+([.,!?;:)\]])')

    @staticmethod
    def remove_lang_tags(text: str) -> str:
        """
        Remove language tag markers like __en__, __zh__ from text.

        These tags are sometimes left behind by the m2m100 model
        after translation.

        Note: markers are removed wherever they appear in the text,
        not only at the start (the pattern is unanchored).

        Args:
            text: The text to clean

        Returns:
            Text with language tags removed and outer whitespace stripped
        """
        text = PostProcessor.LANG_TAG_PATTERN.sub('', text)
        return text.strip()

    @staticmethod
    def fix_punctuation(text: str) -> str:
        """
        Fix punctuation issues in translated text.

        Fixes:
        - Multiple consecutive dots (e.g., "..." → ".")
        - Spaces before punctuation (e.g., "Lin Feng ." → "Lin Feng.")

        Args:
            text: The text to fix

        Returns:
            Text with fixed punctuation and outer whitespace stripped
        """
        # Collapse dot runs first so "word .." becomes "word ." before
        # the space-before-punctuation pass turns it into "word.".
        text = PostProcessor.MULTI_DOT_PATTERN.sub('.', text)

        # Remove space before common punctuation
        text = PostProcessor.SPACE_BEFORE_PUNCT_PATTERN.sub(r'\1', text)

        return text.strip()

    @staticmethod
    def process(text: str, placeholder_map: Optional[Dict[str, str]] = None) -> str:
        """
        Execute all post-processing steps on translated text.

        This method applies all post-processing fixes in sequence:
        1. Restore placeholders (if map provided)
        2. Remove language tags
        3. Fix punctuation

        Args:
            text: The translated text to process
            placeholder_map: Optional mapping of placeholders to
                translations; None (the default) skips restoration.

        Returns:
            Cleaned text
        """
        result = text

        # Restore placeholders if map is provided
        if placeholder_map:
            for placeholder, translation in placeholder_map.items():
                result = result.replace(placeholder, translation)

        # Remove language tags
        result = PostProcessor.remove_lang_tags(result)

        # Fix punctuation
        result = PostProcessor.fix_punctuation(result)

        return result

    @staticmethod
    def clean_whitespace(text: str) -> str:
        """
        Clean up whitespace issues in translated text.

        Fixes:
        - Runs of any whitespace (spaces, tabs, newlines) → single space
        - Leading/trailing whitespace

        Args:
            text: The text to clean

        Returns:
            Text with cleaned whitespace
        """
        # \s+ also collapses tabs and newlines, not just spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

+ 4 - 0
src/glossary/postprocessor.py

@@ -87,6 +87,7 @@ class GlossaryPostprocessor:
 
         Fixes:
         - Spaces before punctuation (e.g., "Lin Feng ." → "Lin Feng.")
+        - Multiple consecutive dots (e.g., "..." → ".")
         - Chinese punctuation after English (e.g., "Lin Feng，" → "Lin Feng,")
 
         Args:
@@ -95,6 +96,9 @@ class GlossaryPostprocessor:
         Returns:
             Text with fixed punctuation
         """
+        # Fix multiple consecutive dots (common m2m100 output issue)
+        text = re.sub(r"\.{2,}", ".", text)
+
         # Remove space before common punctuation
         text = re.sub(r"\s+([.,!?;:)])", r"\1", text)