2 zile în urmă · 7e18baa8f2
--- a/src/glossary/__init__.py
+++ b/src/glossary/__init__.py
@@ -7,7 +7,9 @@ to ensure consistent translations of proper nouns and character names.
 
				 
			
 
				 from .models import Glossary, GlossaryEntry
			
 
				 from .matcher import GlossaryMatcher, TermMatch
			
 
				+from .pipeline import GlossaryPipeline
			
 
				 from .preprocessor import GlossaryPreprocessor, PreprocessingResult
			
 
				+from .post_process import PostProcessor
			
 
				 from .postprocessor import GlossaryPostprocessor
			
 
				 
			
 
				 __all__ = [
			
@@ -15,7 +17,9 @@ __all__ = [
 
				     "GlossaryEntry",
			
 
				     "GlossaryMatcher",
			
 
				     "TermMatch",
			
 
				+    "GlossaryPipeline",
			
 
				     "GlossaryPreprocessor",
			
 
				     "PreprocessingResult",
			
 
				+    "PostProcessor",
			
 
				     "GlossaryPostprocessor",
			
 
				 ]
			
--- a/src/glossary/__pycache__/__init__.cpython-311.pyc
+++ b/src/glossary/__pycache__/__init__.cpython-311.pyc
--- a/src/glossary/__pycache__/matcher.cpython-311.pyc
+++ b/src/glossary/__pycache__/matcher.cpython-311.pyc
--- a/src/glossary/__pycache__/models.cpython-311.pyc
+++ b/src/glossary/__pycache__/models.cpython-311.pyc
--- a/src/glossary/__pycache__/pipeline.cpython-311.pyc
+++ b/src/glossary/__pycache__/pipeline.cpython-311.pyc
--- a/src/glossary/__pycache__/post_process.cpython-311.pyc
+++ b/src/glossary/__pycache__/post_process.cpython-311.pyc
--- a/src/glossary/__pycache__/postprocessor.cpython-311.pyc
+++ b/src/glossary/__pycache__/postprocessor.cpython-311.pyc
--- a/src/glossary/__pycache__/preprocessor.cpython-311.pyc
+++ b/src/glossary/__pycache__/preprocessor.cpython-311.pyc
--- a/src/glossary/pipeline.py
+++ b/src/glossary/pipeline.py
@@ -0,0 +1,75 @@
 
				+"""
			
 
				+Glossary preprocessing pipeline.
			
 
				+
			
 
				+This module provides a unified pipeline for terminology preprocessing
			
 
				+that integrates matching, preprocessing, and batch processing.
			
 
				+"""
			
 
				+
			
 
				+from typing import List, Tuple, Dict
			
 
				+
			
 
				+from .models import Glossary
			
 
				+from .matcher import GlossaryMatcher, TermMatch
			
 
				+
			
 
				+
			
 
				+class GlossaryPipeline:
			
 
				+    """
			
 
				+    Unified preprocessing pipeline for terminology management.
			
 
				+
			
 
				+    This pipeline integrates the matcher and preprocessor to provide
			
 
				+    a simple interface for text preprocessing before translation.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, glossary: Glossary):
			
 
				+        """
			
 
				+        Initialize the pipeline with a glossary.
			
 
				+
			
 
				+        Args:
			
 
				+            glossary: The Glossary to use for preprocessing
			
 
				+        """
			
 
				+        self.glossary = glossary
			
 
				+        self.matcher = GlossaryMatcher(glossary)
			
 
				+
			
 
				+    def preprocess(self, text: str) -> Tuple[str, List[str]]:
			
 
				+        """
			
 
				+        Preprocess text by replacing terms with placeholders.
			
 
				+
			
 
				+        Args:
			
 
				+            text: The text to preprocess
			
 
				+
			
 
				+        Returns:
			
 
				+            Tuple of (processed_text, list_of_terms_used)
			
 
				+        """
			
 
				+        matches = self.matcher.find_matches(text)
			
 
				+        terms_used = list(set(match.source for match in matches))
			
 
				+
			
 
				+        processed_text, _ = self.matcher.replace_with_placeholder(text)
			
 
				+
			
 
				+        return processed_text, terms_used
			
 
				+
			
 
				+    def batch_preprocess(self, texts: List[str]) -> List[Tuple[str, List[str]]]:
			
 
				+        """
			
 
				+        Preprocess multiple texts in batch.
			
 
				+
			
 
				+        Args:
			
 
				+            texts: List of texts to preprocess
			
 
				+
			
 
				+        Returns:
			
 
				+            List of tuples (processed_text, list_of_terms_used)
			
 
				+        """
			
 
				+        return [self.preprocess(text) for text in texts]
			
 
				+
			
 
				+    def get_statistics(self, text: str) -> Dict[str, int]:
			
 
				+        """
			
 
				+        Get statistics about terminology usage in text.
			
 
				+
			
 
				+        Args:
			
 
				+            text: The text to analyze
			
 
				+
			
 
				+        Returns:
			
 
				+            Dictionary mapping term names to occurrence counts
			
 
				+        """
			
 
				+        matches = self.matcher.find_matches(text)
			
 
				+        stats = {}
			
 
				+        for match in matches:
			
 
				+            stats[match.source] = stats.get(match.source, 0) + 1
			
 
				+        return stats
			
--- a/src/glossary/post_process.py
+++ b/src/glossary/post_process.py
@@ -0,0 +1,121 @@
 
				+"""
			
 
				+Post-processing utilities for cleaning translated text.
			
 
				+
			
 
				+This module provides simple post-processing functions to fix common
			
 
				+issues in machine translation output, such as language tag prefixes
			
 
				+and punctuation problems.
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from typing import Dict
			
 
				+
			
 
				+
			
 
				+class PostProcessor:
			
 
				+    """
			
 
				+    Post-processor for cleaning translated text.
			
 
				+
			
 
				+    This class provides static methods to fix common issues in
			
 
				+    machine translation output, including language tag prefixes
			
 
				+    and punctuation problems.
			
 
				+    """
			
 
				+
			
 
				+    # Pattern for language tag prefixes like __en__, __zh__, etc.
			
 
				+    LANG_TAG_PATTERN = re.compile(r'__\w+__\s*')
			
 
				+
			
 
				+    # Pattern for multiple consecutive dots
			
 
				+    MULTI_DOT_PATTERN = re.compile(r'\.{2,}')
			
 
				+
			
 
				+    # Pattern for space before punctuation
			
 
				+    SPACE_BEFORE_PUNCT_PATTERN = re.compile(r'\s+([.,!?;:)\]])')
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def remove_lang_tags(text: str) -> str:
			
 
				+        """
			
 
				+        Remove language tag prefixes like __en__, __zh__ from text.
			
 
				+
			
 
				+        These tags are sometimes left behind by the m2m100 model
			
 
				+        after translation.
			
 
				+
			
 
				+        Args:
			
 
				+            text: The text to clean
			
 
				+
			
 
				+        Returns:
			
 
				+            Text with language tags removed
			
 
				+        """
			
 
				+        # Remove language tags at the start of the text
			
 
				+        text = PostProcessor.LANG_TAG_PATTERN.sub('', text)
			
 
				+        return text.strip()
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def fix_punctuation(text: str) -> str:
			
 
				+        """
			
 
				+        Fix punctuation issues in translated text.
			
 
				+
			
 
				+        Fixes:
			
 
				+        - Multiple consecutive dots (e.g., "..." → ".")
			
 
				+        - Spaces before punctuation (e.g., "Lin Feng ." → "Lin Feng.")
			
 
				+
			
 
				+        Args:
			
 
				+            text: The text to fix
			
 
				+
			
 
				+        Returns:
			
 
				+            Text with fixed punctuation
			
 
				+        """
			
 
				+        # Fix multiple consecutive dots first
			
 
				+        text = PostProcessor.MULTI_DOT_PATTERN.sub('.', text)
			
 
				+
			
 
				+        # Remove space before common punctuation
			
 
				+        text = PostProcessor.SPACE_BEFORE_PUNCT_PATTERN.sub(r'\1', text)
			
 
				+
			
 
				+        return text.strip()
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def process(text: str, placeholder_map: Dict[str, str] = None) -> str:
			
 
				+        """
			
 
				+        Execute all post-processing steps on translated text.
			
 
				+
			
 
				+        This method applies all post-processing fixes in sequence:
			
 
				+        1. Restore placeholders (if map provided)
			
 
				+        2. Remove language tags
			
 
				+        3. Fix punctuation
			
 
				+
			
 
				+        Args:
			
 
				+            text: The translated text to process
			
 
				+            placeholder_map: Optional mapping of placeholders to translations
			
 
				+
			
 
				+        Returns:
			
 
				+            Cleaned text
			
 
				+        """
			
 
				+        result = text
			
 
				+
			
 
				+        # Restore placeholders if map is provided
			
 
				+        if placeholder_map:
			
 
				+            for placeholder, translation in placeholder_map.items():
			
 
				+                result = result.replace(placeholder, translation)
			
 
				+
			
 
				+        # Remove language tags
			
 
				+        result = PostProcessor.remove_lang_tags(result)
			
 
				+
			
 
				+        # Fix punctuation
			
 
				+        result = PostProcessor.fix_punctuation(result)
			
 
				+
			
 
				+        return result
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def clean_whitespace(text: str) -> str:
			
 
				+        """
			
 
				+        Clean up whitespace issues in translated text.
			
 
				+
			
 
				+        Fixes:
			
 
				+        - Multiple consecutive spaces → single space
			
 
				+        - Leading/trailing whitespace
			
 
				+
			
 
				+        Args:
			
 
				+            text: The text to clean
			
 
				+
			
 
				+        Returns:
			
 
				+            Text with cleaned whitespace
			
 
				+        """
			
 
				+        # Replace multiple spaces with single space
			
 
				+        text = re.sub(r'\s+', ' ', text)
			
 
				+        return text.strip()
			
--- a/src/glossary/postprocessor.py
+++ b/src/glossary/postprocessor.py
@@ -87,6 +87,7 @@ class GlossaryPostprocessor:
 
				 
			
 
				         Fixes:
			
 
				         - Spaces before punctuation (e.g., "Lin Feng ." → "Lin Feng.")
			
 
				+        - Multiple consecutive dots (e.g., "..." → ".")
			
 
				         - Chinese punctuation after English (e.g., "Lin Feng，" → "Lin Feng,")
			
 
				 
			
 
				         Args:
			
@@ -95,6 +96,9 @@ class GlossaryPostprocessor:
 
				         Returns:
			
 
				             Text with fixed punctuation
			
 
				         """
			
 
				+        # Fix multiple consecutive dots (common m2m100 output issue)
			
 
				+        text = re.sub(r"\.{2,}", ".", text)
			
 
				+
			
 
				         # Remove space before common punctuation
			
 
				         text = re.sub(r"\s+([.,!?;:)])", r"\1", text)