"""
Translation quality checker module.

This module provides functionality for checking translation quality by
detecting common issues like missing content, untranslated terms, abnormal
lengths, and duplicate content.
"""

import re
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set
from enum import Enum

logger = logging.getLogger(__name__)

# Runs of characters in the U+4E00..U+9FFF range. Compiled once at import
# time so the untranslated-term check does not rebuild it on every call.
_CHINESE_RUN_PATTERN = re.compile(r'[\u4e00-\u9fff]+')


class QualityIssueType(str, Enum):
    """Types of quality issues."""

    MISSING_CONTENT = "missing_content"
    UNTRANSLATED_TERM = "untranslated_term"
    ABNORMAL_LENGTH = "abnormal_length"
    DUPLICATE_CONTENT = "duplicate_content"
    LINE_COUNT_MISMATCH = "line_count_mismatch"
    EMPTY_TRANSLATION = "empty_translation"


@dataclass
class QualityIssue:
    """
    Represents a quality issue found in translation.

    Attributes:
        issue_type: The type of issue
        location: Location description (e.g., "paragraph 5", "line 10")
        message: Human-readable description
        severity: Issue severity ("error", "warning", "info")
        source_text: The source text that has the issue
        target_text: The translated text with the issue
    """

    issue_type: QualityIssueType
    location: str
    message: str
    severity: str = "warning"
    source_text: str = ""
    target_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for serialization.

        Returns:
            Dictionary with the issue fields; ``source_text`` and
            ``target_text`` are truncated to their first 200 characters.
        """
        return {
            "issue_type": self.issue_type.value,
            "location": self.location,
            "message": self.message,
            "severity": self.severity,
            "source_text": self.source_text[:200],  # Truncate long text
            "target_text": self.target_text[:200],
        }


@dataclass
class QualityReport:
    """
    Report of quality check results.

    Attributes:
        total_issues: Total number of issues found
        error_count: Number of error-level issues
        warning_count: Number of warning-level issues
        info_count: Number of info-level issues
        issues: List of all issues found
        by_type: Breakdown of issues by type
        is_valid: Whether translation passed quality check
    """

    total_issues: int
    error_count: int
    warning_count: int
    info_count: int
    issues: List[QualityIssue] = field(default_factory=list)
    by_type: Dict[str, int] = field(default_factory=dict)
    is_valid: bool = True

    @property
    def errors(self) -> List[QualityIssue]:
        """Get all error-level issues."""
        return [i for i in self.issues if i.severity == "error"]

    @property
    def warnings(self) -> List[QualityIssue]:
        """Get all warning-level issues."""
        return [i for i in self.issues if i.severity == "warning"]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "total_issues": self.total_issues,
            "error_count": self.error_count,
            "warning_count": self.warning_count,
            "info_count": self.info_count,
            "is_valid": self.is_valid,
            "by_type": self.by_type,
            "issues": [i.to_dict() for i in self.issues],
        }

    def generate_report(self) -> str:
        """
        Generate a human-readable report.

        The report lists the summary counters, then the per-type counts in
        sorted key order, then (only if there are issues) one entry per issue
        with its severity, type, location and message.

        Returns:
            Formatted report string
        """
        lines = [
            "=== Translation Quality Report ===",
            f"Valid: {self.is_valid}",
            f"Total Issues: {self.total_issues}",
            f" Errors: {self.error_count}",
            f" Warnings: {self.warning_count}",
            f" Info: {self.info_count}",
            "",
            "Issues by Type:",
        ]

        for issue_type, count in sorted(self.by_type.items()):
            lines.append(f" {issue_type}: {count}")

        if self.issues:
            lines.append("")
            lines.append("Detailed Issues:")
            for issue in self.issues:
                lines.append(f" [{issue.severity.upper()}] {issue.issue_type.value}")
                lines.append(f" Location: {issue.location}")
                lines.append(f" Message: {issue.message}")

        return "\n".join(lines)


class QualityChecker:
    """
    Checker for translation quality.

    This class checks translations for common quality issues including
    missing content, untranslated terms, abnormal lengths, and duplicates.
    """

    # Default thresholds for quality checks
    DEFAULT_MIN_LENGTH_RATIO = 0.3  # Target length should be at least 30% of source
    DEFAULT_MAX_LENGTH_RATIO = 3.0  # Target length should be at most 3x source

    def __init__(
        self,
        min_length_ratio: float = DEFAULT_MIN_LENGTH_RATIO,
        max_length_ratio: float = DEFAULT_MAX_LENGTH_RATIO,
        untranslated_terms: Optional[Set[str]] = None
    ):
        """
        Initialize the quality checker.

        Args:
            min_length_ratio: Minimum acceptable length ratio (target/source)
            max_length_ratio: Maximum acceptable length ratio (target/source)
            untranslated_terms: Set of terms that should always be translated
        """
        self.min_length_ratio = min_length_ratio
        self.max_length_ratio = max_length_ratio
        self.untranslated_terms = untranslated_terms or set()

        # Default terms that should always be translated from Chinese.
        # NOTE(review): none of the checks in this class reads this set; only
        # ``self.untranslated_terms`` is consulted. Confirm whether it should
        # be wired in or whether external code relies on it.
        self._default_chinese_terms = {
            "的", "了", "是", "在", "和", "与", "或", "但", "而",
            "如果", "因为", "所以", "然后", "之后", "之前",
            "已经", "还在", "可以", "应该", "需要", "想要",
            "希望", "觉得", "认为", "知道", "看到", "听到", "说到",
        }

    def check_translation(
        self,
        source: str,
        target: str,
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> QualityReport:
        """
        Perform a comprehensive quality check on translation.

        Runs, in order, the empty-translation, line-count, untranslated-term,
        length-ratio and duplicate-content checks and aggregates their issues.
        The checks are independent, so an empty target is reported by several
        of them at once.

        Args:
            source: Original source text
            target: Translated text
            source_lang: Source language code
            tgt_lang: Target language code (accepted for API symmetry; no
                check currently reads it)

        Returns:
            QualityReport with all issues found. ``is_valid`` is True exactly
            when no issue has severity "error".
        """
        issues: List[QualityIssue] = []

        # Check for empty translation
        issues.extend(self._check_empty(source, target))

        # Check line count mismatch
        issues.extend(self._check_line_count(source, target))

        # Check for untranslated terms
        issues.extend(self._check_untranslated_terms(source, target, source_lang))

        # Check for abnormal length
        issues.extend(self._check_length_ratio(source, target))

        # Check for duplicate content
        issues.extend(self._check_duplicates(target))

        # Calculate statistics
        error_count = sum(1 for i in issues if i.severity == "error")
        warning_count = sum(1 for i in issues if i.severity == "warning")
        info_count = sum(1 for i in issues if i.severity == "info")

        # Group by type
        by_type: Dict[str, int] = {}
        for issue in issues:
            key = issue.issue_type.value
            by_type[key] = by_type.get(key, 0) + 1

        # Determine if translation is valid (no errors)
        is_valid = error_count == 0

        return QualityReport(
            total_issues=len(issues),
            error_count=error_count,
            warning_count=warning_count,
            info_count=info_count,
            issues=issues,
            by_type=by_type,
            is_valid=is_valid
        )

    def _check_empty(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for empty translations.

        Returns:
            A single error-level issue when ``target`` is empty or contains
            only whitespace, otherwise an empty list.
        """
        issues: List[QualityIssue] = []

        if not target or not target.strip():
            issues.append(QualityIssue(
                issue_type=QualityIssueType.EMPTY_TRANSLATION,
                location="entire text",
                message="Translation is empty",
                severity="error",
                source_text=source[:100],
                target_text=target,
            ))

        return issues

    def _check_line_count(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for line count mismatches.

        Only non-blank lines are counted. A target/source line ratio outside
        [0.8, 1.2] is reported; it is a warning while the ratio stays strictly
        between 0.5 and 1.5 and an error otherwise. Nothing is reported when
        the source has no non-blank lines.
        """
        issues: List[QualityIssue] = []

        source_lines = source.split('\n')
        target_lines = target.split('\n')

        # Filter out empty lines for comparison
        source_count = sum(1 for line in source_lines if line.strip())
        target_count = sum(1 for line in target_lines if line.strip())

        # Allow some tolerance (±20%)
        if source_count > 0:
            ratio = target_count / source_count
            if ratio < 0.8 or ratio > 1.2:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.LINE_COUNT_MISMATCH,
                    location="entire text",
                    message=f"Line count mismatch: source has {source_count} lines, target has {target_count} lines",
                    severity="warning" if 0.5 < ratio < 1.5 else "error",
                ))

        return issues

    def _check_untranslated_terms(
        self,
        source: str,
        target: str,
        source_lang: str
    ) -> List[QualityIssue]:
        """
        Check for untranslated terms.

        For ``source_lang == "zh"`` the number of Chinese characters left in
        the target is compared with the number in the source; more than 10%
        yields one warning. Independently of the language, every non-empty
        term from ``self.untranslated_terms`` that occurs in the target
        yields one warning.

        Returns:
            List of warning-level issues; term issues are emitted in sorted
            term order so that reports are reproducible across runs.
        """
        issues: List[QualityIssue] = []

        # Use language-specific checks
        if source_lang == "zh":
            # Check for remaining Chinese characters
            chinese_matches = _CHINESE_RUN_PATTERN.findall(target)

            if chinese_matches:
                # Count how many Chinese characters remain
                total_chinese = sum(len(m) for m in chinese_matches)
                source_chinese = sum(
                    len(m) for m in _CHINESE_RUN_PATTERN.findall(source)
                )

                # Without Chinese in the source there is no baseline to
                # compute a ratio against, so nothing is reported.
                if source_chinese > 0:
                    untranslated_ratio = total_chinese / source_chinese
                    if untranslated_ratio > 0.1:  # More than 10% untranslated
                        issues.append(QualityIssue(
                            issue_type=QualityIssueType.UNTRANSLATED_TERM,
                            location="scattered",
                            message=f"Found {total_chinese} Chinese characters in translation ({untranslated_ratio:.1%} of source)",
                            severity="warning",
                            source_text="",
                            target_text=" ".join(chinese_matches[:10]),  # Show first 10
                        ))

        # Check for specific untranslated terms. Sets of strings iterate in a
        # hash-dependent order, so sort to keep the issue order stable.
        for term in sorted(self.untranslated_terms):
            # An empty string is "in" every target and carries no information.
            if not term:
                continue
            if term in target:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.UNTRANSLATED_TERM,
                    location="scattered",
                    message=f"Source term '{term}' found untranslated",
                    severity="warning",
                    source_text=term,
                    target_text=term,
                ))

        return issues

    def _check_length_ratio(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for abnormal length ratios.

        Lengths are measured in characters after stripping surrounding
        whitespace. A ratio below ``self.min_length_ratio`` is a warning, or
        an error once it is 0.1 or less; a ratio above
        ``self.max_length_ratio`` is a warning. An empty source is skipped.
        """
        issues: List[QualityIssue] = []

        source_len = len(source.strip())
        target_len = len(target.strip())

        if source_len == 0:
            return issues

        ratio = target_len / source_len

        if ratio < self.min_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too short: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning" if ratio > 0.1 else "error",
            ))
        elif ratio > self.max_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too long: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning",
            ))

        return issues

    def _check_duplicates(self, text: str) -> List[QualityIssue]:
        """
        Check for duplicate content.

        The text is split on blank lines into stripped, non-empty paragraphs.
        Every paragraph that exactly repeats an earlier one yields an
        info-level issue naming the zero-based index of its FIRST occurrence.
        """
        issues: List[QualityIssue] = []

        # Split into paragraphs and check for duplicates
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # Maps paragraph text to the index where it first appeared
        first_seen: Dict[str, int] = {}
        for i, para in enumerate(paragraphs):
            if para in first_seen:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.DUPLICATE_CONTENT,
                    location=f"paragraph {i}",
                    message=f"Duplicate content (first seen at paragraph {first_seen[para]})",
                    severity="info",
                    source_text="",
                    target_text=(para[:100] + "...") if len(para) > 100 else para,
                ))
            else:
                # Record only the first occurrence; overwriting here would
                # make later repeats point at the previous repeat instead.
                first_seen[para] = i

        return issues

    def check_batch(
        self,
        sources: List[str],
        targets: List[str],
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> List[QualityReport]:
        """
        Check multiple translation pairs.

        Args:
            sources: List of source texts
            targets: List of target texts
            source_lang: Source language code
            tgt_lang: Target language code

        Returns:
            List of QualityReport objects, one per pair, in input order

        Raises:
            ValueError: If ``sources`` and ``targets`` differ in length
        """
        if len(sources) != len(targets):
            raise ValueError("Source and target lists must have the same length")

        return [
            self.check_translation(s, t, source_lang, tgt_lang)
            for s, t in zip(sources, targets)
        ]

    def get_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
        """
        Get summary statistics from multiple reports.

        Args:
            reports: List of QualityReport objects

        Returns:
            Dictionary with summary statistics. ``validity_rate`` is a
            percentage (float) and is 100.0 when ``reports`` is empty.
        """
        total_reports = len(reports)
        valid_reports = sum(1 for r in reports if r.is_valid)
        total_issues = sum(r.total_issues for r in reports)
        total_errors = sum(r.error_count for r in reports)
        total_warnings = sum(r.warning_count for r in reports)
        total_info = sum(r.info_count for r in reports)

        # Aggregate by type
        by_type: Dict[str, int] = {}
        for report in reports:
            for issue_type, count in report.by_type.items():
                by_type[issue_type] = by_type.get(issue_type, 0) + count

        return {
            "total_translations": total_reports,
            "valid_translations": valid_reports,
            "invalid_translations": total_reports - valid_reports,
            "validity_rate": (valid_reports / total_reports * 100) if total_reports > 0 else 100.0,
            "total_issues": total_issues,
            "total_errors": total_errors,
            "total_warnings": total_warnings,
            "total_info": total_info,
            "by_type": by_type,
        }