|
|
@@ -0,0 +1,435 @@
|
|
|
+"""
|
|
|
+Translation quality checker module.
|
|
|
+
|
|
|
+This module provides functionality for checking translation quality
|
|
|
+by detecting common issues like missing content, untranslated terms,
|
|
|
+abnormal lengths, and duplicate content.
|
|
|
+"""
|
|
|
+
|
|
|
+import re
|
|
|
+import logging
|
|
|
+from dataclasses import dataclass, field
|
|
|
+from typing import List, Dict, Any, Optional, Set
|
|
|
+from enum import Enum
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+
|
|
|
class QualityIssueType(str, Enum):
    """Categories of problems that a quality check can report.

    Each member is also a ``str``, so it compares equal to its plain
    string value (for example ``"missing_content"``).
    """

    # Content-level categories.
    MISSING_CONTENT = "missing_content"
    UNTRANSLATED_TERM = "untranslated_term"
    ABNORMAL_LENGTH = "abnormal_length"
    DUPLICATE_CONTENT = "duplicate_content"

    # Whole-text structural categories.
    LINE_COUNT_MISMATCH = "line_count_mismatch"
    EMPTY_TRANSLATION = "empty_translation"
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class QualityIssue:
    """
    A single problem detected while checking a translation.

    Attributes:
        issue_type: Category of the problem
        location: Free-form description of where the problem is
            (e.g., "paragraph 5", "line 10")
        message: Human-readable explanation
        severity: "error", "warning" or "info"; defaults to "warning"
        source_text: Source-side text related to the problem (may be empty)
        target_text: Translation-side text related to the problem (may be empty)
    """

    issue_type: QualityIssueType
    location: str
    message: str
    severity: str = "warning"
    source_text: str = ""
    target_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of this issue for serialization.

        The issue type is flattened to its ``.value`` and both text fields
        are cut to at most 200 characters.
        """
        preview_limit = 200  # keep serialized payloads small

        serialized: Dict[str, Any] = {"issue_type": self.issue_type.value}
        for attr_name in ("location", "message", "severity"):
            serialized[attr_name] = getattr(self, attr_name)
        serialized["source_text"] = self.source_text[:preview_limit]
        serialized["target_text"] = self.target_text[:preview_limit]
        return serialized
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class QualityReport:
    """
    Outcome of a quality check over one translation.

    Attributes:
        total_issues: Number of issues found overall
        error_count: Number of issues with severity "error"
        warning_count: Number of issues with severity "warning"
        info_count: Number of issues with severity "info"
        issues: Every issue found, in the order it was recorded
        by_type: Issue count keyed by issue-type string
        is_valid: Whether the translation passed the check
    """

    total_issues: int
    error_count: int
    warning_count: int
    info_count: int
    issues: List[QualityIssue] = field(default_factory=list)
    by_type: Dict[str, int] = field(default_factory=dict)
    is_valid: bool = True

    def _with_severity(self, level: str) -> List[QualityIssue]:
        """Return the issues whose severity equals ``level``, order preserved."""
        matching = []
        for entry in self.issues:
            if entry.severity == level:
                matching.append(entry)
        return matching

    @property
    def errors(self) -> List[QualityIssue]:
        """Issues whose severity is "error"."""
        return self._with_severity("error")

    @property
    def warnings(self) -> List[QualityIssue]:
        """Issues whose severity is "warning"."""
        return self._with_severity("warning")

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the report, with each issue serialized."""
        payload: Dict[str, Any] = {
            "total_issues": self.total_issues,
            "error_count": self.error_count,
            "warning_count": self.warning_count,
            "info_count": self.info_count,
            "is_valid": self.is_valid,
            "by_type": self.by_type,
        }
        payload["issues"] = [entry.to_dict() for entry in self.issues]
        return payload

    def generate_report(self) -> str:
        """
        Render the report as newline-joined text.

        The output has a summary header, one line per issue type (sorted
        by type name), and, when there are issues, three lines per issue.

        Returns:
            Formatted report string
        """
        report_lines = [
            "=== Translation Quality Report ===",
            f"Valid: {self.is_valid}",
            f"Total Issues: {self.total_issues}",
            f" Errors: {self.error_count}",
            f" Warnings: {self.warning_count}",
            f" Info: {self.info_count}",
            "",
            "Issues by Type:",
        ]
        report_lines.extend(
            f" {type_name}: {count}"
            for type_name, count in sorted(self.by_type.items())
        )

        if self.issues:
            report_lines += ["", "Detailed Issues:"]

        for entry in self.issues:
            report_lines += [
                f" [{entry.severity.upper()}] {entry.issue_type.value}",
                f" Location: {entry.location}",
                f" Message: {entry.message}",
            ]

        return "\n".join(report_lines)
|
|
|
+
|
|
|
+
|
|
|
class QualityChecker:
    """
    Checker for translation quality.

    Runs a fixed set of heuristics over a source/target pair (empty
    output, line-count drift, leftover source-language text, abnormal
    length ratio, repeated paragraphs) and collects the findings in a
    QualityReport.
    """

    # Default thresholds for quality checks
    DEFAULT_MIN_LENGTH_RATIO = 0.3  # Target length should be at least 30% of source
    DEFAULT_MAX_LENGTH_RATIO = 3.0  # Target length should be at most 3x source

    # Runs of characters in U+4E00..U+9FFF; compiled once for the class
    # instead of on every call.
    _CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]+')

    def __init__(
        self,
        min_length_ratio: float = DEFAULT_MIN_LENGTH_RATIO,
        max_length_ratio: float = DEFAULT_MAX_LENGTH_RATIO,
        untranslated_terms: Optional[Set[str]] = None
    ):
        """
        Initialize the quality checker.

        Args:
            min_length_ratio: Minimum acceptable length ratio (target/source)
            max_length_ratio: Maximum acceptable length ratio (target/source)
            untranslated_terms: Terms that are reported whenever they appear
                verbatim in a translation; None means no custom terms
        """
        self.min_length_ratio = min_length_ratio
        self.max_length_ratio = max_length_ratio
        self.untranslated_terms = untranslated_terms or set()

        # Default terms that should always be translated from Chinese.
        # NOTE(review): no check in this class reads this set yet; confirm
        # whether it is meant to feed the untranslated-term check.
        self._default_chinese_terms = {
            "的", "了", "是", "在", "和", "与", "或", "但", "而", "如果", "因为",
            "所以", "然后", "之后", "之前", "已经", "还在", "可以", "应该", "需要",
            "想要", "希望", "觉得", "认为", "知道", "看到", "听到", "说到",
        }

    def check_translation(
        self,
        source: str,
        target: str,
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> QualityReport:
        """
        Perform a comprehensive quality check on translation.

        The individual checks run independently of each other, so one
        underlying problem (for example an empty target) may be reported
        by more than one of them.

        Args:
            source: Original source text
            target: Translated text
            source_lang: Source language code; "zh" enables the
                leftover-Chinese-character check
            tgt_lang: Target language code (accepted but not used by any
                check in this class)

        Returns:
            QualityReport with all issues found; ``is_valid`` is True only
            when no issue has severity "error"
        """
        issues: List[QualityIssue] = []

        # Run every check in a fixed order so the report is stable.
        issues.extend(self._check_empty(source, target))
        issues.extend(self._check_line_count(source, target))
        issues.extend(self._check_untranslated_terms(source, target, source_lang))
        issues.extend(self._check_length_ratio(source, target))
        issues.extend(self._check_duplicates(target))

        # Tally by severity.
        error_count = sum(1 for issue in issues if issue.severity == "error")
        warning_count = sum(1 for issue in issues if issue.severity == "warning")
        info_count = sum(1 for issue in issues if issue.severity == "info")

        # Tally by issue type.
        by_type: Dict[str, int] = {}
        for issue in issues:
            type_key = issue.issue_type.value
            by_type[type_key] = by_type.get(type_key, 0) + 1

        return QualityReport(
            total_issues=len(issues),
            error_count=error_count,
            warning_count=warning_count,
            info_count=info_count,
            issues=issues,
            by_type=by_type,
            is_valid=error_count == 0,
        )

    def _check_empty(self, source: str, target: str) -> List[QualityIssue]:
        """Report an error when the target is empty or whitespace-only."""
        issues = []

        if not target or not target.strip():
            issues.append(QualityIssue(
                issue_type=QualityIssueType.EMPTY_TRANSLATION,
                location="entire text",
                message="Translation is empty",
                severity="error",
                source_text=source[:100],
                target_text=target,
            ))

        return issues

    def _check_line_count(self, source: str, target: str) -> List[QualityIssue]:
        """
        Compare the number of non-blank lines in source and target.

        A target/source ratio outside 0.8..1.2 is reported: as a warning
        while the ratio is strictly between 0.5 and 1.5, otherwise as an
        error. Nothing is reported when the source has no non-blank lines.
        """
        issues = []

        # Blank lines are ignored on both sides.
        source_count = sum(1 for line in source.split('\n') if line.strip())
        target_count = sum(1 for line in target.split('\n') if line.strip())

        # Allow some tolerance (±20%)
        if source_count > 0:
            ratio = target_count / source_count
            if ratio < 0.8 or ratio > 1.2:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.LINE_COUNT_MISMATCH,
                    location="entire text",
                    message=f"Line count mismatch: source has {source_count} lines, target has {target_count} lines",
                    severity="warning" if 0.5 < ratio < 1.5 else "error",
                ))

        return issues

    def _check_untranslated_terms(
        self,
        source: str,
        target: str,
        source_lang: str
    ) -> List[QualityIssue]:
        """
        Look for source-language text left in the translation.

        For ``source_lang == "zh"`` a warning is raised when the Chinese
        characters remaining in the target exceed 10% of those in the
        source. Independently, each configured term found verbatim in the
        target yields one warning.
        """
        issues = []

        # Use language-specific checks
        if source_lang == "zh":
            chinese_matches = self._CHINESE_PATTERN.findall(target)

            if chinese_matches:
                # Count how many Chinese characters remain
                total_chinese = sum(len(m) for m in chinese_matches)
                source_chinese = sum(
                    len(m) for m in self._CHINESE_PATTERN.findall(source)
                )

                # With no Chinese in the source there is no baseline for
                # a ratio, so nothing is reported in that case.
                if source_chinese > 0:
                    untranslated_ratio = total_chinese / source_chinese
                    if untranslated_ratio > 0.1:  # More than 10% untranslated
                        issues.append(QualityIssue(
                            issue_type=QualityIssueType.UNTRANSLATED_TERM,
                            location="scattered",
                            message=f"Found {total_chinese} Chinese characters in translation ({untranslated_ratio:.1%} of source)",
                            severity="warning",
                            source_text="",
                            target_text=" ".join(chinese_matches[:10]),  # Show first 10
                        ))

        # Check for specific untranslated terms. Sorting makes the issue
        # order reproducible (set iteration order is not); an empty term
        # is skipped because "" is a substring of every string.
        for term in sorted(self.untranslated_terms):
            if term and term in target:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.UNTRANSLATED_TERM,
                    location="scattered",
                    message=f"Source term '{term}' found untranslated",
                    severity="warning",
                    source_text=term,
                    target_text=term,
                ))

        return issues

    def _check_length_ratio(self, source: str, target: str) -> List[QualityIssue]:
        """
        Compare stripped character lengths of target and source.

        A ratio below ``min_length_ratio`` is a warning, or an error when
        the ratio is 0.1 or less; a ratio above ``max_length_ratio`` is a
        warning. An empty (after stripping) source is never reported.
        """
        issues = []

        source_len = len(source.strip())
        target_len = len(target.strip())

        if source_len == 0:
            return issues

        ratio = target_len / source_len

        if ratio < self.min_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too short: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning" if ratio > 0.1 else "error",
            ))
        elif ratio > self.max_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too long: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning",
            ))

        return issues

    def _check_duplicates(self, text: str) -> List[QualityIssue]:
        """
        Report paragraphs that repeat an earlier paragraph exactly.

        Paragraphs are separated by blank lines (``"\\n\\n"``) and compared
        after stripping. Paragraph numbers in the output are zero-based.
        """
        issues = []

        # Split into paragraphs and check for duplicates
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # Index of the FIRST occurrence of each paragraph. It is recorded
        # once and never overwritten, so every later repeat points back to
        # the original rather than to the previous repeat.
        first_seen: Dict[str, int] = {}
        for i, para in enumerate(paragraphs):
            if para in first_seen:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.DUPLICATE_CONTENT,
                    location=f"paragraph {i}",
                    message=f"Duplicate content (first seen at paragraph {first_seen[para]})",
                    severity="info",
                    source_text="",
                    target_text=para[:100] + "..." if len(para) > 100 else para,
                ))
            else:
                first_seen[para] = i

        return issues

    def check_batch(
        self,
        sources: List[str],
        targets: List[str],
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> List[QualityReport]:
        """
        Check multiple translation pairs.

        Args:
            sources: List of source texts
            targets: List of target texts, aligned with ``sources``
            source_lang: Source language code
            tgt_lang: Target language code

        Returns:
            List of QualityReport objects, one per pair, in input order

        Raises:
            ValueError: If ``sources`` and ``targets`` differ in length
        """
        if len(sources) != len(targets):
            raise ValueError("Source and target lists must have the same length")

        return [
            self.check_translation(s, t, source_lang, tgt_lang)
            for s, t in zip(sources, targets)
        ]

    def get_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
        """
        Get summary statistics from multiple reports.

        Args:
            reports: List of QualityReport objects

        Returns:
            Dictionary with the number of translations (total, valid,
            invalid), ``validity_rate`` as a float percentage (100.0 when
            ``reports`` is empty), issue totals overall and per severity,
            and ``by_type`` counts summed across reports
        """
        total_reports = len(reports)
        valid_reports = sum(1 for r in reports if r.is_valid)

        # Aggregate by type
        by_type: Dict[str, int] = {}
        for report in reports:
            for issue_type, count in report.by_type.items():
                by_type[issue_type] = by_type.get(issue_type, 0) + count

        # Always a float, including the empty case.
        validity_rate = (
            valid_reports / total_reports * 100 if total_reports > 0 else 100.0
        )

        return {
            "total_translations": total_reports,
            "valid_translations": valid_reports,
            "invalid_translations": total_reports - valid_reports,
            "validity_rate": validity_rate,
            "total_issues": sum(r.total_issues for r in reports),
            "total_errors": sum(r.error_count for r in reports),
            "total_warnings": sum(r.warning_count for r in reports),
            "total_info": sum(r.info_count for r in reports),
            "by_type": by_type,
        }
|