| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435 |
- """
- Translation quality checker module.
- This module provides functionality for checking translation quality
- by detecting common issues like missing content, untranslated terms,
- abnormal lengths, and duplicate content.
- """
- import re
- import logging
- from dataclasses import dataclass, field
- from typing import List, Dict, Any, Optional, Set
- from enum import Enum
- logger = logging.getLogger(__name__)
class QualityIssueType(str, Enum):
    """Closed set of categories a translation quality issue can belong to.

    The enum also derives from ``str``, so every member is a string that
    compares equal to its value (e.g. ``"empty_translation"``).
    """

    # Content present in the source is absent from the translation.
    MISSING_CONTENT = "missing_content"
    # Source-language text or a configured term is still present in the target.
    UNTRANSLATED_TERM = "untranslated_term"
    # Target/source length ratio is outside the accepted range.
    ABNORMAL_LENGTH = "abnormal_length"
    # The same paragraph occurs more than once in the translation.
    DUPLICATE_CONTENT = "duplicate_content"
    # Source and target have noticeably different numbers of non-empty lines.
    LINE_COUNT_MISMATCH = "line_count_mismatch"
    # The translation is empty or contains only whitespace.
    EMPTY_TRANSLATION = "empty_translation"
@dataclass
class QualityIssue:
    """
    A single finding produced while checking a translation.

    Attributes:
        issue_type: Category of the finding
        location: Free-form description of where the finding occurs
            (e.g., "paragraph 5", "line 10")
        message: Human-readable explanation of the finding
        severity: One of "error", "warning" or "info"; defaults to "warning"
        source_text: Source-side text related to the finding (may be empty)
        target_text: Translated text related to the finding (may be empty)
    """

    issue_type: QualityIssueType
    location: str
    message: str
    severity: str = "warning"
    source_text: str = ""
    target_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """
        Return a plain-dict view of the issue for serialization.

        The issue type is emitted as its enum value, and both text fields
        are cut to their first 200 characters to keep the payload small.

        Returns:
            Dictionary with the keys ``issue_type``, ``location``,
            ``message``, ``severity``, ``source_text`` and ``target_text``
        """
        text_limit = 200  # Truncate long text
        return dict(
            issue_type=self.issue_type.value,
            location=self.location,
            message=self.message,
            severity=self.severity,
            source_text=self.source_text[:text_limit],
            target_text=self.target_text[:text_limit],
        )
@dataclass
class QualityReport:
    """
    Report of quality check results.

    The counters are stored as given by whoever builds the report; they are
    not recomputed from ``issues``.

    Attributes:
        total_issues: Total number of issues found
        error_count: Number of error-level issues
        warning_count: Number of warning-level issues
        info_count: Number of info-level issues
        issues: List of all issues found
        by_type: Breakdown of issues by type (issue type value -> count)
        is_valid: Whether translation passed quality check
    """

    total_issues: int
    error_count: int
    warning_count: int
    info_count: int
    issues: List[QualityIssue] = field(default_factory=list)
    by_type: Dict[str, int] = field(default_factory=dict)
    is_valid: bool = True

    def _with_severity(self, severity: str) -> List[QualityIssue]:
        """Return the issues whose ``severity`` equals the given level."""
        return [issue for issue in self.issues if issue.severity == severity]

    @property
    def errors(self) -> List[QualityIssue]:
        """Get all error-level issues."""
        return self._with_severity("error")

    @property
    def warnings(self) -> List[QualityIssue]:
        """Get all warning-level issues."""
        return self._with_severity("warning")

    @property
    def infos(self) -> List[QualityIssue]:
        """Get all info-level issues (counterpart of ``info_count``)."""
        return self._with_severity("info")

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for serialization.

        Returns:
            Dictionary with the counters, the validity flag, a copy of the
            per-type breakdown and every issue converted via its own
            ``to_dict``
        """
        return {
            "total_issues": self.total_issues,
            "error_count": self.error_count,
            "warning_count": self.warning_count,
            "info_count": self.info_count,
            "is_valid": self.is_valid,
            # Copy so that edits to the serialized form cannot alter the report.
            "by_type": dict(self.by_type),
            "issues": [issue.to_dict() for issue in self.issues],
        }

    def generate_report(self) -> str:
        """
        Generate a human-readable report.

        The report starts with the validity flag and the counters, lists the
        per-type breakdown sorted by type name, and, when there are issues,
        ends with one severity/location/message entry per issue.

        Returns:
            Formatted report string (lines joined with newlines)
        """
        lines = [
            "=== Translation Quality Report ===",
            f"Valid: {self.is_valid}",
            f"Total Issues: {self.total_issues}",
            f" Errors: {self.error_count}",
            f" Warnings: {self.warning_count}",
            f" Info: {self.info_count}",
            "",
            "Issues by Type:",
        ]
        lines.extend(
            f" {issue_type}: {count}"
            for issue_type, count in sorted(self.by_type.items())
        )
        if self.issues:
            lines.append("")
            lines.append("Detailed Issues:")
            for issue in self.issues:
                lines.append(f" [{issue.severity.upper()}] {issue.issue_type.value}")
                lines.append(f" Location: {issue.location}")
                lines.append(f" Message: {issue.message}")
        return "\n".join(lines)
class QualityChecker:
    """
    Checker for translation quality.

    This class checks translations for common quality issues: empty output,
    line count mismatches, untranslated terms, abnormal lengths, and
    duplicate paragraphs. Each check returns a list of ``QualityIssue``
    objects; ``check_translation`` runs them all and wraps the result in a
    ``QualityReport``.
    """

    # Default thresholds for quality checks
    DEFAULT_MIN_LENGTH_RATIO = 0.3  # Target length should be at least 30% of source
    DEFAULT_MAX_LENGTH_RATIO = 3.0  # Target length should be at most 3x source

    # Runs of characters in the range U+4E00..U+9FFF. Compiled once here
    # instead of on every call of the untranslated-term check.
    _CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]+')

    def __init__(
        self,
        min_length_ratio: float = DEFAULT_MIN_LENGTH_RATIO,
        max_length_ratio: float = DEFAULT_MAX_LENGTH_RATIO,
        untranslated_terms: Optional[Set[str]] = None
    ):
        """
        Initialize the quality checker.

        Args:
            min_length_ratio: Minimum acceptable length ratio (target/source)
            max_length_ratio: Maximum acceptable length ratio (target/source)
            untranslated_terms: Set of terms that should always be translated;
                finding one of them verbatim in a translation is reported
        """
        self.min_length_ratio = min_length_ratio
        self.max_length_ratio = max_length_ratio
        self.untranslated_terms = untranslated_terms or set()
        # Default terms that should always be translated from Chinese.
        # NOTE(review): none of the checks in this class reads this set yet.
        self._default_chinese_terms = {
            "的", "了", "是", "在", "和", "与", "或", "但", "而", "如果", "因为",
            "所以", "然后", "之后", "之前", "已经", "还在", "可以", "应该", "需要",
            "想要", "希望", "觉得", "认为", "知道", "看到", "听到", "说到",
        }

    def check_translation(
        self,
        source: str,
        target: str,
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> QualityReport:
        """
        Perform a comprehensive quality check on translation.

        Runs, in order, the empty, line-count, untranslated-term,
        length-ratio and duplicate checks, then tallies the findings.
        The translation is considered valid when no error-level issue exists.

        Args:
            source: Original source text (``None`` is treated as empty)
            target: Translated text (``None`` is treated as empty)
            source_lang: Source language code
            tgt_lang: Target language code (accepted but not used by any
                check in this method)

        Returns:
            QualityReport with all issues found
        """
        # A missing text is handled like an empty one so that the string
        # based checks below report it instead of raising AttributeError.
        source = source or ""
        target = target or ""

        issues: List[QualityIssue] = []
        # Check for empty translation
        issues.extend(self._check_empty(source, target))
        # Check line count mismatch
        issues.extend(self._check_line_count(source, target))
        # Check for untranslated terms
        issues.extend(self._check_untranslated_terms(source, target, source_lang))
        # Check for abnormal length
        issues.extend(self._check_length_ratio(source, target))
        # Check for duplicate content
        issues.extend(self._check_duplicates(target))

        # Calculate statistics
        error_count = sum(1 for i in issues if i.severity == "error")
        warning_count = sum(1 for i in issues if i.severity == "warning")
        info_count = sum(1 for i in issues if i.severity == "info")

        # Group by type
        by_type: Dict[str, int] = {}
        for issue in issues:
            key = issue.issue_type.value
            by_type[key] = by_type.get(key, 0) + 1

        return QualityReport(
            total_issues=len(issues),
            error_count=error_count,
            warning_count=warning_count,
            info_count=info_count,
            issues=issues,
            by_type=by_type,
            # Determine if translation is valid (no errors)
            is_valid=error_count == 0,
        )

    def _check_empty(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for empty translations.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            A single error-level issue when the target is missing or
            whitespace-only, otherwise an empty list
        """
        issues: List[QualityIssue] = []
        if not target or not target.strip():
            issues.append(QualityIssue(
                issue_type=QualityIssueType.EMPTY_TRANSLATION,
                location="entire text",
                message="Translation is empty",
                severity="error",
                source_text=(source or "")[:100],
                # Keep the field a string even if None was passed in.
                target_text=target or "",
            ))
        return issues

    def _check_line_count(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for line count mismatches.

        Only non-blank lines are counted. A target/source line ratio outside
        0.8..1.2 is reported; it is a warning while the ratio stays strictly
        between 0.5 and 1.5 and an error otherwise. Nothing is reported when
        the source has no non-blank line.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            List with at most one LINE_COUNT_MISMATCH issue
        """
        issues: List[QualityIssue] = []
        # Filter out empty lines for comparison
        source_count = sum(1 for line in source.split('\n') if line.strip())
        target_count = sum(1 for line in target.split('\n') if line.strip())

        if source_count == 0:
            return issues

        # Allow some tolerance (±20%)
        ratio = target_count / source_count
        if ratio < 0.8 or ratio > 1.2:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.LINE_COUNT_MISMATCH,
                location="entire text",
                message=f"Line count mismatch: source has {source_count} lines, target has {target_count} lines",
                severity="warning" if 0.5 < ratio < 1.5 else "error",
            ))
        return issues

    def _check_untranslated_terms(
        self,
        source: str,
        target: str,
        source_lang: str
    ) -> List[QualityIssue]:
        """
        Check for untranslated terms.

        For ``source_lang == "zh"`` the number of characters in U+4E00..U+9FFF
        left in the target is compared with the number in the source; more
        than 10% yields a warning. Independently of the language, every
        configured term that appears verbatim in the target yields a warning.

        Args:
            source: Original source text
            target: Translated text
            source_lang: Source language code

        Returns:
            List of UNTRANSLATED_TERM issues (possibly empty)
        """
        issues: List[QualityIssue] = []

        # Use language-specific checks
        if source_lang == "zh":
            # Check for remaining Chinese characters
            chinese_matches = self._CHINESE_PATTERN.findall(target)
            if chinese_matches:
                # Count how many Chinese characters remain
                total_chinese = sum(len(m) for m in chinese_matches)
                source_chinese = sum(
                    len(m) for m in self._CHINESE_PATTERN.findall(source)
                )
                # Without such characters in the source there is no ratio.
                if source_chinese > 0:
                    untranslated_ratio = total_chinese / source_chinese
                    if untranslated_ratio > 0.1:  # More than 10% untranslated
                        issues.append(QualityIssue(
                            issue_type=QualityIssueType.UNTRANSLATED_TERM,
                            location="scattered",
                            message=f"Found {total_chinese} Chinese characters in translation ({untranslated_ratio:.1%} of source)",
                            severity="warning",
                            source_text="",
                            target_text=" ".join(chinese_matches[:10]),  # Show first 10
                        ))

        # Check for specific untranslated terms. Sorted so that the order of
        # the reported issues does not depend on set iteration order.
        for term in sorted(self.untranslated_terms):
            # An empty term is contained in every string; skip it rather
            # than report a false positive.
            if term and term in target:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.UNTRANSLATED_TERM,
                    location="scattered",
                    message=f"Source term '{term}' found untranslated",
                    severity="warning",
                    source_text=term,
                    target_text=term,
                ))
        return issues

    def _check_length_ratio(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for abnormal length ratios.

        Lengths are measured on the stripped texts. A ratio below
        ``min_length_ratio`` is reported as too short (warning above 0.1,
        error otherwise); a ratio above ``max_length_ratio`` is reported as
        too long (warning). An empty source is never reported.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            List with at most one ABNORMAL_LENGTH issue
        """
        issues: List[QualityIssue] = []
        source_len = len(source.strip())
        target_len = len(target.strip())
        if source_len == 0:
            return issues

        ratio = target_len / source_len
        if ratio < self.min_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too short: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning" if ratio > 0.1 else "error",
            ))
        elif ratio > self.max_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too long: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning",
            ))
        return issues

    def _check_duplicates(self, text: str) -> List[QualityIssue]:
        """
        Check for duplicate content.

        The text is split on blank lines into paragraphs; every repetition of
        an earlier paragraph (compared after stripping) is reported as an
        info-level issue that names the index of the first occurrence.

        Args:
            text: Text to inspect (the translation)

        Returns:
            One DUPLICATE_CONTENT issue per repeated paragraph
        """
        issues: List[QualityIssue] = []
        # Split into paragraphs and check for duplicates
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # Paragraph text -> index of its first occurrence. The index is
        # stored only once so that every later copy reports the true first
        # position rather than the previous copy.
        first_seen: Dict[str, int] = {}
        for index, para in enumerate(paragraphs):
            if para not in first_seen:
                first_seen[para] = index
                continue
            issues.append(QualityIssue(
                issue_type=QualityIssueType.DUPLICATE_CONTENT,
                location=f"paragraph {index}",
                message=f"Duplicate content (first seen at paragraph {first_seen[para]})",
                severity="info",
                source_text="",
                target_text=(para[:100] + "...") if len(para) > 100 else para,
            ))
        return issues

    def check_batch(
        self,
        sources: List[str],
        targets: List[str],
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> List[QualityReport]:
        """
        Check multiple translation pairs.

        Args:
            sources: List of source texts
            targets: List of target texts
            source_lang: Source language code
            tgt_lang: Target language code

        Returns:
            List of QualityReport objects, one per pair, in input order

        Raises:
            ValueError: If the two lists differ in length
        """
        if len(sources) != len(targets):
            raise ValueError("Source and target lists must have the same length")
        return [
            self.check_translation(s, t, source_lang, tgt_lang)
            for s, t in zip(sources, targets)
        ]

    def get_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
        """
        Get summary statistics from multiple reports.

        Args:
            reports: List of QualityReport objects

        Returns:
            Dictionary with the number of (valid/invalid) translations, the
            validity rate as a percentage (100.0 for an empty list), the
            issue totals per severity and the aggregated per-type counts
        """
        total_reports = len(reports)
        valid_reports = sum(1 for r in reports if r.is_valid)

        # Aggregate by type
        by_type: Dict[str, int] = {}
        for report in reports:
            for issue_type, count in report.by_type.items():
                by_type[issue_type] = by_type.get(issue_type, 0) + count

        return {
            "total_translations": total_reports,
            "valid_translations": valid_reports,
            "invalid_translations": total_reports - valid_reports,
            # Always a float, including the "nothing to check" case.
            "validity_rate": (valid_reports / total_reports * 100) if total_reports > 0 else 100.0,
            "total_issues": sum(r.total_issues for r in reports),
            "total_errors": sum(r.error_count for r in reports),
            "total_warnings": sum(r.warning_count for r in reports),
            "total_info": sum(r.info_count for r in reports),
            "by_type": by_type,
        }
|