223-template-236
/
blank
ответвлено от 137-template-113/blank


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
							"""
Translation quality checker module.

This module provides functionality for checking translation quality
by detecting common issues like missing content, untranslated terms,
abnormal lengths, and duplicate content.
"""

import re
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set
from enum import Enum

logger = logging.getLogger(__name__)


class QualityIssueType(str, Enum):
    """
    Categories of problems a quality check can report.

    Every member is also a ``str`` (via the mixin), so a member compares
    equal to its plain string value.
    """

    # Content categories
    MISSING_CONTENT = "missing_content"
    UNTRANSLATED_TERM = "untranslated_term"

    # Size / repetition categories
    ABNORMAL_LENGTH = "abnormal_length"
    DUPLICATE_CONTENT = "duplicate_content"

    # Structural categories
    LINE_COUNT_MISMATCH = "line_count_mismatch"
    EMPTY_TRANSLATION = "empty_translation"


@dataclass
class QualityIssue:
    """
    A single problem detected in a translation.

    Attributes:
        issue_type: Category of the problem
        location: Where the problem is (e.g., "paragraph 5", "line 10")
        message: Description intended for human readers
        severity: One of "error", "warning" or "info"
        source_text: Source-side text related to the problem
        target_text: Translation-side text related to the problem
    """

    issue_type: QualityIssueType
    location: str
    message: str
    severity: str = "warning"
    source_text: str = ""
    target_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """
        Build a plain-dictionary view of this issue for serialization.

        Returns:
            Dictionary holding the enum value of the issue type, the
            location, message and severity, and the source/target texts
            cut to their first 200 characters.
        """
        # Cap applied to both text excerpts so the payload stays small
        excerpt_limit = 200

        payload: Dict[str, Any] = {}
        payload["issue_type"] = self.issue_type.value
        payload["location"] = self.location
        payload["message"] = self.message
        payload["severity"] = self.severity
        payload["source_text"] = self.source_text[:excerpt_limit]
        payload["target_text"] = self.target_text[:excerpt_limit]
        return payload


@dataclass
class QualityReport:
    """
    Outcome of a quality check on one translation.

    Attributes:
        total_issues: How many issues were found overall
        error_count: How many issues have severity "error"
        warning_count: How many issues have severity "warning"
        info_count: How many issues have severity "info"
        issues: Every issue that was found
        by_type: Number of issues per issue-type value
        is_valid: Whether the translation passed the quality check
    """

    total_issues: int
    error_count: int
    warning_count: int
    info_count: int
    issues: List[QualityIssue] = field(default_factory=list)
    by_type: Dict[str, int] = field(default_factory=dict)
    is_valid: bool = True

    def _with_severity(self, level: str) -> List[QualityIssue]:
        """Return the issues whose severity equals ``level``, in order."""
        selected: List[QualityIssue] = []
        for entry in self.issues:
            if entry.severity == level:
                selected.append(entry)
        return selected

    @property
    def errors(self) -> List[QualityIssue]:
        """Issues with severity "error"."""
        return self._with_severity("error")

    @property
    def warnings(self) -> List[QualityIssue]:
        """Issues with severity "warning"."""
        return self._with_severity("warning")

    def to_dict(self) -> Dict[str, Any]:
        """
        Build a plain-dictionary view of the report for serialization.

        Returns:
            Dictionary with the counters, the validity flag, the per-type
            breakdown and each issue converted through its own ``to_dict``.
        """
        serialized_issues = []
        for entry in self.issues:
            serialized_issues.append(entry.to_dict())

        payload: Dict[str, Any] = {}
        payload["total_issues"] = self.total_issues
        payload["error_count"] = self.error_count
        payload["warning_count"] = self.warning_count
        payload["info_count"] = self.info_count
        payload["is_valid"] = self.is_valid
        payload["by_type"] = self.by_type
        payload["issues"] = serialized_issues
        return payload

    def generate_report(self) -> str:
        """
        Render the report as multi-line text for human readers.

        Returns:
            Newline-joined report: a summary header, the per-type counts
            sorted by type name and, when there are issues, one entry
            (severity/type, location, message) per issue.
        """
        out: List[str] = []

        # Summary header
        out.append("=== Translation Quality Report ===")
        out.append(f"Valid: {self.is_valid}")
        out.append(f"Total Issues: {self.total_issues}")
        out.append(f"  Errors: {self.error_count}")
        out.append(f"  Warnings: {self.warning_count}")
        out.append(f"  Info: {self.info_count}")
        out.append("")
        out.append("Issues by Type:")

        # Per-type breakdown, ordered by type name
        out.extend(
            f"  {type_name}: {amount}"
            for type_name, amount in sorted(self.by_type.items())
        )

        # Detailed section is emitted only when there is something to list
        if self.issues:
            out.extend(["", "Detailed Issues:"])
            for entry in self.issues:
                out.extend([
                    f"  [{entry.severity.upper()}] {entry.issue_type.value}",
                    f"    Location: {entry.location}",
                    f"    Message: {entry.message}",
                ])

        return "\n".join(out)


class QualityChecker:
    """
    Checker for translation quality.

    Runs a set of independent heuristics over a source/target text pair
    (empty translation, line count mismatch, untranslated terms, abnormal
    length ratio, duplicated paragraphs) and collects the findings in a
    QualityReport.
    """

    # Default thresholds for quality checks
    DEFAULT_MIN_LENGTH_RATIO = 0.3  # Target length should be at least 30% of source
    DEFAULT_MAX_LENGTH_RATIO = 3.0   # Target length should be at most 3x source

    # Runs of characters in the U+4E00..U+9FFF range; compiled once for the
    # class instead of on every call.
    _CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]+')

    def __init__(
        self,
        min_length_ratio: float = DEFAULT_MIN_LENGTH_RATIO,
        max_length_ratio: float = DEFAULT_MAX_LENGTH_RATIO,
        untranslated_terms: Optional[Set[str]] = None
    ):
        """
        Initialize the quality checker.

        Args:
            min_length_ratio: Minimum acceptable length ratio (target/source)
            max_length_ratio: Maximum acceptable length ratio (target/source)
            untranslated_terms: Set of terms that should always be translated;
                each one found verbatim in a translation is reported.
                Defaults to an empty set.
        """
        self.min_length_ratio = min_length_ratio
        self.max_length_ratio = max_length_ratio
        self.untranslated_terms = untranslated_terms or set()

        # Default terms that should always be translated from Chinese.
        # NOTE: none of the checks in this class reads this attribute.
        self._default_chinese_terms = {
            "的", "了", "是", "在", "和", "与", "或", "但", "而", "如果", "因为",
            "所以", "然后", "之后", "之前", "已经", "还在", "可以", "应该", "需要",
            "想要", "希望", "觉得", "认为", "知道", "看到", "听到", "说到",
        }

    def check_translation(
        self,
        source: str,
        target: str,
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> QualityReport:
        """
        Perform a comprehensive quality check on translation.

        The checks run in a fixed order (empty, line count, untranslated
        terms, length ratio, duplicates) and the issues appear in the
        report in that order.

        Args:
            source: Original source text
            target: Translated text
            source_lang: Source language code
            tgt_lang: Target language code (accepted but not used by any
                of the checks)

        Returns:
            QualityReport with all issues found; ``is_valid`` is True when
            no issue has severity "error"
        """
        issues: List[QualityIssue] = []
        issues.extend(self._check_empty(source, target))
        issues.extend(self._check_line_count(source, target))
        issues.extend(self._check_untranslated_terms(source, target, source_lang))
        issues.extend(self._check_length_ratio(source, target))
        issues.extend(self._check_duplicates(target))

        # Tally issues per severity level
        error_count = sum(1 for i in issues if i.severity == "error")
        warning_count = sum(1 for i in issues if i.severity == "warning")
        info_count = sum(1 for i in issues if i.severity == "info")

        # Tally issues per type, keyed by the enum's string value
        by_type: Dict[str, int] = {}
        for issue in issues:
            key = issue.issue_type.value
            by_type[key] = by_type.get(key, 0) + 1

        return QualityReport(
            total_issues=len(issues),
            error_count=error_count,
            warning_count=warning_count,
            info_count=info_count,
            issues=issues,
            by_type=by_type,
            # Only error-level issues invalidate a translation
            is_valid=error_count == 0
        )

    def _check_empty(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for empty translations.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            A single error-level issue when ``target`` is empty or
            whitespace-only, otherwise an empty list
        """
        issues: List[QualityIssue] = []

        if not target or not target.strip():
            issues.append(QualityIssue(
                issue_type=QualityIssueType.EMPTY_TRANSLATION,
                location="entire text",
                message="Translation is empty",
                severity="error",
                source_text=source[:100],
                target_text=target,
            ))

        return issues

    def _check_line_count(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for line count mismatches.

        Only non-blank lines are counted. Nothing is reported when the
        source has no non-blank lines.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            At most one issue: reported when the target/source line ratio
            is outside [0.8, 1.2]; it is a warning while the ratio stays
            strictly between 0.5 and 1.5 and an error otherwise
        """
        issues: List[QualityIssue] = []

        # Blank lines are ignored on both sides
        source_count = sum(1 for line in source.split('\n') if line.strip())
        target_count = sum(1 for line in target.split('\n') if line.strip())

        if source_count == 0:
            return issues

        # Allow some tolerance (±20%)
        ratio = target_count / source_count
        if ratio < 0.8 or ratio > 1.2:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.LINE_COUNT_MISMATCH,
                location="entire text",
                message=f"Line count mismatch: source has {source_count} lines, target has {target_count} lines",
                severity="warning" if 0.5 < ratio < 1.5 else "error",
            ))

        return issues

    def _check_untranslated_terms(
        self,
        source: str,
        target: str,
        source_lang: str
    ) -> List[QualityIssue]:
        """
        Check for untranslated terms.

        Two checks are performed:

        1. When ``source_lang`` is exactly "zh", characters in the
           U+4E00..U+9FFF range are counted in both texts; a warning is
           reported when the target count exceeds 10% of the source count.
        2. For any source language, each configured term that occurs
           verbatim in the target yields one warning.

        Args:
            source: Original source text
            target: Translated text
            source_lang: Source language code

        Returns:
            List of warning-level issues (possibly empty)
        """
        issues: List[QualityIssue] = []

        # Use language-specific checks
        if source_lang == "zh":
            chinese_matches = self._CHINESE_PATTERN.findall(target)

            if chinese_matches:
                # Count how many Chinese characters remain
                total_chinese = sum(len(m) for m in chinese_matches)
                source_chinese = sum(
                    len(m) for m in self._CHINESE_PATTERN.findall(source)
                )

                # Without such characters in the source there is no
                # baseline for the ratio, so nothing is reported
                if source_chinese > 0:
                    untranslated_ratio = total_chinese / source_chinese
                    if untranslated_ratio > 0.1:  # More than 10% untranslated
                        issues.append(QualityIssue(
                            issue_type=QualityIssueType.UNTRANSLATED_TERM,
                            location="scattered",
                            message=f"Found {total_chinese} Chinese characters in translation ({untranslated_ratio:.1%} of source)",
                            severity="warning",
                            source_text="",
                            target_text=" ".join(chinese_matches[:10]),  # Show first 10
                        ))

        # Check for specific untranslated terms. Sorted so the order of the
        # reported issues does not depend on set iteration order.
        for term in sorted(self.untranslated_terms):
            if term in target:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.UNTRANSLATED_TERM,
                    location="scattered",
                    message=f"Source term '{term}' found untranslated",
                    severity="warning",
                    source_text=term,
                    target_text=term,
                ))

        return issues

    def _check_length_ratio(self, source: str, target: str) -> List[QualityIssue]:
        """
        Check for abnormal length ratios.

        Lengths are measured in characters after stripping surrounding
        whitespace. Nothing is reported when the stripped source is empty.

        Args:
            source: Original source text
            target: Translated text

        Returns:
            At most one issue: "too short" when the target/source ratio is
            below ``min_length_ratio`` (a warning while the ratio is above
            0.1, an error otherwise), or a "too long" warning when it is
            above ``max_length_ratio``
        """
        issues: List[QualityIssue] = []

        source_len = len(source.strip())
        target_len = len(target.strip())

        # No baseline to compare against
        if source_len == 0:
            return issues

        ratio = target_len / source_len

        if ratio < self.min_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too short: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning" if ratio > 0.1 else "error",
            ))
        elif ratio > self.max_length_ratio:
            issues.append(QualityIssue(
                issue_type=QualityIssueType.ABNORMAL_LENGTH,
                location="entire text",
                message=f"Translation too long: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
                severity="warning",
            ))

        return issues

    def _check_duplicates(self, text: str) -> List[QualityIssue]:
        """
        Check for duplicate content.

        The text is split into paragraphs on blank lines ("\\n\\n"); each
        paragraph is stripped and empty ones are dropped. Paragraph
        positions are 0-based.

        Args:
            text: Text to scan (the translation)

        Returns:
            One info-level issue for every paragraph that exactly repeats
            an earlier one; the message names the position of the first
            occurrence
        """
        issues: List[QualityIssue] = []

        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # Maps each distinct paragraph to the index of its FIRST occurrence.
        # The index is recorded only once so that later repeats still point
        # back to the original paragraph rather than to the previous repeat.
        first_seen: Dict[str, int] = {}
        for i, para in enumerate(paragraphs):
            if para in first_seen:
                issues.append(QualityIssue(
                    issue_type=QualityIssueType.DUPLICATE_CONTENT,
                    location=f"paragraph {i}",
                    message=f"Duplicate content (first seen at paragraph {first_seen[para]})",
                    severity="info",
                    source_text="",
                    # Long paragraphs are shortened to a 100-char excerpt
                    target_text=(para[:100] + "...") if len(para) > 100 else para,
                ))
            else:
                first_seen[para] = i

        return issues

    def check_batch(
        self,
        sources: List[str],
        targets: List[str],
        source_lang: str = "zh",
        tgt_lang: str = "en"
    ) -> List[QualityReport]:
        """
        Check multiple translation pairs.

        Args:
            sources: List of source texts
            targets: List of target texts
            source_lang: Source language code
            tgt_lang: Target language code

        Returns:
            List of QualityReport objects, one per pair, in input order

        Raises:
            ValueError: If ``sources`` and ``targets`` differ in length
        """
        if len(sources) != len(targets):
            raise ValueError("Source and target lists must have the same length")

        return [
            self.check_translation(s, t, source_lang, tgt_lang)
            for s, t in zip(sources, targets)
        ]

    def get_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
        """
        Get summary statistics from multiple reports.

        Args:
            reports: List of QualityReport objects

        Returns:
            Dictionary with summary statistics: report counts, validity
            rate as a percentage (100.0 for an empty list), issue, error
            and warning totals, and issue counts aggregated by type
        """
        total_reports = len(reports)
        valid_reports = sum(1 for r in reports if r.is_valid)

        # Aggregate by type
        by_type: Dict[str, int] = {}
        for report in reports:
            for issue_type, count in report.by_type.items():
                by_type[issue_type] = by_type.get(issue_type, 0) + count

        # Always a float, including the empty-input case, so consumers see
        # one consistent type for this field
        if total_reports > 0:
            validity_rate = valid_reports / total_reports * 100
        else:
            validity_rate = 100.0

        return {
            "total_translations": total_reports,
            "valid_translations": valid_reports,
            "invalid_translations": total_reports - valid_reports,
            "validity_rate": validity_rate,
            "total_issues": sum(r.total_issues for r in reports),
            "total_errors": sum(r.error_count for r in reports),
            "total_warnings": sum(r.warning_count for r in reports),
            "by_type": by_type,
        }