2
0

quality_checker.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. """
  2. Translation quality checker module.
  3. This module provides functionality for checking translation quality
  4. by detecting common issues like missing content, untranslated terms,
  5. abnormal lengths, and duplicate content.
  6. """
  7. import re
  8. import logging
  9. from dataclasses import dataclass, field
  10. from typing import List, Dict, Any, Optional, Set
  11. from enum import Enum
  12. logger = logging.getLogger(__name__)
  13. class QualityIssueType(str, Enum):
  14. """Types of quality issues."""
  15. MISSING_CONTENT = "missing_content"
  16. UNTRANSLATED_TERM = "untranslated_term"
  17. ABNORMAL_LENGTH = "abnormal_length"
  18. DUPLICATE_CONTENT = "duplicate_content"
  19. LINE_COUNT_MISMATCH = "line_count_mismatch"
  20. EMPTY_TRANSLATION = "empty_translation"
  21. @dataclass
  22. class QualityIssue:
  23. """
  24. Represents a quality issue found in translation.
  25. Attributes:
  26. issue_type: The type of issue
  27. location: Location description (e.g., "paragraph 5", "line 10")
  28. message: Human-readable description
  29. severity: Issue severity ("error", "warning", "info")
  30. source_text: The source text that has the issue
  31. target_text: The translated text with the issue
  32. """
  33. issue_type: QualityIssueType
  34. location: str
  35. message: str
  36. severity: str = "warning"
  37. source_text: str = ""
  38. target_text: str = ""
  39. def to_dict(self) -> Dict[str, Any]:
  40. """Convert to dictionary for serialization."""
  41. return {
  42. "issue_type": self.issue_type.value,
  43. "location": self.location,
  44. "message": self.message,
  45. "severity": self.severity,
  46. "source_text": self.source_text[:200], # Truncate long text
  47. "target_text": self.target_text[:200],
  48. }
  49. @dataclass
  50. class QualityReport:
  51. """
  52. Report of quality check results.
  53. Attributes:
  54. total_issues: Total number of issues found
  55. error_count: Number of error-level issues
  56. warning_count: Number of warning-level issues
  57. info_count: Number of info-level issues
  58. issues: List of all issues found
  59. by_type: Breakdown of issues by type
  60. is_valid: Whether translation passed quality check
  61. """
  62. total_issues: int
  63. error_count: int
  64. warning_count: int
  65. info_count: int
  66. issues: List[QualityIssue] = field(default_factory=list)
  67. by_type: Dict[str, int] = field(default_factory=dict)
  68. is_valid: bool = True
  69. @property
  70. def errors(self) -> List[QualityIssue]:
  71. """Get all error-level issues."""
  72. return [i for i in self.issues if i.severity == "error"]
  73. @property
  74. def warnings(self) -> List[QualityIssue]:
  75. """Get all warning-level issues."""
  76. return [i for i in self.issues if i.severity == "warning"]
  77. def to_dict(self) -> Dict[str, Any]:
  78. """Convert to dictionary for serialization."""
  79. return {
  80. "total_issues": self.total_issues,
  81. "error_count": self.error_count,
  82. "warning_count": self.warning_count,
  83. "info_count": self.info_count,
  84. "is_valid": self.is_valid,
  85. "by_type": self.by_type,
  86. "issues": [i.to_dict() for i in self.issues],
  87. }
  88. def generate_report(self) -> str:
  89. """
  90. Generate a human-readable report.
  91. Returns:
  92. Formatted report string
  93. """
  94. lines = [
  95. "=== Translation Quality Report ===",
  96. f"Valid: {self.is_valid}",
  97. f"Total Issues: {self.total_issues}",
  98. f" Errors: {self.error_count}",
  99. f" Warnings: {self.warning_count}",
  100. f" Info: {self.info_count}",
  101. "",
  102. "Issues by Type:",
  103. ]
  104. for issue_type, count in sorted(self.by_type.items()):
  105. lines.append(f" {issue_type}: {count}")
  106. if self.issues:
  107. lines.append("")
  108. lines.append("Detailed Issues:")
  109. for issue in self.issues:
  110. lines.append(f" [{issue.severity.upper()}] {issue.issue_type.value}")
  111. lines.append(f" Location: {issue.location}")
  112. lines.append(f" Message: {issue.message}")
  113. return "\n".join(lines)
  114. class QualityChecker:
  115. """
  116. Checker for translation quality.
  117. This class checks translations for common quality issues including
  118. missing content, untranslated terms, abnormal lengths, and duplicates.
  119. """
  120. # Default thresholds for quality checks
  121. DEFAULT_MIN_LENGTH_RATIO = 0.3 # Target length should be at least 30% of source
  122. DEFAULT_MAX_LENGTH_RATIO = 3.0 # Target length should be at most 3x source
  123. def __init__(
  124. self,
  125. min_length_ratio: float = DEFAULT_MIN_LENGTH_RATIO,
  126. max_length_ratio: float = DEFAULT_MAX_LENGTH_RATIO,
  127. untranslated_terms: Optional[Set[str]] = None
  128. ):
  129. """
  130. Initialize the quality checker.
  131. Args:
  132. min_length_ratio: Minimum acceptable length ratio (target/source)
  133. max_length_ratio: Maximum acceptable length ratio (target/source)
  134. untranslated_terms: Set of terms that should always be translated
  135. """
  136. self.min_length_ratio = min_length_ratio
  137. self.max_length_ratio = max_length_ratio
  138. self.untranslated_terms = untranslated_terms or set()
  139. # Default terms that should always be translated from Chinese
  140. self._default_chinese_terms = {
  141. "的", "了", "是", "在", "和", "与", "或", "但", "而", "如果", "因为",
  142. "所以", "然后", "之后", "之前", "已经", "还在", "可以", "应该", "需要",
  143. "想要", "希望", "觉得", "认为", "知道", "看到", "听到", "说到",
  144. }
  145. def check_translation(
  146. self,
  147. source: str,
  148. target: str,
  149. source_lang: str = "zh",
  150. tgt_lang: str = "en"
  151. ) -> QualityReport:
  152. """
  153. Perform a comprehensive quality check on translation.
  154. Args:
  155. source: Original source text
  156. target: Translated text
  157. source_lang: Source language code
  158. tgt_lang: Target language code
  159. Returns:
  160. QualityReport with all issues found
  161. """
  162. issues: List[QualityIssue] = []
  163. # Check for empty translation
  164. issues.extend(self._check_empty(source, target))
  165. # Check line count mismatch
  166. issues.extend(self._check_line_count(source, target))
  167. # Check for untranslated terms
  168. issues.extend(self._check_untranslated_terms(source, target, source_lang))
  169. # Check for abnormal length
  170. issues.extend(self._check_length_ratio(source, target))
  171. # Check for duplicate content
  172. issues.extend(self._check_duplicates(target))
  173. # Calculate statistics
  174. error_count = sum(1 for i in issues if i.severity == "error")
  175. warning_count = sum(1 for i in issues if i.severity == "warning")
  176. info_count = sum(1 for i in issues if i.severity == "info")
  177. # Group by type
  178. by_type: Dict[str, int] = {}
  179. for issue in issues:
  180. by_type[issue.issue_type.value] = by_type.get(issue.issue_type.value, 0) + 1
  181. # Determine if translation is valid (no errors)
  182. is_valid = error_count == 0
  183. return QualityReport(
  184. total_issues=len(issues),
  185. error_count=error_count,
  186. warning_count=warning_count,
  187. info_count=info_count,
  188. issues=issues,
  189. by_type=by_type,
  190. is_valid=is_valid
  191. )
  192. def _check_empty(self, source: str, target: str) -> List[QualityIssue]:
  193. """Check for empty translations."""
  194. issues = []
  195. if not target or not target.strip():
  196. issues.append(QualityIssue(
  197. issue_type=QualityIssueType.EMPTY_TRANSLATION,
  198. location="entire text",
  199. message="Translation is empty",
  200. severity="error",
  201. source_text=source[:100],
  202. target_text=target,
  203. ))
  204. return issues
  205. def _check_line_count(self, source: str, target: str) -> List[QualityIssue]:
  206. """Check for line count mismatches."""
  207. issues = []
  208. source_lines = source.split('\n')
  209. target_lines = target.split('\n')
  210. # Filter out empty lines for comparison
  211. source_nonempty = [l for l in source_lines if l.strip()]
  212. target_nonempty = [l for l in target_lines if l.strip()]
  213. source_count = len(source_nonempty)
  214. target_count = len(target_nonempty)
  215. # Allow some tolerance (±20%)
  216. if source_count > 0:
  217. ratio = target_count / source_count
  218. if ratio < 0.8 or ratio > 1.2:
  219. issues.append(QualityIssue(
  220. issue_type=QualityIssueType.LINE_COUNT_MISMATCH,
  221. location="entire text",
  222. message=f"Line count mismatch: source has {source_count} lines, target has {target_count} lines",
  223. severity="warning" if 0.5 < ratio < 1.5 else "error",
  224. ))
  225. return issues
  226. def _check_untranslated_terms(
  227. self,
  228. source: str,
  229. target: str,
  230. source_lang: str
  231. ) -> List[QualityIssue]:
  232. """Check for untranslated terms."""
  233. issues = []
  234. # Use language-specific checks
  235. if source_lang == "zh":
  236. # Check for remaining Chinese characters
  237. chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
  238. chinese_matches = chinese_pattern.findall(target)
  239. if chinese_matches:
  240. # Count how many Chinese characters remain
  241. total_chinese = sum(len(m) for m in chinese_matches)
  242. source_chinese = sum(len(m) for m in chinese_pattern.findall(source))
  243. if source_chinese > 0:
  244. untranslated_ratio = total_chinese / source_chinese
  245. if untranslated_ratio > 0.1: # More than 10% untranslated
  246. issues.append(QualityIssue(
  247. issue_type=QualityIssueType.UNTRANSLATED_TERM,
  248. location="scattered",
  249. message=f"Found {total_chinese} Chinese characters in translation ({untranslated_ratio:.1%} of source)",
  250. severity="warning",
  251. source_text="",
  252. target_text=" ".join(chinese_matches[:10]), # Show first 10
  253. ))
  254. # Check for specific untranslated terms
  255. for term in self.untranslated_terms:
  256. if term in target:
  257. issues.append(QualityIssue(
  258. issue_type=QualityIssueType.UNTRANSLATED_TERM,
  259. location="scattered",
  260. message=f"Source term '{term}' found untranslated",
  261. severity="warning",
  262. source_text=term,
  263. target_text=term,
  264. ))
  265. return issues
  266. def _check_length_ratio(self, source: str, target: str) -> List[QualityIssue]:
  267. """Check for abnormal length ratios."""
  268. issues = []
  269. source_len = len(source.strip())
  270. target_len = len(target.strip())
  271. if source_len == 0:
  272. return issues
  273. ratio = target_len / source_len
  274. if ratio < self.min_length_ratio:
  275. issues.append(QualityIssue(
  276. issue_type=QualityIssueType.ABNORMAL_LENGTH,
  277. location="entire text",
  278. message=f"Translation too short: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
  279. severity="warning" if ratio > 0.1 else "error",
  280. ))
  281. elif ratio > self.max_length_ratio:
  282. issues.append(QualityIssue(
  283. issue_type=QualityIssueType.ABNORMAL_LENGTH,
  284. location="entire text",
  285. message=f"Translation too long: {target_len} chars vs {source_len} chars (ratio: {ratio:.2f})",
  286. severity="warning",
  287. ))
  288. return issues
  289. def _check_duplicates(self, text: str) -> List[QualityIssue]:
  290. """Check for duplicate content."""
  291. issues = []
  292. # Split into paragraphs and check for duplicates
  293. paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
  294. # Find duplicates
  295. seen: Dict[str, int] = {}
  296. for i, para in enumerate(paragraphs):
  297. if para in seen:
  298. issues.append(QualityIssue(
  299. issue_type=QualityIssueType.DUPLICATE_CONTENT,
  300. location=f"paragraph {i}",
  301. message=f"Duplicate content (first seen at paragraph {seen[para]})",
  302. severity="info",
  303. source_text="",
  304. target_text=para[:100] + "..." if len(para) > 100 else para,
  305. ))
  306. seen[para] = i
  307. return issues
  308. def check_batch(
  309. self,
  310. sources: List[str],
  311. targets: List[str],
  312. source_lang: str = "zh",
  313. tgt_lang: str = "en"
  314. ) -> List[QualityReport]:
  315. """
  316. Check multiple translation pairs.
  317. Args:
  318. sources: List of source texts
  319. targets: List of target texts
  320. source_lang: Source language code
  321. tgt_lang: Target language code
  322. Returns:
  323. List of QualityReport objects
  324. """
  325. if len(sources) != len(targets):
  326. raise ValueError("Source and target lists must have the same length")
  327. return [
  328. self.check_translation(s, t, source_lang, tgt_lang)
  329. for s, t in zip(sources, targets)
  330. ]
  331. def get_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
  332. """
  333. Get summary statistics from multiple reports.
  334. Args:
  335. reports: List of QualityReport objects
  336. Returns:
  337. Dictionary with summary statistics
  338. """
  339. total_reports = len(reports)
  340. valid_reports = sum(1 for r in reports if r.is_valid)
  341. total_issues = sum(r.total_issues for r in reports)
  342. total_errors = sum(r.error_count for r in reports)
  343. total_warnings = sum(r.warning_count for r in reports)
  344. # Aggregate by type
  345. by_type: Dict[str, int] = {}
  346. for report in reports:
  347. for issue_type, count in report.by_type.items():
  348. by_type[issue_type] = by_type.get(issue_type, 0) + count
  349. return {
  350. "total_translations": total_reports,
  351. "valid_translations": valid_reports,
  352. "invalid_translations": total_reports - valid_reports,
  353. "validity_rate": (valid_reports / total_reports * 100) if total_reports > 0 else 100,
  354. "total_issues": total_issues,
  355. "total_errors": total_errors,
  356. "total_warnings": total_warnings,
  357. "by_type": by_type,
  358. }