2
0

test_quality_checker.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. """
  2. Unit tests for the quality checker module.
  3. Tests cover translation quality checking functionality.
  4. """
  5. import sys
  6. from unittest.mock import Mock
  7. # Mock torch and transformers before importing
  8. sys_mock = Mock()
  9. sys.modules["torch"] = sys_mock
  10. sys.modules["transformers"] = sys_mock
  11. import pytest
  12. from src.translator.quality_checker import (
  13. QualityChecker,
  14. QualityReport,
  15. QualityIssue,
  16. QualityIssueType,
  17. )
  class TestQualityIssue:
      """Checks construction and dictionary export of ``QualityIssue``."""

      def test_create_quality_issue(self):
          """Constructor arguments can be read back from the instance."""
          created = QualityIssue(
              issue_type=QualityIssueType.UNTRANSLATED_TERM,
              location="paragraph 1",
              message="Found untranslated Chinese text",
              severity="warning",
          )

          # Attribute name -> value it is expected to hold, checked in order.
          expected_attrs = (
              ("issue_type", QualityIssueType.UNTRANSLATED_TERM),
              ("location", "paragraph 1"),
              ("severity", "warning"),
          )
          for attr_name, wanted in expected_attrs:
              assert getattr(created, attr_name) == wanted

      def test_to_dict(self):
          """``to_dict`` exposes the issue type, location and severity."""
          exported = QualityIssue(
              issue_type=QualityIssueType.ABNORMAL_LENGTH,
              location="entire text",
              message="Translation too short",
              severity="error",
              source_text="Test source",
              target_text="Test target",
          ).to_dict()

          # Dictionary key -> value expected under that key.
          expected_items = (
              ("issue_type", "abnormal_length"),
              ("location", "entire text"),
              ("severity", "error"),
          )
          for key, wanted in expected_items:
              assert exported[key] == wanted
  class TestQualityReport:
      """Checks the counters, export and text rendering of ``QualityReport``."""

      def test_create_quality_report(self):
          """A report built from one warning and one error reflects both."""
          warning_issue = QualityIssue(
              issue_type=QualityIssueType.UNTRANSLATED_TERM,
              location="p1",
              message="Test",
              severity="warning",
          )
          error_issue = QualityIssue(
              issue_type=QualityIssueType.ABNORMAL_LENGTH,
              location="p2",
              message="Test",
              severity="error",
          )

          built = QualityReport(
              total_issues=2,
              error_count=1,
              warning_count=1,
              info_count=0,
              issues=[warning_issue, error_issue],
              is_valid=False,
          )

          assert built.total_issues == 2
          assert built.error_count == 1
          assert built.warning_count == 1
          assert built.is_valid is False
          # ``errors`` and ``warnings`` are expected to hold one issue each.
          assert len(built.errors) == 1
          assert len(built.warnings) == 1

      def test_to_dict(self):
          """``to_dict`` on an issue-free report exports its totals and validity."""
          clean_report = QualityReport(
              total_issues=0,
              error_count=0,
              warning_count=0,
              info_count=0,
              is_valid=True,
          )

          exported = clean_report.to_dict()

          assert exported["total_issues"] == 0
          assert exported["is_valid"] is True

      def test_generate_report(self):
          """The rendered report contains its title, validity, total and issue type."""
          single_warning = QualityIssue(
              issue_type=QualityIssueType.UNTRANSLATED_TERM,
              location="p1",
              message="Found Chinese characters",
              severity="warning",
          )
          built = QualityReport(
              total_issues=1,
              error_count=0,
              warning_count=1,
              info_count=0,
              issues=[single_warning],
              is_valid=True,
          )

          rendered = built.generate_report()

          # Each fragment must appear somewhere in the rendered text.
          expected_fragments = (
              "Translation Quality Report",
              "Valid: True",
              "Total Issues: 1",
              "untranslated_term",
          )
          for fragment in expected_fragments:
              assert fragment in rendered
  class TestQualityChecker:
      """Behavioural tests for ``QualityChecker``.

      Covers construction, the individual checks exercised through
      ``check_translation``, batch checking and summary aggregation.
      """

      def test_init(self):
          """Default construction uses length-ratio bounds of 0.3 and 3.0."""
          checker = QualityChecker()
          assert checker.min_length_ratio == 0.3
          assert checker.max_length_ratio == 3.0

      def test_init_with_params(self):
          """Custom ratios and untranslated terms are stored on the checker."""
          checker = QualityChecker(
              min_length_ratio=0.5,
              max_length_ratio=2.0,
              untranslated_terms={"test", "example"},
          )
          assert checker.min_length_ratio == 0.5
          assert checker.max_length_ratio == 2.0
          assert "test" in checker.untranslated_terms

      def test_check_translation_valid(self):
          """A plain Chinese-to-English translation yields no issues."""
          checker = QualityChecker()
          source = "这是一个测试。"
          target = "This is a test."
          report = checker.check_translation(source, target)
          assert report.is_valid is True
          assert report.total_issues == 0

      def test_check_empty_translation(self):
          """An empty target is invalid and reported as EMPTY_TRANSLATION."""
          checker = QualityChecker()
          source = "这是一个测试。"
          target = ""
          report = checker.check_translation(source, target)
          assert report.is_valid is False
          assert report.error_count > 0
          assert any(
              i.issue_type == QualityIssueType.EMPTY_TRANSLATION
              for i in report.issues
          )

      def test_check_whitespace_only_translation(self):
          """A target made only of whitespace is invalid."""
          checker = QualityChecker()
          source = "这是一个测试。"
          target = " \n\t "
          report = checker.check_translation(source, target)
          assert report.is_valid is False

      def test_check_line_count_mismatch(self):
          """A target with fewer lines than the source is flagged."""
          checker = QualityChecker()
          source = "Line 1\nLine 2\nLine 3"
          target = "Line 1\nLine 2"  # Missing one line
          report = checker.check_translation(source, target)
          line_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.LINE_COUNT_MISMATCH
          ]
          assert len(line_issues) > 0

      def test_check_untranslated_chinese(self):
          """Chinese text copied verbatim into the target is flagged."""
          checker = QualityChecker()
          source = "这是一个测试文本,包含很多中文内容。"
          target = "这是一个测试文本,包含很多中文内容。"  # Untranslated
          report = checker.check_translation(source, target, source_lang="zh")
          untranslated_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.UNTRANSLATED_TERM
          ]
          assert len(untranslated_issues) > 0

      def test_check_untranslated_term(self):
          """A registered term left unchanged in the target is named in an issue."""
          checker = QualityChecker(untranslated_terms={"special_term"})
          source = "This is a special_term example."
          target = "This is a special_term example."  # Term not translated
          report = checker.check_translation(source, target)
          term_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.UNTRANSLATED_TERM
          ]
          assert any("special_term" in i.message for i in term_issues)

      def test_check_abnormal_length_short(self):
          """A target far shorter than the source raises ABNORMAL_LENGTH."""
          checker = QualityChecker(min_length_ratio=0.5)
          source = "This is a long source text with many words that should be translated properly."
          target = "Short."  # Too short
          report = checker.check_translation(source, target)
          length_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.ABNORMAL_LENGTH
          ]
          assert len(length_issues) > 0

      def test_check_abnormal_length_long(self):
          """A target far longer than the source raises ABNORMAL_LENGTH."""
          checker = QualityChecker(max_length_ratio=2.0)
          source = "Hi."
          target = "This is an extremely long translation that is much longer than the original source text."
          report = checker.check_translation(source, target)
          length_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.ABNORMAL_LENGTH
          ]
          assert len(length_issues) > 0

      def test_check_duplicate_content(self):
          """A paragraph repeated verbatim in the target raises DUPLICATE_CONTENT."""
          checker = QualityChecker()
          target = "This is paragraph one.\n\nThis is paragraph two.\n\nThis is paragraph one."  # Duplicate
          report = checker.check_translation("Source text.", target)
          duplicate_issues = [
              i for i in report.issues
              if i.issue_type == QualityIssueType.DUPLICATE_CONTENT
          ]
          assert len(duplicate_issues) > 0

      def test_check_batch(self):
          """``check_batch`` returns one report per source/target pair."""
          checker = QualityChecker()
          sources = ["Text 1.", "Text 2.", "Text 3."]
          targets = ["Translation 1.", "Translation 2.", "Translation 3."]
          reports = checker.check_batch(sources, targets)
          assert len(reports) == 3

      def test_check_batch_mismatched_lengths(self):
          """``check_batch`` raises ValueError when the lists differ in length."""
          checker = QualityChecker()
          sources = ["Text 1.", "Text 2."]
          targets = ["Translation 1."]
          with pytest.raises(ValueError, match="same length"):
              checker.check_batch(sources, targets)

      def test_get_summary(self):
          """``get_summary`` totals translations, validity and issues."""
          checker = QualityChecker()
          # Two valid reports and one invalid report, three issues in total.
          reports = [
              QualityReport(total_issues=0, error_count=0, warning_count=0, info_count=0, is_valid=True),
              QualityReport(total_issues=2, error_count=1, warning_count=1, info_count=0, is_valid=False),
              QualityReport(total_issues=1, error_count=0, warning_count=1, info_count=0, is_valid=True),
          ]
          summary = checker.get_summary(reports)
          assert summary["total_translations"] == 3
          assert summary["valid_translations"] == 2
          assert summary["invalid_translations"] == 1
          assert summary["total_issues"] == 3

      def test_get_summary_empty(self):
          """With no reports the summary counts zero and reports 100 validity."""
          checker = QualityChecker()
          summary = checker.get_summary([])
          assert summary["total_translations"] == 0
          assert summary["validity_rate"] == 100

      def test_issue_types_in_by_type(self):
          """``by_type`` lists every issue type raised for a translation."""
          # Register the term explicitly so the untranslated-term check has
          # something to flag; the checker used here before was built without it.
          checker = QualityChecker(untranslated_terms={"special_term"})
          text = (
              "Test content with special_term here.\n\n"
              "Another line.\n\n"
              "Test content with special_term here."
          )
          report = checker.check_translation(text, text)
          # The first paragraph is repeated verbatim and the registered term is
          # left as-is, so both categories are expected.  The earlier assertion
          # was or-ed with ``len(report.issues) >= 0``, which is always true and
          # therefore could never fail.
          # NOTE(review): assumes by_type is keyed by the issue type's string
          # value, as the earlier assertion did -- confirm against QualityReport.
          assert "duplicate_content" in report.by_type
          assert "untranslated_term" in report.by_type

      def test_severity_levels(self):
          """Empty, short and duplicated targets map to error, warning and info."""
          checker = QualityChecker(min_length_ratio=0.5, max_length_ratio=2.0)
          # Empty translation should be error
          report1 = checker.check_translation("Source", "")
          assert any(i.severity == "error" for i in report1.issues)
          # Slightly short translation should be warning
          report2 = checker.check_translation("This is medium length text.", "Short.")
          assert any(i.severity == "warning" for i in report2.issues)
          # Duplicate should be info
          report3 = checker.check_translation("A", "B\n\nB")
          assert any(i.severity == "info" for i in report3.issues)

      def test_check_translation_with_chinese_punctuation(self):
          """A target that still consists of Chinese characters raises issues."""
          checker = QualityChecker()
          # Source has Chinese content, target should not
          source = "你好,世界!"
          target = "你好世界"  # Not properly translated
          report = checker.check_translation(source, target, source_lang="zh")
          # Should detect Chinese characters remain
          assert report.total_issues > 0