"""Unit tests for the quality checker module.

Tests cover translation quality checking functionality.
"""
import sys
from unittest.mock import Mock

# Register one shared Mock under both heavy dependency names *before* the
# module under test is imported (presumably because that import would
# otherwise pull in the real torch/transformers packages -- confirm).
# The entries are not removed again, so they remain in sys.modules for
# whatever is imported after this file.
sys_mock = Mock()
sys.modules.update({"torch": sys_mock, "transformers": sys_mock})

import pytest

from src.translator.quality_checker import (
    QualityChecker,
    QualityIssue,
    QualityIssueType,
    QualityReport,
)
class TestQualityIssue:
    """Checks construction and dictionary export of ``QualityIssue``."""

    def test_create_quality_issue(self):
        """Values given to the constructor are exposed as attributes."""
        fields = {
            "issue_type": QualityIssueType.UNTRANSLATED_TERM,
            "location": "paragraph 1",
            "message": "Found untranslated Chinese text",
            "severity": "warning",
        }

        created = QualityIssue(**fields)

        assert created.issue_type == QualityIssueType.UNTRANSLATED_TERM
        assert created.location == "paragraph 1"
        assert created.severity == "warning"

    def test_to_dict(self):
        """``to_dict`` exports the issue type as its string value."""
        exported = QualityIssue(
            issue_type=QualityIssueType.ABNORMAL_LENGTH,
            location="entire text",
            message="Translation too short",
            severity="error",
            source_text="Test source",
            target_text="Test target",
        ).to_dict()

        expected = {
            "issue_type": "abnormal_length",
            "location": "entire text",
            "severity": "error",
        }
        for key, value in expected.items():
            assert exported[key] == value
class TestQualityReport:
    """Checks construction, export and text rendering of ``QualityReport``."""

    def test_create_quality_report(self):
        """A report built from one warning and one error exposes both."""
        specs = (
            (QualityIssueType.UNTRANSLATED_TERM, "p1", "warning"),
            (QualityIssueType.ABNORMAL_LENGTH, "p2", "error"),
        )
        found = [
            QualityIssue(
                issue_type=kind,
                location=where,
                message="Test",
                severity=level,
            )
            for kind, where, level in specs
        ]

        report = QualityReport(
            total_issues=2,
            error_count=1,
            warning_count=1,
            info_count=0,
            issues=found,
            is_valid=False,
        )

        # Counters are read back exactly as supplied.
        assert report.total_issues == 2
        assert report.error_count == 1
        assert report.warning_count == 1
        assert report.is_valid is False
        # The errors / warnings views each pick out one of the two issues.
        assert len(report.errors) == 1
        assert len(report.warnings) == 1

    def test_to_dict(self):
        """An issue-free report exports zero issues and a valid flag."""
        exported = QualityReport(
            total_issues=0,
            error_count=0,
            warning_count=0,
            info_count=0,
            is_valid=True,
        ).to_dict()

        assert exported["total_issues"] == 0
        assert exported["is_valid"] is True

    def test_generate_report(self):
        """The rendered report contains the title, validity, count and type."""
        single_warning = QualityIssue(
            issue_type=QualityIssueType.UNTRANSLATED_TERM,
            location="p1",
            message="Found Chinese characters",
            severity="warning",
        )
        report = QualityReport(
            total_issues=1,
            error_count=0,
            warning_count=1,
            info_count=0,
            issues=[single_warning],
            is_valid=True,
        )

        rendered = report.generate_report()

        expected_fragments = (
            "Translation Quality Report",
            "Valid: True",
            "Total Issues: 1",
            "untranslated_term",
        )
        for fragment in expected_fragments:
            assert fragment in rendered
class TestQualityChecker:
    """Test cases for the ``QualityChecker`` class."""

    @staticmethod
    def _issues_of_type(report, issue_type):
        """Return the issues in *report* whose ``issue_type`` equals *issue_type*."""
        return [issue for issue in report.issues if issue.issue_type == issue_type]

    def test_init(self):
        """Test QualityChecker initialization with default length ratios."""
        checker = QualityChecker()

        assert checker.min_length_ratio == 0.3
        assert checker.max_length_ratio == 3.0

    def test_init_with_params(self):
        """Test QualityChecker with custom parameters."""
        checker = QualityChecker(
            min_length_ratio=0.5,
            max_length_ratio=2.0,
            untranslated_terms={"test", "example"},
        )

        assert checker.min_length_ratio == 0.5
        assert checker.max_length_ratio == 2.0
        assert "test" in checker.untranslated_terms

    def test_check_translation_valid(self):
        """Test checking a valid translation."""
        checker = QualityChecker()
        source = "这是一个测试。"
        target = "This is a test."

        report = checker.check_translation(source, target)

        assert report.is_valid is True
        assert report.total_issues == 0

    def test_check_empty_translation(self):
        """Test detection of empty translation."""
        checker = QualityChecker()
        source = "这是一个测试。"
        target = ""

        report = checker.check_translation(source, target)

        assert report.is_valid is False
        assert report.error_count > 0
        assert self._issues_of_type(report, QualityIssueType.EMPTY_TRANSLATION)

    def test_check_whitespace_only_translation(self):
        """Test detection of whitespace-only translation."""
        checker = QualityChecker()
        source = "这是一个测试。"
        target = " \n\t "

        report = checker.check_translation(source, target)

        assert report.is_valid is False

    def test_check_line_count_mismatch(self):
        """Test detection of line count mismatch."""
        checker = QualityChecker()
        source = "Line 1\nLine 2\nLine 3"
        target = "Line 1\nLine 2"  # Missing one line

        report = checker.check_translation(source, target)

        assert self._issues_of_type(report, QualityIssueType.LINE_COUNT_MISMATCH)

    def test_check_untranslated_chinese(self):
        """Test detection of untranslated Chinese characters."""
        checker = QualityChecker()
        source = "这是一个测试文本,包含很多中文内容。"
        target = "这是一个测试文本,包含很多中文内容。"  # Untranslated

        report = checker.check_translation(source, target, source_lang="zh")

        assert self._issues_of_type(report, QualityIssueType.UNTRANSLATED_TERM)

    def test_check_untranslated_term(self):
        """Test detection of specific untranslated terms."""
        checker = QualityChecker(untranslated_terms={"special_term"})
        source = "This is a special_term example."
        target = "This is a special_term example."  # Term not translated

        report = checker.check_translation(source, target)

        term_issues = self._issues_of_type(
            report, QualityIssueType.UNTRANSLATED_TERM
        )
        # The configured term must be named in at least one issue message.
        assert any("special_term" in issue.message for issue in term_issues)

    def test_check_abnormal_length_short(self):
        """Test detection of abnormally short translation."""
        checker = QualityChecker(min_length_ratio=0.5)
        source = "This is a long source text with many words that should be translated properly."
        target = "Short."  # Too short

        report = checker.check_translation(source, target)

        assert self._issues_of_type(report, QualityIssueType.ABNORMAL_LENGTH)

    def test_check_abnormal_length_long(self):
        """Test detection of abnormally long translation."""
        checker = QualityChecker(max_length_ratio=2.0)
        source = "Hi."
        target = "This is an extremely long translation that is much longer than the original source text."

        report = checker.check_translation(source, target)

        assert self._issues_of_type(report, QualityIssueType.ABNORMAL_LENGTH)

    def test_check_duplicate_content(self):
        """Test detection of duplicate content."""
        checker = QualityChecker()
        # First and third paragraphs are identical.
        target = "This is paragraph one.\n\nThis is paragraph two.\n\nThis is paragraph one."

        report = checker.check_translation("Source text.", target)

        assert self._issues_of_type(report, QualityIssueType.DUPLICATE_CONTENT)

    def test_check_batch(self):
        """Test checking multiple translations."""
        checker = QualityChecker()
        sources = ["Text 1.", "Text 2.", "Text 3."]
        targets = ["Translation 1.", "Translation 2.", "Translation 3."]

        reports = checker.check_batch(sources, targets)

        assert len(reports) == 3

    def test_check_batch_mismatched_lengths(self):
        """Test check_batch with mismatched list lengths."""
        checker = QualityChecker()
        sources = ["Text 1.", "Text 2."]
        targets = ["Translation 1."]

        with pytest.raises(ValueError, match="same length"):
            checker.check_batch(sources, targets)

    def test_get_summary(self):
        """Test getting summary from multiple reports."""
        checker = QualityChecker()
        # Two valid reports and one invalid one, three issues in total.
        reports = [
            QualityReport(
                total_issues=0, error_count=0, warning_count=0,
                info_count=0, is_valid=True,
            ),
            QualityReport(
                total_issues=2, error_count=1, warning_count=1,
                info_count=0, is_valid=False,
            ),
            QualityReport(
                total_issues=1, error_count=0, warning_count=1,
                info_count=0, is_valid=True,
            ),
        ]

        summary = checker.get_summary(reports)

        assert summary["total_translations"] == 3
        assert summary["valid_translations"] == 2
        assert summary["invalid_translations"] == 1
        assert summary["total_issues"] == 3

    def test_get_summary_empty(self):
        """Test getting summary with no reports."""
        checker = QualityChecker()

        summary = checker.get_summary([])

        assert summary["total_translations"] == 0
        assert summary["validity_rate"] == 100

    def test_issue_types_in_by_type(self):
        """Test that by_type correctly aggregates issue types.

        The checker is configured with ``special_term`` so the text triggers
        both an untranslated-term issue and, through its repeated paragraph,
        a duplicate-content issue.

        NOTE(review): assumes ``by_type`` is keyed by the issue type's string
        value, as the original assertion implied -- confirm against
        ``QualityReport``.
        """
        checker = QualityChecker(untranslated_terms={"special_term"})
        source = "Test content with special_term here.\n\nAnother line.\n\nTest content with special_term here."
        target = "Test content with special_term here.\n\nAnother line.\n\nTest content with special_term here."

        report = checker.check_translation(source, target)

        # The previous assertion ended in ``or len(report.issues) >= 0``,
        # which is always true and therefore could never fail.
        assert "duplicate_content" in report.by_type
        assert "untranslated_term" in report.by_type

    def test_severity_levels(self):
        """Test different severity levels."""
        checker = QualityChecker(min_length_ratio=0.5, max_length_ratio=2.0)

        # Empty translation should be error
        report1 = checker.check_translation("Source", "")
        assert any(issue.severity == "error" for issue in report1.issues)

        # Slightly short translation should be warning
        report2 = checker.check_translation("This is medium length text.", "Short.")
        assert any(issue.severity == "warning" for issue in report2.issues)

        # Duplicate should be info
        report3 = checker.check_translation("A", "B\n\nB")
        assert any(issue.severity == "info" for issue in report3.issues)

    def test_check_translation_with_chinese_punctuation(self):
        """Test checking translation with Chinese punctuation issues."""
        checker = QualityChecker()
        # Source has Chinese content, target should not
        source = "你好,世界!"
        target = "你好世界"  # Not properly translated

        report = checker.check_translation(source, target, source_lang="zh")

        # Should detect Chinese characters remain
        assert report.total_issues > 0
|