# Repository: 223-template-236 / blank (forked from 137-template-113/blank)
"""
Unit tests for TxtReader.
"""

import pytest
from pathlib import Path
from src.cleaning.reader import TxtReader, TxtReaderError


@pytest.fixture
def reader():
    """Provide a TxtReader built with its default arguments."""
    instance = TxtReader()
    return instance


@pytest.fixture
def temp_dir(tmp_path):
    """Expose pytest's ``tmp_path`` directory under the name ``temp_dir``."""
    scratch_dir = tmp_path
    return scratch_dir


@pytest.fixture
def utf8_file(temp_dir):
    """Create a UTF-8 encoded test file (no BOM).

    Returns:
        A ``(file_path, content)`` tuple: the path of the file written and
        the three-line text it holds.
    """
    file_path = temp_dir / "utf8_test.txt"
    content = "这是一个测试文件。\n第二行内容。\n第三行内容。"
    # Write raw bytes, as the GBK and BOM fixtures do. Text-mode writing
    # translates "\n" to the platform line separator, so on Windows the
    # bytes on disk would no longer match ``content``.
    file_path.write_bytes(content.encode("utf-8"))
    return file_path, content


@pytest.fixture
def gbk_file(temp_dir):
    """Write a two-line GBK-encoded sample and return ``(path, text)``."""
    text = "这是一个GBK编码的测试文件。\n第二行内容。"
    target = temp_dir / "gbk_test.txt"
    encoded = text.encode("gbk")
    target.write_bytes(encoded)
    return target, text


@pytest.fixture
def utf8_bom_file(temp_dir):
    """Write a UTF-8 sample prefixed with a byte-order mark.

    Returns ``(path, text)``; ``text`` is the content without the BOM.
    """
    text = "这是带有BOM的UTF-8文件。\n第二行。"
    target = temp_dir / "utf8_bom_test.txt"
    # The three-byte UTF-8 BOM goes first, followed by the encoded text.
    payload = b"\xef\xbb\xbf" + text.encode("utf-8")
    target.write_bytes(payload)
    return target, text


@pytest.fixture
def large_file(temp_dir):
    """Write a large UTF-8 sample for performance testing.

    Returns ``(path, text)``.
    """
    # 7 characters repeated 10000 times: 70,000 characters, which is
    # 210,000 bytes in UTF-8 (3 bytes per character).
    text = "这是测试内容。" * 10000
    target = temp_dir / "large_test.txt"
    target.write_text(text, encoding="utf-8")
    return target, text


class TestTxtReader:
    """Test suite for TxtReader.

    Covers whole-file reads of UTF-8, GBK and BOM-prefixed files, encoding
    detection, line-wise reads, binary detection, and edge cases (empty
    file, special characters, mixed line endings).
    """

    def test_read_utf8_file(self, reader, utf8_file):
        """Test reading a UTF-8 encoded file."""
        file_path, expected_content = utf8_file
        content = reader.read(file_path)
        assert content == expected_content

    def test_read_gbk_file(self, reader, gbk_file):
        """Test reading a GBK encoded file."""
        file_path, expected_content = gbk_file
        content = reader.read(file_path)
        assert content == expected_content

    def test_read_utf8_bom_file(self, reader, utf8_bom_file):
        """Test reading a UTF-8 file with BOM."""
        file_path, expected_content = utf8_bom_file
        content = reader.read(file_path)
        # The fixture's expected text has no BOM, so equality here also
        # checks that the BOM is not part of the returned content.
        assert content == expected_content

    def test_read_nonexistent_file(self, reader):
        """Test reading a non-existent file raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            reader.read("/nonexistent/path/file.txt")

    def test_read_with_info_returns_encoding(self, reader, utf8_file):
        """Test read_with_info returns both content and encoding."""
        file_path, expected_content = utf8_file
        content, encoding = reader.read_with_info(file_path)
        assert content == expected_content
        assert encoding == "utf-8"

    def test_read_with_info_gbk(self, reader, gbk_file):
        """Test read_with_info detects GBK encoding."""
        file_path, expected_content = gbk_file
        content, encoding = reader.read_with_info(file_path)
        assert content == expected_content
        # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030
        assert encoding in ["gbk", "gb2312", "gb18030"]

    def test_read_lines_keep_newlines(self, reader, utf8_file):
        """Test reading lines with newlines preserved."""
        file_path, _ = utf8_file
        lines = reader.read_lines(file_path, keep_newlines=True)
        assert len(lines) == 3
        assert lines[0].endswith("\n")

    def test_read_lines_without_newlines(self, reader, utf8_file):
        """Test reading lines without newlines."""
        file_path, _ = utf8_file
        lines = reader.read_lines(file_path, keep_newlines=False)
        assert len(lines) == 3
        assert not lines[0].endswith("\n")

    def test_detect_encoding_utf8(self, reader, utf8_file):
        """Test encoding detection for UTF-8."""
        file_path, _ = utf8_file
        encoding = reader.detect_encoding(file_path)
        assert encoding in ["utf-8", "ascii"]

    def test_detect_encoding_gbk(self, reader, gbk_file):
        """Test encoding detection for GBK."""
        file_path, _ = gbk_file
        encoding = reader.detect_encoding(file_path)
        # GB18030 is superset of GBK/GB2312, chardet may detect it as GB18030
        assert encoding in ["gbk", "gb2312", "gb18030"]

    def test_detect_encoding_nonexistent(self, reader):
        """Test encoding detection for non-existent file."""
        encoding = reader.detect_encoding("/nonexistent/file.txt")
        assert encoding is None

    def test_is_binary_with_text_file(self, reader, utf8_file):
        """Test is_binary returns False for text files."""
        file_path, _ = utf8_file
        assert not reader.is_binary(file_path)

    def test_is_binary_with_binary_file(self, reader, temp_dir):
        """Test is_binary returns True for binary files."""
        file_path = temp_dir / "binary_test.bin"
        file_path.write_bytes(b"\x00\x01\x02\x03" * 1000)
        assert reader.is_binary(file_path)

    def test_is_binary_nonexistent(self, reader):
        """Test is_binary returns False for non-existent file."""
        assert not reader.is_binary("/nonexistent/file.txt")

    def test_read_large_file_performance(self, reader, large_file):
        """Test that large files are read efficiently."""
        import time

        file_path, expected_content = large_file
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted, skewing the measurement.
        start_time = time.perf_counter()
        content = reader.read(file_path)
        elapsed = time.perf_counter() - start_time

        assert content == expected_content
        # The fixture file (about 210 KB of UTF-8) should read in under 1 second
        assert elapsed < 1.0

    def test_custom_default_encoding(self, temp_dir):
        """Test reader with custom default encoding."""
        file_path = temp_dir / "gbk_test.txt"
        content = "GBK编码测试"
        file_path.write_bytes(content.encode("gbk"))

        reader = TxtReader(default_encoding="gbk")
        result = reader.read(file_path)
        assert result == content

    def test_empty_file(self, reader, temp_dir):
        """Test reading an empty file."""
        file_path = temp_dir / "empty.txt"
        file_path.write_text("", encoding="utf-8")
        content = reader.read(file_path)
        assert content == ""

    def test_file_with_special_characters(self, reader, temp_dir):
        """Test reading file with various special characters."""
        file_path = temp_dir / "special.txt"
        content = "测试！@#$%^&*()_+-=[]{}|;':\",./<>?\n换行\n制表符\t内容"
        # Write exact bytes: text-mode writing would translate "\n" to the
        # platform line separator and change the file on Windows.
        file_path.write_bytes(content.encode("utf-8"))
        result = reader.read(file_path)
        assert result == content

    def test_file_with_mixed_line_endings(self, reader, temp_dir):
        """Test reading file with mixed line endings gets normalized."""
        file_path = temp_dir / "mixed_endings.txt"
        # Write with binary to preserve exact bytes
        content = "Line1\nLine2\r\nLine3\r"
        file_path.write_bytes(content.encode("utf-8"))
        result = reader.read(file_path)
        # Python's text mode normalizes line endings to \n
        expected = "Line1\nLine2\nLine3\n"
        assert result == expected


class TestTxtReaderErrorHandling:
    """Test error handling in TxtReader."""

    def test_directory_path_raises_error(self, reader, temp_dir):
        """Test that reading a directory raises TxtReaderError."""
        with pytest.raises(TxtReaderError):
            reader.read(temp_dir)

    def test_unreadable_encoding(self, reader, temp_dir):
        """Test that a file which is not valid UTF-8 is still read.

        The file holds UTF-16-LE bytes. The test requires only that
        ``read`` returns a non-empty string instead of raising; it does not
        require the text to be decoded correctly.
        """
        # Create a file that's valid UTF-16 but not readable as UTF-8
        file_path = temp_dir / "utf16_test.txt"
        content = "测试内容"
        file_path.write_bytes(content.encode("utf-16-le"))

        # Should still be able to read it via fallback
        result = reader.read(file_path)
        # The former check `"测试" in result or len(result) > 0` reduced to a
        # non-emptiness check, so assert that contract directly.
        assert isinstance(result, str) and len(result) > 0