|
|
@@ -0,0 +1,313 @@
|
|
|
+"""
|
|
|
+Integration tests for CleaningPipeline.
|
|
|
+"""
|
|
|
+
|
|
|
+import pytest
|
|
|
+from pathlib import Path
|
|
|
+from src.cleaning.pipeline import CleaningPipeline, CleaningPipelineError
|
|
|
+from src.cleaning.reader import TxtReader
|
|
|
+from src.cleaning.cleaner import TextCleaner
|
|
|
+from src.cleaning.splitter import ChapterSplitter
|
|
|
+
|
|
|
+
|
|
|
@pytest.fixture
def sample_file(tmp_path):
    """Write a small two-chapter Chinese TXT novel into a temp dir and return its path."""
    text = """第一章 开始

这是第一章的内容,包含一些文字。

林风站在山顶,看着远方。

第二章 继续

这是第二章的内容。

他开始了新的旅程。
"""
    target = tmp_path / "sample.txt"
    target.write_text(text, encoding="utf-8")
    return target
|
|
|
+
|
|
|
+
|
|
|
@pytest.fixture
def pipeline():
    """Provide a CleaningPipeline built with default components."""
    default_pipeline = CleaningPipeline()
    return default_pipeline
|
|
|
+
|
|
|
+
|
|
|
class TestCleaningPipeline:
    """Test suite for CleaningPipeline.

    Exercises the full read -> clean -> split flow against small temp-dir
    TXT fixtures, plus configuration toggles (cleaning/splitting disabled,
    custom components), batch processing, and error paths.
    """

    def test_process_basic(self, pipeline, sample_file):
        """Test basic file processing."""
        chapters = pipeline.process(sample_file)
        # The sample fixture contains two chapter headings (第一章/第二章).
        assert len(chapters) >= 2
        assert "第一章" in chapters[0].title

    def test_process_with_info(self, pipeline, sample_file):
        """Test processing with info return."""
        # return_info=True switches the return to a (chapters, info) pair.
        chapters, info = pipeline.process(sample_file, return_info=True)
        assert len(chapters) >= 2
        # Info dict is expected to report detection/size metadata.
        assert 'encoding' in info
        assert 'original_size' in info
        assert 'cleaned_size' in info

    def test_process_to_result(self, pipeline, sample_file):
        """Test processing to CleaningResult."""
        result = pipeline.process_to_result(sample_file)
        assert result.chapter_count >= 2
        assert result.original_char_count > 0
        assert result.cleaned_char_count > 0

    def test_removal_rate_property(self, pipeline, sample_file):
        """Test removal rate calculation."""
        result = pipeline.process_to_result(sample_file)
        rate = result.removal_rate
        # Removal rate is a fraction of characters removed by cleaning.
        assert 0.0 <= rate <= 1.0

    def test_read_and_clean(self, pipeline, sample_file):
        """Test reading and cleaning without splitting."""
        # read_and_clean skips the chapter-split stage and returns raw text.
        content = pipeline.read_and_clean(sample_file)
        assert isinstance(content, str)
        assert len(content) > 0

    def test_get_file_info(self, pipeline, sample_file):
        """Test getting file information."""
        info = pipeline.get_file_info(sample_file)
        assert info['exists'] is True
        assert info['is_file'] is True
        assert 'size' in info
        assert 'encoding' in info

    def test_get_file_info_nonexistent(self, pipeline, tmp_path):
        """Test getting info for non-existent file."""
        # Should report gracefully rather than raise for a missing path.
        info = pipeline.get_file_info(tmp_path / "nonexistent.txt")
        assert info['exists'] is False
        assert info['is_file'] is False

    def test_custom_components(self, sample_file):
        """Test pipeline with custom components."""
        # Pipeline accepts injected reader/cleaner/splitter instances.
        custom_reader = TxtReader(default_encoding="utf-8")
        custom_cleaner = TextCleaner(remove_extra_whitespace=True)
        custom_splitter = ChapterSplitter(min_chapter_length=10)

        pipeline = CleaningPipeline(
            reader=custom_reader,
            cleaner=custom_cleaner,
            splitter=custom_splitter
        )

        chapters = pipeline.process(sample_file)
        assert len(chapters) >= 2

    def test_disable_cleaning(self, sample_file):
        """Test pipeline with cleaning disabled."""
        pipeline = CleaningPipeline(enable_cleaning=False)
        chapters, info = pipeline.process(sample_file, return_info=True)
        assert len(chapters) >= 2
        # With cleaning off, no characters should be reported as removed.
        assert info.get('removed_chars', 0) == 0

    def test_disable_splitting(self, sample_file):
        """Test pipeline with splitting disabled."""
        pipeline = CleaningPipeline(enable_splitting=False)
        chapters = pipeline.process(sample_file)
        # Whole text is returned as a single "全文" (full text) chapter.
        assert len(chapters) == 1
        assert chapters[0].title == "全文"

    def test_create_custom_splitter(self, pipeline):
        """Test creating custom splitter."""
        # Replaces the pipeline's splitter in place with the given options.
        pipeline.create_custom_splitter(
            min_chapter_length=50,
            merge_short_chapters=False
        )
        assert pipeline.splitter.min_chapter_length == 50

    def test_create_custom_cleaner(self, pipeline):
        """Test creating custom cleaner."""
        pipeline.create_custom_cleaner(
            remove_extra_whitespace=True,
            fix_punctuation=True
        )
        assert pipeline.cleaner.remove_extra_whitespace is True

    def test_batch_process(self, pipeline, tmp_path):
        """Test batch processing multiple files."""
        # Create multiple files
        files = []
        for i in range(3):
            file_path = tmp_path / f"file_{i}.txt"
            content = f"第{i+1}章\n内容{i}\n"
            file_path.write_text(content, encoding="utf-8")
            files.append(file_path)

        results = pipeline.batch_process(files)
        assert len(results) == 3

        # Each result is a (path, chapters) pair on success.
        for path, chapters in results:
            assert isinstance(chapters, list)

    def test_batch_process_with_errors(self, pipeline, tmp_path):
        """Test batch processing with some errors."""
        files = [
            tmp_path / "exists.txt",
            tmp_path / "nonexistent.txt"
        ]
        files[0].write_text("内容", encoding="utf-8")

        # With raise_on_error=False, failures are returned inline as
        # (path, Exception) pairs instead of aborting the batch.
        results = pipeline.batch_process(files, raise_on_error=False)
        assert len(results) == 2
        assert isinstance(results[0][1], list)  # Success
        assert isinstance(results[1][1], Exception)  # Error

    def test_batch_process_raise_on_error(self, pipeline, tmp_path):
        """Test batch processing raises on error."""
        files = [tmp_path / "nonexistent.txt"]

        with pytest.raises(CleaningPipelineError):
            pipeline.batch_process(files, raise_on_error=True)

    def test_process_nonexistent_file(self, pipeline):
        """Test processing non-existent file raises error."""
        with pytest.raises(CleaningPipelineError):
            pipeline.process("/nonexistent/file.txt")

    def test_process_empty_file(self, pipeline, tmp_path):
        """Test processing empty file."""
        empty_file = tmp_path / "empty.txt"
        empty_file.write_text("", encoding="utf-8")

        chapters = pipeline.process(empty_file)
        # Should handle gracefully - either empty list or single empty chapter
        assert isinstance(chapters, list)

    def test_result_properties(self, pipeline, sample_file):
        """Test CleaningResult properties."""
        result = pipeline.process_to_result(sample_file)

        # Test chapter_count property
        assert result.chapter_count == len(result.chapters)

        # Test chapters have content
        for chapter in result.chapters:
            assert hasattr(chapter, 'content')
            assert hasattr(chapter, 'char_count')

    def test_chapter_word_count_property(self, pipeline, sample_file):
        """Test chapter word_count property."""
        chapters = pipeline.process(sample_file)
        for chapter in chapters:
            assert chapter.word_count >= 0

    def test_chapter_len_operator(self, pipeline, sample_file):
        """Test len() operator on chapters."""
        chapters = pipeline.process(sample_file)
        # Chapter.__len__ should mirror its char_count attribute.
        for chapter in chapters:
            assert len(chapter) == chapter.char_count

    def test_full_pipeline_integration(self, pipeline, sample_file):
        """Test full integration of all components."""
        # This test verifies the entire pipeline works together
        result = pipeline.process_to_result(sample_file)

        # Verify all stages completed
        assert result.chapter_count > 0
        assert result.original_char_count > 0
        assert result.cleaned_char_count >= 0

        # Verify chapter structure
        for chapter in result.chapters:
            assert hasattr(chapter, 'index')
            assert hasattr(chapter, 'title')
            assert hasattr(chapter, 'content')
            assert chapter.index >= 0

    def test_chinese_encoding_detection(self, pipeline, tmp_path):
        """Test processing files with different Chinese encodings."""
        # GBK encoded file
        # NOTE(review): relies on the reader's encoding auto-detection;
        # only verifies the file is readable, not the detected codec name.
        gbk_file = tmp_path / "gbk.txt"
        content = "第一章 测试\n内容"
        gbk_file.write_bytes(content.encode("gbk"))

        chapters = pipeline.process(gbk_file)
        assert len(chapters) >= 1

    def test_large_file_handling(self, pipeline, tmp_path):
        """Test handling larger files."""
        large_file = tmp_path / "large.txt"
        # Create a file with many chapters
        lines = []
        for i in range(50):
            lines.append(f"第{i+1}章")
            lines.append("这是测试内容。" * 10)

        large_file.write_text("\n".join(lines), encoding="utf-8")

        chapters = pipeline.process(large_file)
        # Each heading is long enough that no chapters get merged away.
        assert len(chapters) == 50

    def test_no_chapters_detected(self, pipeline, tmp_path):
        """Test file without chapter titles."""
        no_chapter_file = tmp_path / "no_chapter.txt"
        no_chapter_file.write_text("这是一段没有章节标题的文本。\n第二行内容。", encoding="utf-8")

        chapters = pipeline.process(no_chapter_file)
        # Should return single chapter with "全文" title
        assert len(chapters) == 1
        assert chapters[0].title == "全文"

    def test_special_characters_in_file(self, pipeline, tmp_path):
        """Test handling files with special characters."""
        special_file = tmp_path / "special.txt"
        content = "第一章:测试!\n\"引号\"内容\n\t制表符\n多种标点:;,。!?"
        special_file.write_text(content, encoding="utf-8")

        chapters = pipeline.process(special_file)
        assert len(chapters) >= 1

    def test_cleaning_statistics(self, pipeline, sample_file):
        """Test that cleaning statistics are accurate."""
        result = pipeline.process_to_result(sample_file)

        # Verify statistics are consistent
        # (only checked when cleaning actually removed something)
        if result.original_char_count > result.cleaned_char_count:
            assert result.removed_char_count > 0
            assert result.removed_char_count == result.original_char_count - result.cleaned_char_count

    def test_pipeline_with_custom_patterns(self, tmp_path):
        """Test pipeline with custom chapter patterns."""
        custom_file = tmp_path / "custom.txt"
        # Make content longer to avoid merging
        content = """EPISODE 1 Start

This is episode one with enough content to avoid merging.

EPISODE 2 Middle

This is episode two with enough content to avoid merging as well.
"""
        custom_file.write_text(content, encoding="utf-8")

        pipeline = CleaningPipeline()
        # custom_patterns entries are (regex, priority) pairs — TODO confirm
        # the second tuple element's meaning against ChapterSplitter.
        pipeline.create_custom_splitter(
            min_chapter_length=10,
            merge_short_chapters=False,
            custom_patterns=[(r'^EPISODE\s+\d+', 1)]
        )

        chapters = pipeline.process(custom_file)
        assert len(chapters) >= 2

    def test_is_binary_detection(self, pipeline, tmp_path):
        """Test binary file detection."""
        text_file = tmp_path / "text.txt"
        text_file.write_text("文本内容", encoding="utf-8")

        # NUL-heavy payload should be classified as binary.
        binary_file = tmp_path / "binary.bin"
        binary_file.write_bytes(b"\x00\x01\x02\x03" * 100)

        text_info = pipeline.get_file_info(text_file)
        binary_info = pipeline.get_file_info(binary_file)

        assert text_info['is_binary'] is False
        assert binary_info['is_binary'] is True
|