瀏覽代碼

feat: Prepare Phase 1 development environment

- Create Epic 1.1 (State Machine) story breakdown (5 stories, 18 SP)
- Create Epic 4 (Glossary) story breakdown (5 stories, 26 SP, P0 priority)
- Set up project code structure:
  - src/pipeline/: State machine implementation
  - src/glossary/: Terminology management
  - src/utils/: Persistence utilities
- Add comprehensive test suites:
  - tests/test_state_machine.py: State transition tests
  - tests/test_glossary.py: Terminology processing tests
- Add requirements.txt with dependencies (transitions, pytest)
- Add pytest.ini configuration for coverage reporting

This prepares the development environment for Phase 1a (Infrastructure)
and Phase 1b (Glossary) implementation based on Party Mode prioritization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
d8dfun 2 天之前
父節點
當前提交
d55ea7de17

+ 6 - 0
pytest.ini

@@ -0,0 +1,6 @@
+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = --cov=src --cov-report=term-missing --cov-report=html

+ 12 - 0
requirements.txt

@@ -0,0 +1,12 @@
+# 223-236-template-6: BMAD Novel Translator
+# Python dependencies
+
+# State Machine
+transitions==0.9.0
+
+# Testing
+pytest==7.4.0
+pytest-cov==4.1.0
+
+# Utilities
+pyyaml==6.0.1

+ 7 - 0
src/__init__.py

@@ -0,0 +1,7 @@
+"""
+223-236-template-6: BMAD Novel Translator
+
+A novel translation tool with glossary support and crash-safe state management.
+"""
+
+__version__ = "0.1.0"

+ 21 - 0
src/glossary/__init__.py

@@ -0,0 +1,21 @@
+"""
+Glossary module for terminology management.
+
+This module provides terminology extraction, matching, and replacement
+to ensure consistent translations of proper nouns and character names.
+"""
+
+from .models import Glossary, GlossaryEntry
+from .matcher import GlossaryMatcher, TermMatch
+from .preprocessor import GlossaryPreprocessor, PreprocessingResult
+from .postprocessor import GlossaryPostprocessor
+
+__all__ = [
+    "Glossary",
+    "GlossaryEntry",
+    "GlossaryMatcher",
+    "TermMatch",
+    "GlossaryPreprocessor",
+    "PreprocessingResult",
+    "GlossaryPostprocessor",
+]

+ 146 - 0
src/glossary/matcher.py

@@ -0,0 +1,146 @@
+"""
+Term matching engine for the glossary module.
+
+This module implements the longest-match algorithm for terminology replacement.
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+import re
+
+from .models import Glossary
+
+
@dataclass
class TermMatch:
    """
    One occurrence of a glossary term located in a text.

    Attributes:
        source: Original source-language term.
        target: Translated target-language term.
        start: Index of the first character of the match.
        end: Index one past the last character of the match.
        placeholder: Placeholder string that stands in for the term.
    """

    source: str
    target: str
    start: int
    end: int
    placeholder: str


class GlossaryMatcher:
    """
    Finds glossary terms in text and swaps them with placeholders.

    Terms are scanned longest-first so that a longer entry such as
    "魔法师" wins over its prefix "魔法".
    """

    PLACEHOLDER_PREFIX = "__en__"

    def __init__(self, glossary: Glossary):
        """
        Store the glossary and cache its terms sorted longest-first.

        Args:
            glossary: The Glossary providing the terms to match.
        """
        self.glossary = glossary
        self._sorted_terms = glossary.sort_by_length_desc()

    def find_matches(self, text: str) -> List[TermMatch]:
        """
        Locate every non-overlapping term occurrence in the given text.

        Args:
            text: The text to search for terms.

        Returns:
            TermMatch objects ordered by their starting position.
        """
        found: List[TermMatch] = []
        claimed = set()  # character indices already covered by a match

        for term in self._sorted_terms:
            entry = self.glossary.get(term)
            if entry is None:
                continue

            search_from = 0
            while (hit := text.find(term, search_from)) != -1:
                hit_end = hit + len(term)

                # Skip occurrences that collide with an earlier (longer) match.
                if all(i not in claimed for i in range(hit, hit_end)):
                    found.append(
                        TermMatch(
                            source=term,
                            target=entry.target,
                            start=hit,
                            end=hit_end,
                            placeholder=f"{self.PLACEHOLDER_PREFIX}{term}",
                        )
                    )
                    claimed.update(range(hit, hit_end))

                search_from = hit + 1

        found.sort(key=lambda m: m.start)
        return found

    def replace_with_placeholder(self, text: str) -> Tuple[str, Dict[str, str]]:
        """
        Substitute each matched term in the text with its placeholder.

        Args:
            text: The text to process.

        Returns:
            Tuple of (processed_text, placeholder -> translation mapping).
        """
        mapping: Dict[str, str] = {}
        pieces: List[str] = []
        cursor = 0

        # Matches are sorted and non-overlapping, so the text can be
        # rebuilt left-to-right in a single pass.
        for match in self.find_matches(text):
            mapping[match.placeholder] = match.target
            pieces.append(text[cursor:match.start])
            pieces.append(match.placeholder)
            cursor = match.end
        pieces.append(text[cursor:])

        return "".join(pieces), mapping

    def restore_from_placeholder(self, text: str, mapping: Dict[str, str]) -> str:
        """
        Replace every placeholder in the text with its translation.

        Args:
            text: The text containing placeholders.
            mapping: The placeholder -> translation mapping.

        Returns:
            Text with placeholders replaced by translations.
        """
        for placeholder, translation in mapping.items():
            text = text.replace(placeholder, translation)
        return text

+ 128 - 0
src/glossary/models.py

@@ -0,0 +1,128 @@
+"""
+Data models for the glossary module.
+
+This module defines the core data structures for terminology management.
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+from enum import Enum
+
+
class TermCategory(Enum):
    """Categories for terminology entries.

    Purely informational tags on GlossaryEntry records; the matching
    logic in GlossaryMatcher does not consult the category.
    """

    CHARACTER = "character"  # Character names (e.g. 林风)
    SKILL = "skill"  # Skill names (e.g. 火球术)
    LOCATION = "location"  # Location names (e.g. 东方大陆)
    ITEM = "item"  # Item names (e.g. 龙剑)
    ORGANIZATION = "organization"  # Organization names (e.g. 魔法学院)
    OTHER = "other"  # Anything not covered above
+
+
@dataclass
class GlossaryEntry:
    """
    A single source -> target terminology pair.

    Attributes:
        source: The term in the source language.
        target: The term in the target language.
        category: Kind of term (character, skill, location, ...).
        context: Optional free-form usage note.

    Raises:
        ValueError: On construction, when source or target is empty/blank.
    """

    source: str
    target: str
    category: TermCategory
    context: str = ""

    def __post_init__(self):
        """Reject entries whose source or target side is blank."""
        if not self.source or not self.source.strip():
            raise ValueError("Source term cannot be empty")
        if not self.target or not self.target.strip():
            raise ValueError("Target term cannot be empty")

    @property
    def length(self) -> int:
        """Length of the source term in characters."""
        return len(self.source)
+
+
class Glossary:
    """
    Container mapping source terms to their GlossaryEntry records.

    Keeping a single authoritative term table is what guarantees that a
    given term is translated the same way everywhere in a document.
    """

    def __init__(self):
        """Create a glossary with no entries."""
        self._terms: Dict[str, GlossaryEntry] = {}

    def add(self, entry: GlossaryEntry) -> None:
        """
        Insert an entry, overwriting any existing entry with the same source.

        Args:
            entry: The GlossaryEntry to store.
        """
        self._terms[entry.source] = entry

    def get(self, source: str) -> Optional[GlossaryEntry]:
        """
        Look up the entry for a source term.

        Args:
            source: The source term to look up.

        Returns:
            The GlossaryEntry if present, otherwise None.
        """
        return self._terms.get(source)

    def remove(self, source: str) -> bool:
        """
        Delete the entry for a source term.

        Args:
            source: The source term to remove.

        Returns:
            True when an entry was deleted, False when none existed.
        """
        try:
            del self._terms[source]
        except KeyError:
            return False
        return True

    def get_all(self) -> List[GlossaryEntry]:
        """
        Return every entry in insertion order.

        Returns:
            List of all GlossaryEntry objects.
        """
        return list(self._terms.values())

    def sort_by_length_desc(self) -> List[str]:
        """
        Return source terms ordered longest-first.

        Longest-first ordering drives longest-match processing, so that
        e.g. "魔法师" is tried before its prefix "魔法".

        Returns:
            Source terms sorted by length, descending.
        """
        return sorted(self._terms, key=len, reverse=True)

    def __len__(self) -> int:
        """Number of entries in the glossary."""
        return len(self._terms)

    def __contains__(self, source: str) -> bool:
        """True when the given source term has an entry."""
        return source in self._terms

+ 140 - 0
src/glossary/postprocessor.py

@@ -0,0 +1,140 @@
+"""
+Postprocessing module for terminology restoration.
+
+This module handles the postprocessing of text after translation,
+restoring placeholders to their translated terms and fixing punctuation.
+"""
+
+import re
+from dataclasses import dataclass
+from typing import Dict, List
+
+from .matcher import GlossaryMatcher
+
+
@dataclass
class ValidationResult:
    """
    Result of validating translation completeness.

    Note: a term whose placeholder was never restored can legitimately
    appear in BOTH lists — as an unreplaced placeholder and as a missing
    translation.

    Attributes:
        is_valid: True only when both lists below are empty
        missing_terms: Source terms whose translation was not found in the output
        extra_placeholders: Placeholder strings still present in the output
    """

    is_valid: bool
    missing_terms: List[str]
    extra_placeholders: List[str]
+
+
class GlossaryPostprocessor:
    """
    Postprocessor for restoring placeholders after translation.

    Responsibilities:
      * swap placeholders back to their glossary translations
      * normalize punctuation around the restored terms
      * validate that no placeholder survived and no translation is missing
    """

    def __init__(self):
        """Compile the pattern used to detect leftover placeholders."""
        # Matches "__en__<token>" where <token> may contain single
        # underscores but no whitespace or double underscores.
        self.placeholder_pattern = re.compile(r"__en__([^_\s]+(?:_[^_\s]+)*)")

    def process(self, translated_text: str, placeholder_map: Dict[str, str]) -> str:
        """
        Restore placeholders in translated text, then tidy punctuation.

        Args:
            translated_text: The translated text containing placeholders
            placeholder_map: Mapping from placeholders to translations

        Returns:
            Fully restored, punctuation-fixed text
        """
        restored = self.restore_from_placeholder(translated_text, placeholder_map)
        return self.fix_punctuation(restored)

    def restore_from_placeholder(self, text: str, mapping: Dict[str, str]) -> str:
        """
        Replace placeholders with translations, longest placeholder first.

        Longest-first ordering prevents a short placeholder from clobbering
        the prefix of a longer one (e.g. "__en__魔法" inside "__en__魔法师").

        Args:
            text: The text containing placeholders
            mapping: The placeholder to translation mapping

        Returns:
            Text with placeholders replaced by translations
        """
        for placeholder in sorted(mapping, key=len, reverse=True):
            text = text.replace(placeholder, mapping[placeholder])
        return text

    def fix_punctuation(self, text: str) -> str:
        """
        Fix common punctuation issues in translated text.

        Fixes:
        - Spaces before ASCII punctuation (e.g. "Lin Feng ." → "Lin Feng.")
        - Full-width punctuation directly after Latin text, converted to the
          ASCII form followed by exactly one space. "[ ]*" consumes any
          pre-existing spaces, so "Feng, x" becomes "Feng, x" rather than
          the double-spaced "Feng,  x" the previous version produced.

        Args:
            text: The text to fix

        Returns:
            Text with fixed punctuation
        """
        # Remove space(s) before common ASCII punctuation.
        text = re.sub(r"\s+([.,!?;:)])", r"\1", text)

        # Full-width → ASCII after Latin letters. This is a simple
        # heuristic — more sophisticated NLP could be used.
        text = re.sub(r"([a-zA-Z]),[ ]*", r"\1, ", text)
        text = re.sub(r"([a-zA-Z])。[ ]*", r"\1. ", text)
        text = re.sub(r"([a-zA-Z]);[ ]*", r"\1; ", text)
        text = re.sub(r"([a-zA-Z]):[ ]*", r"\1: ", text)

        return text

    def validate_translation(
        self, original: str, translated: str, placeholder_map: Dict[str, str]
    ) -> ValidationResult:
        """
        Validate that restoration left no placeholders and lost no terms.

        Args:
            original: Original text before translation (currently unused;
                kept for interface stability and future diff-based checks)
            translated: Translated text (should have placeholders restored)
            placeholder_map: The placeholder mapping used

        Returns:
            ValidationResult; valid only when no placeholder remains and
            every expected translation appears in *translated*
        """
        # Placeholders that survived restoration.
        leftover = self.placeholder_pattern.findall(translated)
        extra_placeholders = [f"__en__{token}" for token in leftover]

        # Expected translations that never made it into the output.
        # NOTE: substring check is a heuristic — a translation embedded
        # inside a longer word still counts as present.
        missing_terms = []
        for placeholder, translation in placeholder_map.items():
            if translation not in translated:
                source = placeholder.replace(GlossaryMatcher.PLACEHOLDER_PREFIX, "")
                missing_terms.append(source)

        return ValidationResult(
            is_valid=not extra_placeholders and not missing_terms,
            missing_terms=missing_terms,
            extra_placeholders=extra_placeholders,
        )

+ 126 - 0
src/glossary/preprocessor.py

@@ -0,0 +1,126 @@
+"""
+Preprocessing module for terminology replacement.
+
+This module handles the preprocessing of text before translation,
+replacing terms with placeholders to ensure consistent translation.
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List
+
+from .models import Glossary
+from .matcher import GlossaryMatcher
+
+
@dataclass
class PreprocessingResult:
    """
    Result of the preprocessing step.

    Attributes:
        processed_text: Text with glossary terms replaced by placeholders
        placeholder_map: Mapping from placeholder string to target translation
        terms_found: Occurrence count per matched source term
        retention_rate: Intended as the percentage (0-100) of original
            characters not replaced by a placeholder — confirm against the
            preprocessor's retention calculation
    """

    processed_text: str
    placeholder_map: Dict[str, str]
    terms_found: Dict[str, int]
    retention_rate: float
+
+
class GlossaryPreprocessor:
    """
    Preprocessor for replacing terms with placeholders before translation.

    Locking terms behind placeholders keeps the translation engine from
    re-translating them inconsistently.
    """

    def __init__(self, glossary: Glossary):
        """
        Initialize the preprocessor with a glossary.

        Args:
            glossary: The Glossary to use for replacement
        """
        self.glossary = glossary
        self.matcher = GlossaryMatcher(glossary)

    def process(self, text: str) -> PreprocessingResult:
        """
        Process text by replacing terms with placeholders.

        Args:
            text: The text to process

        Returns:
            PreprocessingResult with processed text and metadata
        """
        # Find all matches first to collect statistics.
        matches = self.matcher.find_matches(text)

        # Occurrence count per source term.
        terms_found: Dict[str, int] = {}
        for match in matches:
            terms_found[match.source] = terms_found.get(match.source, 0) + 1

        # Replace with placeholders.
        processed_text, placeholder_map = self.matcher.replace_with_placeholder(text)

        # Calculate retention rate.
        retention_rate = self._calculate_retention_rate(text, processed_text)

        return PreprocessingResult(
            processed_text=processed_text,
            placeholder_map=placeholder_map,
            terms_found=terms_found,
            retention_rate=retention_rate,
        )

    def process_batch(self, texts: List[str]) -> List[PreprocessingResult]:
        """
        Process multiple texts in batch.

        Args:
            texts: List of texts to process

        Returns:
            List of PreprocessingResult objects
        """
        return [self.process(text) for text in texts]

    def calculate_retention_rate(self, original: str, processed: str) -> float:
        """
        Calculate the percentage of original text preserved.

        Args:
            original: The original text
            processed: The processed text with placeholders

        Returns:
            Retention rate as a percentage (0-100)
        """
        return self._calculate_retention_rate(original, processed)

    def _calculate_retention_rate(self, original: str, processed: str) -> float:
        """Percentage of *original* characters NOT covered by a term match.

        Bug fix: the previous implementation did
        ``sum(len(term) for term in self.glossary.get_all())``, which
        (a) raised TypeError because GlossaryEntry defines no ``__len__``,
        and (b) counted every glossary term whether or not it occurred in
        the text (it also computed an unused ``placeholder_chars`` value).
        We now measure the characters actually matched in *original*.

        ``processed`` is currently unused but kept for interface stability.
        """
        if not original:
            return 100.0

        matched_chars = sum(
            m.end - m.start for m in self.matcher.find_matches(original)
        )
        return ((len(original) - matched_chars) / len(original)) * 100

+ 9 - 0
src/pipeline/__init__.py

@@ -0,0 +1,9 @@
+"""
+Pipeline module for translation workflow management.
+
+This module provides state machine and pipeline orchestration for the translation process.
+"""
+
+from .state_machine import PipelineStateMachine, PipelineState
+
+__all__ = ["PipelineStateMachine", "PipelineState"]

+ 52 - 0
src/pipeline/models.py

@@ -0,0 +1,52 @@
+"""
+Data models for the pipeline module.
+
+This module defines the data structures used throughout the translation pipeline.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any
+from datetime import datetime
+
+
@dataclass
class PipelineProgress:
    """
    Progress tracking for pipeline execution.

    Attributes:
        current_chapter: Index of the chapter currently being processed.
        total_chapters: Total number of chapters in the work.
        current_stage: Name of the active pipeline stage.
        started_at: When processing began, if it has started.
        estimated_completion: Projected finish time, if known.
    """

    current_chapter: int = 0
    total_chapters: int = 0
    current_stage: str = "idle"
    started_at: Optional[datetime] = None
    estimated_completion: Optional[datetime] = None

    @property
    def progress_percentage(self) -> float:
        """Completion ratio in percent; 0.0 when no chapters are known."""
        if not self.total_chapters:
            return 0.0
        return (self.current_chapter / self.total_chapters) * 100
+
+
@dataclass
class TaskMetadata:
    """Metadata for a translation task.

    Timestamps default to construction time via default_factory, so each
    instance gets its own values.
    """

    work_id: str  # Unique identifier for the work being translated
    file_path: str  # Path to the source file
    file_size: int = 0  # Source file size in bytes — presumably; confirm against writer
    total_chapters: int = 0  # Number of chapters detected in the work
    created_at: datetime = field(default_factory=datetime.now)  # Task creation time
    last_updated: datetime = field(default_factory=datetime.now)  # Last modification time
    extra: Dict[str, Any] = field(default_factory=dict)  # Free-form extension data


@dataclass
class StateSnapshot:
    """Snapshot of the pipeline state for persistence.

    Field layout mirrors the JSON persistence format used by the state
    machine (state name, history entries, progress and metadata dicts).
    """

    work_id: str  # Identifier of the work this snapshot belongs to
    current_state: str  # Serialized state label
    state_history: list  # History entries (dicts with state/entered_at/reason)
    progress: Dict[str, Any]  # Serialized PipelineProgress data
    metadata: Dict[str, Any]  # Serialized TaskMetadata data
    saved_at: datetime = field(default_factory=datetime.now)  # Snapshot time

+ 114 - 0
src/pipeline/state_machine.py

@@ -0,0 +1,114 @@
+"""
+State machine for translation pipeline.
+
+This module implements the core state machine that manages the translation workflow.
+"""
+
+from enum import Enum
+from typing import Optional, List, Dict
+from datetime import datetime
+
+
class PipelineState(Enum):
    """Pipeline states for the translation workflow.

    The enum *name* (e.g. "IDLE") keys the transition table below; the
    *value* is the lowercase label recorded in state-history entries.
    """

    IDLE = "idle"  # No task in progress
    PREPARING = "preparing"  # Parsing input and preparing resources
    CLEANING = "cleaning"  # Text-cleaning stage
    TRANSLATING = "translating"  # Translation stage
    UPLOADING = "uploading"  # Upload stage
    PAUSED = "paused"  # Temporarily halted; resumable
    COMPLETED = "completed"  # Finished successfully
    FAILED = "failed"  # Aborted due to an error


class PipelineStateMachine:
    """
    State machine for managing translation pipeline state.

    This class handles state transitions, maintains state history,
    and provides callbacks for state change events.
    """

    # Valid state transitions, keyed by PipelineState.name, each mapping
    # to the list of PipelineState.name values reachable from that state.
    TRANSITIONS = {
        "IDLE": ["PREPARING"],
        "PREPARING": ["CLEANING", "FAILED", "PAUSED"],
        "CLEANING": ["TRANSLATING", "FAILED", "PAUSED"],
        "TRANSLATING": ["UPLOADING", "FAILED", "PAUSED"],
        "UPLOADING": ["COMPLETED", "FAILED", "PAUSED"],
        "PAUSED": ["IDLE", "PREPARING", "CLEANING", "TRANSLATING", "UPLOADING"],
        "FAILED": ["IDLE"],
        "COMPLETED": ["IDLE"],
    }

    def __init__(self):
        """Initialize the state machine in IDLE and record that entry."""
        self._state = PipelineState.IDLE
        self._state_history: List[Dict] = []
        self._record_state_entry(PipelineState.IDLE)

    @property
    def state(self) -> PipelineState:
        """Get the current state."""
        return self._state

    def transition_to(self, new_state: PipelineState, reason: str = "") -> bool:
        """
        Attempt to transition to a new state.

        Args:
            new_state: The target state to transition to
            reason: Optional reason for the transition, stored in history

        Returns:
            True if transition was successful, False otherwise
        """
        if not self.can_transition_to(new_state):
            return False

        self._state = new_state
        self._record_state_entry(new_state, reason)
        return True

    def can_transition_to(self, new_state: PipelineState) -> bool:
        """
        Check if transition to the given state is valid.

        Args:
            new_state: The target state to check

        Returns:
            True if transition is valid, False otherwise
        """
        # BUG FIX: TRANSITIONS is keyed by enum *names* ("IDLE"), but the
        # previous code looked up the enum *value* ("idle"), so the lookup
        # always returned [] and every transition was rejected.
        valid_transitions = self.TRANSITIONS.get(self._state.name, [])
        return new_state.name in valid_transitions

    def get_current_state(self) -> PipelineState:
        """Return the current state (method form of the ``state`` property)."""
        return self._state

    def get_state_history(self) -> List[Dict]:
        """
        Get a copy of the complete state history.

        Returns:
            List of entries, each with "state", "entered_at" and "reason"
        """
        return self._state_history.copy()

    def _record_state_entry(self, state: PipelineState, reason: str = "") -> None:
        """Append a history record for entering *state*."""
        self._state_history.append(
            {
                "state": state.value,
                "entered_at": datetime.now().isoformat(),
                "reason": reason,
            }
        )

    def reset(self) -> None:
        """Reset to IDLE, clearing history and recording the reset entry."""
        self._state = PipelineState.IDLE
        self._state_history = []
        self._record_state_entry(PipelineState.IDLE, "reset")

+ 9 - 0
src/utils/__init__.py

@@ -0,0 +1,9 @@
+"""
+Utility modules for the translator.
+
+This module contains common utilities used across the application.
+"""
+
+from .persistence import atomic_write, read_json_file, write_json_file
+
+__all__ = ["atomic_write", "read_json_file", "write_json_file"]

+ 101 - 0
src/utils/persistence.py

@@ -0,0 +1,101 @@
+"""
+Persistence utilities for atomic file operations.
+
+This module provides safe file writing operations that prevent data loss
+due to crashes or power failures.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional
+import tempfile
+
+
def atomic_write(file_path: Path, data: str) -> None:
    """
    Atomically replace the contents of *file_path* with *data*.

    The text is written to a sibling temporary file, flushed and fsynced,
    then moved into place with ``os.replace``. Readers therefore observe
    either the old contents or the complete new contents — never a
    partially written file.

    Args:
        file_path: Destination path; missing parent directories are created
        data: The string data to write (UTF-8)
    """
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Temp file lives in the same directory so the final rename stays
    # on one filesystem (a cross-device rename would not be atomic).
    fd, tmp_name = tempfile.mkstemp(
        suffix=".tmp", prefix=file_path.name, dir=file_path.parent
    )

    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            handle.write(data)
            handle.flush()
            # Push bytes to disk before the rename makes them visible.
            os.fsync(handle.fileno())

        os.replace(tmp_name, str(file_path))
    except Exception:
        # Best-effort removal of the orphaned temp file.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
+
+
def write_json_file(file_path: Path, data: Dict[str, Any]) -> None:
    """
    Serialize *data* as pretty-printed UTF-8 JSON and write it atomically.

    Args:
        file_path: The path to write to
        data: The dictionary to write as JSON
    """
    atomic_write(file_path, json.dumps(data, ensure_ascii=False, indent=2))
+
+
def read_json_file(file_path: Path) -> Optional[Dict[str, Any]]:
    """
    Load JSON data from *file_path*.

    Args:
        file_path: The path to read from

    Returns:
        The parsed data, or None when the file is missing, unreadable,
        or does not contain valid JSON
    """
    try:
        with file_path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    except (json.JSONDecodeError, OSError):
        # FileNotFoundError is an OSError, so a missing file also lands here.
        return None
+
+
def cleanup_temp_files(directory: Path) -> int:
    """
    Delete leftover "*.tmp" files in *directory*.

    Args:
        directory: The directory to clean

    Returns:
        Number of files removed; files that fail to delete are skipped
    """
    removed = 0
    for leftover in directory.glob("*.tmp"):
        try:
            leftover.unlink()
        except OSError:
            continue
        removed += 1
    return removed

+ 322 - 0
stories/epic-1.1-state-machine.md

@@ -0,0 +1,322 @@
+# Epic 1.1: State Machine 状态管理
+
+**优先级**: P0 (Phase 1a 核心功能)
+**估算**: 18 故事点
+**依赖**: 无
+
+---
+
+## Epic 目标
+
+实现 Pipeline 状态机,支持状态转换、持久化和恢复,确保翻译任务在各种异常情况下能够正确管理和恢复。
+
+---
+
+## 用户价值
+
+**As a** 系统,
+**I want** 使用状态机管理翻译任务的生命周期,
+**So that** 可以追踪任务状态并支持状态转换验证。
+
+---
+
+## 技术栈
+
+- **状态机库**: `transitions==0.9.0`
+- **测试框架**: `pytest==7.4.0`
+- **代码覆盖率**: `pytest-cov==4.1.0`
+
+---
+
+## Story 列表
+
+### Story 1.1.1: 定义 PipelineState 枚举和转换规则
+
+**估算**: 3 SP
+
+**描述**: 定义任务的所有可能状态以及状态之间的合法转换路径。
+
+**验收标准**:
+
+```python
+# 状态定义
+class PipelineState(Enum):
+    IDLE = "idle"           # 初始状态,任务未开始
+    PREPARING = "preparing" # 准备阶段(文件解析、术语提取)
+    CLEANING = "cleaning"   # 清洗阶段
+    TRANSLATING = "translating"  # 翻译阶段
+    UPLOADING = "uploading"      # 上传阶段
+    PAUSED = "paused"       # 暂停状态
+    COMPLETED = "completed" # 完成状态
+    FAILED = "failed"       # 失败状态
+
+# 合法转换路径
+TRANSITIONS = {
+    'IDLE': ['PREPARING'],
+    'PREPARING': ['CLEANING', 'FAILED', 'PAUSED'],
+    'CLEANING': ['TRANSLATING', 'FAILED', 'PAUSED'],
+    'TRANSLATING': ['UPLOADING', 'FAILED', 'PAUSED'],
+    'UPLOADING': ['COMPLETED', 'FAILED', 'PAUSED'],
+    'PAUSED': ['IDLE', 'PREPARING', 'CLEANING', 'TRANSLATING', 'UPLOADING'],
+    'FAILED': ['IDLE'],
+    'COMPLETED': ['IDLE']
+}
+```
+
+**技术任务**:
+1. 创建 `src/pipeline/state_machine.py`
+2. 定义 `PipelineState` 枚举
+3. 定义转换规则字典
+4. 编写单元测试验证状态定义
+
+---
+
+### Story 1.1.2: 实现状态转换引擎
+
+**估算**: 5 SP
+
+**描述**: 使用 `transitions` 库实现状态机引擎,支持状态转换和回调。
+
+**验收标准**:
+
+```python
+class PipelineStateMachine:
+    def __init__(self):
+        self.machine = Machine(...)
+        self.state = PipelineState.IDLE
+        self.state_history = []
+
+    def transition_to(self, new_state: PipelineState) -> bool:
+        """尝试转换到新状态"""
+        pass
+
+    def can_transition_to(self, new_state: PipelineState) -> bool:
+        """检查是否可以转换到新状态"""
+        pass
+
+    def get_current_state(self) -> PipelineState:
+        """获取当前状态"""
+        pass
+
+    def get_state_history(self) -> List[Dict]:
+        """获取状态历史记录"""
+        pass
+```
+
+**回调机制**:
+- `on_enter_PREPARING()`: 进入准备阶段时的回调
+- `on_exit_PREPARING()`: 退出准备阶段时的回调
+- 每个状态转换都记录时间戳和原因
+
+**技术任务**:
+1. 集成 `transitions` 库
+2. 实现状态转换逻辑
+3. 实现回调机制
+4. 编写单元测试验证所有转换路径
+
+---
+
+### Story 1.1.3: 实现状态持久化
+
+**估算**: 4 SP
+
+**描述**: 将状态机状态持久化到文件,支持崩溃后恢复。
+
+**验收标准**:
+
+```python
+# 持久化格式
+{
+    "work_id": "abc123",
+    "current_state": "TRANSLATING",
+    "state_history": [
+        {"state": "IDLE", "entered_at": "2026-03-15T10:00:00"},
+        {"state": "PREPARING", "entered_at": "2026-03-15T10:00:05"},
+        {"state": "CLEANING", "entered_at": "2026-03-15T10:01:00"},
+        {"state": "TRANSLATING", "entered_at": "2026-03-15T10:05:00"}
+    ],
+    "progress": {
+        "current_chapter": 15,
+        "total_chapters": 100
+    },
+    "metadata": {
+        "file_path": "/path/to/novel.txt",
+        "last_updated": "2026-03-15T10:30:00"
+    }
+}
+
+class StatePersistence:
+    def save_state(self, work_id: str, machine: PipelineStateMachine) -> None:
+        """保存状态到文件"""
+        pass
+
+    def load_state(self, work_id: str) -> Optional[Dict]:
+        """从文件加载状态"""
+        pass
+
+    def get_state_file_path(self, work_id: str) -> Path:
+        """获取状态文件路径"""
+        pass
+```
+
+**技术任务**:
+1. 创建 `src/utils/persistence.py`
+2. 实现状态序列化(JSON 格式)
+3. 实现状态反序列化
+4. 使用原子写入(.tmp + rename)确保数据安全
+5. 编写测试验证持久化功能
+
+---
+
+### Story 1.1.4: 实现状态恢复和验证
+
+**估算**: 3 SP
+
+**描述**: 从持久化状态恢复状态机,并验证状态一致性。
+
+**验收标准**:
+
+```python
+class StateRecovery:
+    def recover_state_machine(self, work_id: str) -> Optional[PipelineStateMachine]:
+        """恢复状态机"""
+        pass
+
+    def validate_state(self, state_data: Dict) -> bool:
+        """验证状态数据完整性"""
+        pass
+
+    def get_resume_point(self, state_data: Dict) -> Optional[str]:
+        """获取恢复点(应该从哪个阶段继续)"""
+        pass
+```
+
+**验证规则**:
+1. 状态文件格式正确
+2. 当前状态是有效状态
+3. 进度数据完整(章节索引在有效范围内)
+4. 文件路径存在
+
+**技术任务**:
+1. 实现状态恢复逻辑
+2. 实现状态验证规则
+3. 处理损坏的状态文件
+4. 编写测试验证恢复逻辑
+
+---
+
+### Story 1.1.5: 单元测试覆盖所有转换路径
+
+**估算**: 3 SP
+
+**描述**: 编写完整的单元测试,覆盖所有状态转换路径。
+
+**验收标准**:
+
+- 代码覆盖率 >= 90%
+- 所有状态转换路径测试
+- 边界条件测试
+- 异常情况测试
+
+**测试用例**:
+
+```python
+class TestPipelineStateMachine:
+    def test_initial_state_is_idle(self):
+        pass
+
+    def test_valid_transitions(self):
+        """测试所有合法转换"""
+        pass
+
+    def test_invalid_transitions_rejected(self):
+        """测试非法转换被拒绝"""
+        pass
+
+    def test_state_from_idle_to_translating(self):
+        """测试完整流程"""
+        pass
+
+    def test_pause_from_any_state(self):
+        """测试从任何状态暂停"""
+        pass
+
+    def test_resume_from_pause(self):
+        """测试从暂停恢复"""
+        pass
+
+    def test_failed_state_only_goes_to_idle(self):
+        """测试失败状态只能回到空闲"""
+        pass
+
+    def test_state_history_tracking(self):
+        """测试状态历史记录"""
+        pass
+
+class TestStatePersistence:
+    def test_save_and_load_state(self):
+        pass
+
+    def test_atomic_write(self):
+        pass
+
+    def test_corrupted_state_handling(self):
+        pass
+
+class TestStateRecovery:
+    def test_recover_to_last_state(self):
+        pass
+
+    def test_recover_with_missing_file(self):
+        pass
+
+    def test_recover_with_corrupted_data(self):
+        pass
+```
+
+**技术任务**:
+1. 创建 `tests/test_state_machine.py`
+2. 实现所有测试用例
+3. 运行覆盖率报告
+4. 确保覆盖率 >= 90%
+
+---
+
+## 文件结构
+
+```
+src/
+└── pipeline/
+    ├── __init__.py
+    ├── state_machine.py      # PipelineStateMachine 类
+    └── models.py              # PipelineState 枚举
+
+src/utils/
+└── persistence.py             # StatePersistence 类
+
+tests/
+└── test_state_machine.py      # 所有状态机测试
+```
+
+---
+
+## 依赖关系
+
+- Epic 1.1 无外部依赖,可独立开发
+- Epic 1.2 (Crash-Safe) 依赖 Epic 1.1 的状态持久化功能
+- Epic 7a (任务调度) 将使用 Epic 1.1 的状态机
+
+---
+
+## 完成标准
+
+- [ ] 所有 5 个 Story 完成
+- [ ] 单元测试覆盖率 >= 90%
+- [ ] 所有验收标准通过
+- [ ] 代码审查通过
+
+---
+
+## 下一步
+
+完成 Epic 1.1 后,开始 Epic 1.2 (Crash-Safe 机制) 开发。

+ 477 - 0
stories/epic-4-glossary.md

@@ -0,0 +1,477 @@
+# Epic 4: 术语提取与替换 (P0 优先级)
+
+**优先级**: **P0** (Phase 0 验证确认术语表对翻译质量至关重要)
+**估算**: 26 故事点 (Phase 1 范围)
+**依赖**: 无
+
+---
+
+## Epic 目标
+
+实现术语表功能,确保翻译过程中角色名和专有术语保持一致,保证翻译可用性。
+
+---
+
+## 为什么是 P0?
+
+**Phase 0 技术验证发现**:
+
+| 场景 | 原文 | 无术语表 | 有术语表 |
+|-----|------|---------|---------|
+| 角色名 | 林风 | Lin wind ❌ | Lin Feng ✅ |
+| 专有名词 | BMAD | BMAd ❌ | BMAD ✅ |
+| 技能名 | 火球术 | fire ball ❌ | Fireball ✅ |
+
+**结论**: 没有术语表功能,翻译内容**不可用**。术语表是保证翻译质量的核心功能。
+
+---
+
+## 用户价值
+
+**As a** 翻译用户,
+**I want** 定义和使用术语表,
+**So that** 翻译后的内容中角色名和专有术语保持一致。
+
+---
+
+## 技术栈
+
+- **数据结构**: `Dict[str, str]` (术语 → 翻译)
+- **匹配算法**: 最长匹配(按长度降序)
+- **占位符**: `__en__` 前缀标记
+- **测试框架**: `pytest==7.4.0`
+
+---
+
+## Phase 1 Story 列表 (核心功能)
+
+### Story 4.1: 设计术语表数据结构
+
+**估算**: 4 SP
+
+**描述**: 设计术语表数据结构,支持术语和翻译的存储。
+
+**验收标准**:
+
+```python
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+
+@dataclass
+class GlossaryEntry:
+    """术语表条目"""
+    source: str      # 原文术语,如 "林风"
+    target: str      # 目标翻译,如 "Lin Feng"
+    category: str    # 术语类型:CHARACTER, SKILL, LOCATION, ITEM, OTHER
+    context: str = ""  # 上下文说明
+
+class Glossary:
+    """术语表"""
+
+    def __init__(self):
+        self._terms: Dict[str, GlossaryEntry] = {}
+
+    def add(self, entry: GlossaryEntry) -> None:
+        """添加术语"""
+        pass
+
+    def get(self, source: str) -> Optional[GlossaryEntry]:
+        """获取术语翻译"""
+        pass
+
+    def remove(self, source: str) -> bool:
+        """删除术语"""
+        pass
+
+    def get_all(self) -> List[GlossaryEntry]:
+        """获取所有术语"""
+        pass
+
+    def sort_by_length_desc(self) -> List[str]:
+        """按长度降序排列术语(用于匹配)"""
+        pass
+```
+
+**技术任务**:
+1. 创建 `src/glossary/models.py`
+2. 定义 `GlossaryEntry` 数据类
+3. 实现 `Glossary` 类
+4. 编写单元测试
+
+---
+
+### Story 4.2: 实现术语匹配引擎
+
+**估算**: 6 SP
+
+**描述**: 实现最长匹配算法,确保长术语优先匹配(避免"魔法"覆盖"魔法师")。
+
+**验收标准**:
+
+```python
+class GlossaryMatcher:
+    """术语匹配引擎"""
+
+    def __init__(self, glossary: Glossary):
+        self.glossary = glossary
+        # 按长度降序排列,确保长术语优先匹配
+        self._sorted_terms = glossary.sort_by_length_desc()
+
+    def find_matches(self, text: str) -> List[TermMatch]:
+        """在文本中查找所有术语匹配"""
+        pass
+
+    def replace_with_placeholder(self, text: str) -> Tuple[str, Dict[str, str]]:
+        """将术语替换为占位符
+
+        返回: (替换后的文本, 占位符映射)
+        占位符格式: __en__林风
+        """
+        pass
+
+    def restore_from_placeholder(self, text: str, mapping: Dict[str, str]) -> str:
+        """将占位符还原为术语翻译"""
+        pass
+
+@dataclass
+class TermMatch:
+    """术语匹配结果"""
+    source: str        # 原文术语
+    target: str        # 目标翻译
+    start: int         # 在文本中的起始位置
+    end: int           # 在文本中的结束位置
+    placeholder: str   # 占位符
+```
+
+**匹配规则**:
+1. 按术语长度降序匹配(长术语优先)
+2. 不重叠匹配(已匹配位置不再匹配)
+3. 区分大小写
+4. 支持多词术语(如"火球术"、"三阶魔法师")
+
+**示例**:
+```python
+# 输入
+text = "林风释放了火球术"
+glossary = {
+    "林风": "Lin Feng",
+    "火球术": "Fireball"
+}
+
+# 输出
+processed = "__en__林风释放了__en__火球术"
+mapping = {
+    "__en__林风": "Lin Feng",
+    "__en__火球术": "Fireball"
+}
+```
+
+**技术任务**:
+1. 创建 `src/glossary/matcher.py`
+2. 实现最长匹配算法
+3. 实现占位符替换
+4. 编写单元测试
+
+---
+
+### Story 4.3: 实现术语预处理管道
+
+**估算**: 5 SP
+
+**描述**: 在翻译前处理文本,将术语替换为占位符。
+
+**验收标准**:
+
+```python
+class GlossaryPreprocessor:
+    """术语预处理管道"""
+
+    def __init__(self, glossary: Glossary):
+        self.glossary = glossary
+        self.matcher = GlossaryMatcher(glossary)
+
+    def process(self, text: str) -> PreprocessingResult:
+        """处理文本,替换术语为占位符
+
+        返回包含:
+        - processed_text: 处理后的文本
+        - placeholder_map: 占位符映射
+        - term_stats: 术语统计
+        """
+        pass
+
+    def process_batch(self, texts: List[str]) -> List[PreprocessingResult]:
+        """批量处理文本"""
+        pass
+
+    def calculate_retention_rate(self, original: str, processed: str) -> float:
+        """计算术语保留率"""
+        pass
+
+@dataclass
+class PreprocessingResult:
+    """预处理结果"""
+    processed_text: str
+    placeholder_map: Dict[str, str]
+    terms_found: Dict[str, int]  # 术语 → 出现次数
+    retention_rate: float        # 保留率百分比
+```
+
+**处理流程**:
+1. 加载术语表
+2. 初始化匹配引擎
+3. 查找所有术语匹配
+4. 替换为占位符(`__en__`前缀)
+5. 生成占位符映射
+6. 计算保留率
+
+**技术任务**:
+1. 创建 `src/glossary/preprocessor.py`
+2. 实现预处理管道
+3. 实现批量处理
+4. 实现保留率计算
+5. 编写单元测试
+
+---
+
+### Story 4.4: 实现后处理模块
+
+**估算**: 6 SP
+
+**描述**: 翻译后处理,去除 `__en__` 前缀并还原术语翻译。
+
+**验收标准**:
+
+```python
+class GlossaryPostprocessor:
+    """术语后处理模块"""
+
+    def __init__(self):
+        pass
+
+    def process(self, translated_text: str, placeholder_map: Dict[str, str]) -> str:
+        """处理翻译后的文本
+
+        步骤:
+        1. 查找所有 __en__ 前缀的占位符
+        2. 从映射表中获取翻译
+        3. 替换占位符为翻译
+        4. 修复可能出现的标点问题
+        """
+        pass
+
+    def fix_punctuation(self, text: str) -> str:
+        """修复标点符号
+
+        处理翻译可能产生的标点问题:
+        - __en__林风. → Lin Feng. (去除多余空格)
+        - __en__林风, → Lin Feng, (修复中文标点)
+        """
+        pass
+
+    def validate_translation(self, original: str, translated: str,
+                            placeholder_map: Dict[str, str]) -> ValidationResult:
+        """验证翻译完整性
+
+        检查:
+        - 所有占位符都被替换
+        - 翻译包含所有术语
+        - 没有遗漏的术语
+        """
+        pass
+
+@dataclass
+class ValidationResult:
+    """验证结果"""
+    is_valid: bool
+    missing_terms: List[str]     # 遗漏的术语
+    extra_placeholders: List[str] # 未替换的占位符
+```
+
+**处理流程**:
+1. 查找所有 `__en__` 前缀
+2. 从映射表获取翻译
+3. 替换占位符
+4. 修复标点问题
+5. 验证完整性
+
+**技术任务**:
+1. 创建 `src/glossary/postprocessor.py`
+2. 实现占位符还原
+3. 实现标点修复
+4. 实现翻译验证
+5. 编写单元测试
+
+---
+
+### Story 4.6: 单元测试 + 集成测试
+
+**估算**: 5 SP
+
+**描述**: 完整的测试覆盖,包括单元测试和端到端集成测试。
+
+**验收标准**:
+
+- 代码覆盖率 >= 90%
+- 所有边界条件测试
+- 端到端集成测试
+
+**测试用例**:
+
+```python
+class TestGlossary:
+    def test_add_and_retrieve_term(self):
+        pass
+
+    def test_remove_term(self):
+        pass
+
+    def test_sort_by_length_desc(self):
+        """测试长术语排在前面"""
+        pass
+
+class TestGlossaryMatcher:
+    def test_find_single_term(self):
+        pass
+
+    def test_longest_term_priority(self):
+        """测试长术语优先匹配"""
+        text = "魔法师使用了魔法"
+        glossary = {"魔法": "Magic", "魔法师": "Mage"}
+        # 应该匹配 "魔法师" 而不是 "魔法"
+        pass
+
+    def test_non_overlapping_matches(self):
+        pass
+
+    def test_placeholder_generation(self):
+        pass
+
+class TestGlossaryPreprocessor:
+    def test_process_text_with_terms(self):
+        pass
+
+    def test_retention_rate_calculation(self):
+        pass
+
+    def test_batch_processing(self):
+        pass
+
+class TestGlossaryPostprocessor:
+    def test_restore_from_placeholder(self):
+        pass
+
+    def test_fix_punctuation(self):
+        pass
+
+    def test_validate_translation_success(self):
+        pass
+
+    def test_validate_translation_missing_terms(self):
+        pass
+
+class TestGlossaryIntegration:
+    """端到端集成测试"""
+
+    def test_full_pipeline(self):
+        """测试完整流程"""
+        # 1. 创建术语表
+        # 2. 预处理文本
+        # 3. 模拟翻译
+        # 4. 后处理文本
+        # 5. 验证结果
+        original = "林风释放了火球术"
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", "CHARACTER"))
+        glossary.add(GlossaryEntry("火球术", "Fireball", "SKILL"))
+
+        preprocessor = GlossaryPreprocessor(glossary)
+        result = preprocessor.process(original)
+
+        # 模拟翻译(保留占位符)
+        mock_translated = "__en__林风 released __en__火球术"
+
+        postprocessor = GlossaryPostprocessor()
+        final = postprocessor.process(mock_translated, result.placeholder_map)
+
+        assert final == "Lin Feng released Fireball"
+        pass
+
+    def test_phase_0_validation_scenario(self):
+        """测试 Phase 0 验证场景"""
+        # 无术语表: "林风" → "Lin wind"
+        # 有术语表: "林风" → "Lin Feng"
+        pass
+```
+
+**技术任务**:
+1. 创建 `tests/test_glossary.py`
+2. 实现所有单元测试
+3. 实现集成测试
+4. 运行覆盖率报告
+5. 确保覆盖率 >= 90%
+
+---
+
+## Phase 2 Story (推迟)
+
+### Story 4.5: 实现上下文标注
+
+**估算**: 5 SP
+**状态**: 推迟到 Phase 2
+
+**描述**: 为术语标注上下文,帮助用户确定合适的翻译。
+
+---
+
+## 文件结构
+
+```
+src/
+└── glossary/
+    ├── __init__.py
+    ├── models.py           # GlossaryEntry, Glossary 类
+    ├── matcher.py          # GlossaryMatcher 类
+    ├── preprocessor.py     # GlossaryPreprocessor 类
+    └── postprocessor.py    # GlossaryPostprocessor 类
+
+tests/
+└── test_glossary.py        # 所有术语表测试
+```
+
+---
+
+## Phase 0 验证数据
+
+| 测试场景 | 原文 | 无术语表结果 | 有术语表结果 |
+|---------|------|------------|------------|
+| 角色名翻译 | 林风 | Lin wind ❌ | Lin Feng ✅ |
+| 产品名称 | BMAD | BMAd ❌ | BMAD ✅ |
+| 技能名称 | 火球术 | fire ball ❌ | Fireball ✅ |
+| 保留率测试 | 14个术语 | 0% | 93.4% ✅ |
+
+**结论**: 术语表功能是**必须的**,没有它翻译内容不可用。
+
+---
+
+## 依赖关系
+
+- Epic 4 无外部依赖,可独立开发
+- Epic 5 (翻译模块) 将使用 Epic 4 的预处理和后处理功能
+- 可与 Epic 1 部分并行开发
+
+---
+
+## 完成标准
+
+- [ ] 所有 5 个核心 Story 完成
+- [ ] 单元测试覆盖率 >= 90%
+- [ ] 集成测试通过
+- [ ] Phase 0 验证场景测试通过
+- [ ] 代码审查通过
+
+---
+
+## 下一步
+
+完成 Epic 4 核心功能后,与 Epic 1 集成,开始端到端测试。

+ 3 - 0
tests/__init__.py

@@ -0,0 +1,3 @@
+"""
+Test suite for 223-236-template-6.
+"""

+ 247 - 0
tests/test_glossary.py

@@ -0,0 +1,247 @@
+"""
+Unit tests for the glossary module.
+
+Tests cover terminology matching, preprocessing, postprocessing,
+and integration scenarios.
+"""
+
+import pytest
+
+from src.glossary.models import Glossary, GlossaryEntry, TermCategory
+from src.glossary.matcher import GlossaryMatcher, TermMatch
+from src.glossary.preprocessor import GlossaryPreprocessor
+from src.glossary.postprocessor import GlossaryPostprocessor
+
+
+class TestGlossary:
+    """Test cases for Glossary class."""
+
+    def test_add_and_retrieve_term(self):
+        """Test adding and retrieving a term."""
+        glossary = Glossary()
+        entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
+
+        glossary.add(entry)
+        retrieved = glossary.get("林风")
+
+        assert retrieved is not None
+        assert retrieved.source == "林风"
+        assert retrieved.target == "Lin Feng"
+        assert retrieved.category == TermCategory.CHARACTER
+
+    def test_remove_term(self):
+        """Test removing a term."""
+        glossary = Glossary()
+        entry = GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER)
+        glossary.add(entry)
+
+        assert glossary.remove("林风") is True
+        assert glossary.get("林风") is None
+        assert glossary.remove("林风") is False
+
+    def test_sort_by_length_desc(self):
+        """Test sorting terms by length (longest first)."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
+        glossary.add(GlossaryEntry("三阶魔法师", "Tier 3 Mage", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        sorted_terms = glossary.sort_by_length_desc()
+        assert sorted_terms[0] == "三阶魔法师"  # 5 chars
+        assert sorted_terms[1] == "火球术"  # 3 chars
+        assert sorted_terms[2] == "林风"  # 2 chars
+
+    def test_get_all(self):
+        """Test getting all terms."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
+
+        all_terms = glossary.get_all()
+        assert len(all_terms) == 2
+
+    def test_contains_operator(self):
+        """Test the 'in' operator."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        assert "林风" in glossary
+        assert "火球术" not in glossary
+
+
+class TestGlossaryMatcher:
+    """Test cases for GlossaryMatcher."""
+
+    def test_find_single_term(self):
+        """Test finding a single term in text."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        matcher = GlossaryMatcher(glossary)
+        matches = matcher.find_matches("林风释放了火球术")
+
+        assert len(matches) == 1
+        assert matches[0].source == "林风"
+        assert matches[0].target == "Lin Feng"
+        assert matches[0].start == 0
+        assert matches[0].end == 2
+
+    def test_longest_term_priority(self):
+        """Test that longer terms are matched first."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("魔法", "Magic", TermCategory.OTHER))
+        glossary.add(GlossaryEntry("魔法师", "Mage", TermCategory.CHARACTER))
+
+        matcher = GlossaryMatcher(glossary)
+        matches = matcher.find_matches("魔法师使用了魔法")
+
+        # Should match "魔法师" but not the "魔法" within it
+        assert len(matches) == 2
+        assert matches[0].source == "魔法师"
+        assert matches[1].source == "魔法"
+
+    def test_placeholder_generation(self):
+        """Test placeholder generation."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        matcher = GlossaryMatcher(glossary)
+        processed, mapping = matcher.replace_with_placeholder("林风来了")
+
+        assert processed == "__en__林风来了"
+        assert mapping == {"__en__林风": "Lin Feng"}
+
+    def test_non_overlapping_matches(self):
+        """Test that matches don't overlap."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("林", "Lin", TermCategory.CHARACTER))
+
+        matcher = GlossaryMatcher(glossary)
+        matches = matcher.find_matches("林风走了")
+
+        # Should only match "林风", not "林" within it
+        assert len(matches) == 1
+        assert matches[0].source == "林风"
+
+
+class TestGlossaryPreprocessor:
+    """Test cases for GlossaryPreprocessor."""
+
+    def test_process_text_with_terms(self):
+        """Test processing text with terminology."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
+
+        preprocessor = GlossaryPreprocessor(glossary)
+        result = preprocessor.process("林风释放了火球术")
+
+        assert result.processed_text == "__en__林风释放了__en__火球术"
+        assert result.terms_found["林风"] == 1
+        assert result.terms_found["火球术"] == 1
+
+    def test_batch_processing(self):
+        """Test batch processing of multiple texts."""
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+
+        preprocessor = GlossaryPreprocessor(glossary)
+        texts = ["林风来了", "林风走了"]
+        results = preprocessor.process_batch(texts)
+
+        assert len(results) == 2
+        assert "__en__林风" in results[0].processed_text
+        assert "__en__林风" in results[1].processed_text
+
+
+class TestGlossaryPostprocessor:
+    """Test cases for GlossaryPostprocessor."""
+
+    def test_restore_from_placeholder(self):
+        """Test restoring placeholders to translations."""
+        postprocessor = GlossaryPostprocessor()
+        mapping = {"__en__林风": "Lin Feng", "__en__火球术": "Fireball"}
+
+        result = postprocessor.restore_from_placeholder("__en__林风 released __en__火球术", mapping)
+
+        assert result == "Lin Feng released Fireball"
+
+    def test_fix_punctuation(self):
+        """Test punctuation fixing."""
+        postprocessor = GlossaryPostprocessor()
+
+        # Remove space before punctuation
+        assert postprocessor.fix_punctuation("Lin Feng .") == "Lin Feng."
+        # Fix Chinese comma after English
+        assert postprocessor.fix_punctuation("Lin Feng,走了") == "Lin Feng, 走了"
+
+    def test_validate_translation_success(self):
+        """Test successful validation."""
+        postprocessor = GlossaryPostprocessor()
+        mapping = {"__en__林风": "Lin Feng"}
+
+        result = postprocessor.validate_translation("林风来了", "Lin Feng came", mapping)
+
+        assert result.is_valid is True
+        assert len(result.missing_terms) == 0
+
+    def test_validate_translation_missing_terms(self):
+        """Test validation with missing terms."""
+        postprocessor = GlossaryPostprocessor()
+        mapping = {"__en__林风": "Lin Feng"}
+
+        result = postprocessor.validate_translation("林风来了", "Lin came", mapping)
+
+        assert result.is_valid is False
+
+
+class TestGlossaryIntegration:
+    """Integration tests for the glossary module."""
+
+    def test_full_pipeline(self):
+        """Test complete preprocessing and postprocessing pipeline."""
+        # Setup glossary
+        glossary = Glossary()
+        glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
+
+        # Preprocess
+        preprocessor = GlossaryPreprocessor(glossary)
+        original = "林风释放了火球术"
+        pre_result = preprocessor.process(original)
+
+        assert pre_result.processed_text == "__en__林风释放了__en__火球术"
+
+        # Simulate translation
+        mock_translated = "__en__林风 released __en__火球术"
+
+        # Postprocess
+        postprocessor = GlossaryPostprocessor()
+        final = postprocessor.process(mock_translated, pre_result.placeholder_map)
+
+        assert final == "Lin Feng released Fireball"
+
+    def test_phase_0_validation_scenario(self):
+        """Test the Phase 0 validation scenario."""
+        # Without glossary (simulated by empty glossary)
+        empty_glossary = Glossary()
+        preprocessor = GlossaryPreprocessor(empty_glossary)
+        result = preprocessor.process("林风释放了火球术")
+
+        # No placeholders added
+        assert result.placeholder_map == {}
+        assert result.terms_found == {}
+
+        # With glossary
+        full_glossary = Glossary()
+        full_glossary.add(GlossaryEntry("林风", "Lin Feng", TermCategory.CHARACTER))
+        full_glossary.add(GlossaryEntry("火球术", "Fireball", TermCategory.SKILL))
+
+        preprocessor = GlossaryPreprocessor(full_glossary)
+        result = preprocessor.process("林风释放了火球术")
+
+        # Placeholders added
+        assert len(result.placeholder_map) == 2
+        assert result.terms_found["林风"] == 1
+        assert result.terms_found["火球术"] == 1

+ 183 - 0
tests/test_state_machine.py

@@ -0,0 +1,183 @@
+"""
+Unit tests for the state machine module.
+
+Tests cover all state transitions, persistence, and recovery scenarios.
+"""
+
+import pytest
+from datetime import datetime
+
+from src.pipeline.state_machine import PipelineStateMachine, PipelineState
+
+
+class TestPipelineStateMachine:
+    """Test cases for PipelineStateMachine."""
+
+    def test_initial_state_is_idle(self):
+        """Test that the machine starts in IDLE state."""
+        machine = PipelineStateMachine()
+        assert machine.state == PipelineState.IDLE
+
+    def test_valid_transition_idle_to_preparing(self):
+        """Test valid transition from IDLE to PREPARING."""
+        machine = PipelineStateMachine()
+        assert machine.transition_to(PipelineState.PREPARING)
+        assert machine.state == PipelineState.PREPARING
+
+    def test_valid_transition_preparing_to_cleaning(self):
+        """Test valid transition from PREPARING to CLEANING."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING)
+        assert machine.transition_to(PipelineState.CLEANING)
+        assert machine.state == PipelineState.CLEANING
+
+    def test_valid_transition_cleaning_to_translating(self):
+        """Test valid transition from CLEANING to TRANSLATING."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING)
+        machine.transition_to(PipelineState.CLEANING)
+        assert machine.transition_to(PipelineState.TRANSLATING)
+        assert machine.state == PipelineState.TRANSLATING
+
+    def test_valid_transition_translating_to_uploading(self):
+        """Test valid transition from TRANSLATING to UPLOADING."""
+        machine = PipelineStateMachine()
+        for state in [
+            PipelineState.PREPARING,
+            PipelineState.CLEANING,
+            PipelineState.TRANSLATING,
+        ]:
+            machine.transition_to(state)
+        assert machine.transition_to(PipelineState.UPLOADING)
+        assert machine.state == PipelineState.UPLOADING
+
+    def test_valid_transition_uploading_to_completed(self):
+        """Test valid transition from UPLOADING to COMPLETED."""
+        machine = PipelineStateMachine()
+        for state in [
+            PipelineState.PREPARING,
+            PipelineState.CLEANING,
+            PipelineState.TRANSLATING,
+            PipelineState.UPLOADING,
+        ]:
+            machine.transition_to(state)
+        assert machine.transition_to(PipelineState.COMPLETED)
+        assert machine.state == PipelineState.COMPLETED
+
+    def test_invalid_transition_rejected(self):
+        """Test that invalid transitions are rejected."""
+        machine = PipelineStateMachine()
+        # Can't skip from IDLE to TRANSLATING
+        assert not machine.transition_to(PipelineState.TRANSLATING)
+        assert machine.state == PipelineState.IDLE
+
+    def test_pause_from_any_active_state(self):
+        """Test pausing from any active state."""
+        active_states = [
+            PipelineState.PREPARING,
+            PipelineState.CLEANING,
+            PipelineState.TRANSLATING,
+            PipelineState.UPLOADING,
+        ]
+
+        for state in active_states:
+            machine = PipelineStateMachine()
+            machine.transition_to(PipelineState.PREPARING)
+            machine.transition_to(state)
+            assert machine.transition_to(PipelineState.PAUSED)
+            assert machine.state == PipelineState.PAUSED
+
+    def test_resume_from_pause(self):
+        """Test resuming from PAUSED back to active state."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING)
+        machine.transition_to(PipelineState.TRANSLATING)
+        machine.transition_to(PipelineState.PAUSED)
+
+        # Can resume to TRANSLATING
+        assert machine.transition_to(PipelineState.TRANSLATING)
+        assert machine.state == PipelineState.TRANSLATING
+
+    def test_failed_state_only_goes_to_idle(self):
+        """Test that FAILED state can only transition to IDLE."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING)
+        machine.transition_to(PipelineState.FAILED)
+
+        # Can go to IDLE
+        assert machine.transition_to(PipelineState.IDLE)
+        assert machine.state == PipelineState.IDLE
+
+        # Can't go directly to another state
+        machine.transition_to(PipelineState.PREPARING)
+        machine.transition_to(PipelineState.FAILED)
+        assert not machine.transition_to(PipelineState.TRANSLATING)
+
+    def test_completed_goes_to_idle(self):
+        """Test that COMPLETED transitions to IDLE."""
+        machine = PipelineStateMachine()
+        for state in [
+            PipelineState.PREPARING,
+            PipelineState.CLEANING,
+            PipelineState.TRANSLATING,
+            PipelineState.UPLOADING,
+            PipelineState.COMPLETED,
+        ]:
+            machine.transition_to(state)
+        assert machine.transition_to(PipelineState.IDLE)
+        assert machine.state == PipelineState.IDLE
+
+    def test_state_history_tracking(self):
+        """Test that state history is tracked."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING, reason="Starting task")
+        machine.transition_to(PipelineState.CLEANING)
+
+        history = machine.get_state_history()
+        assert len(history) == 3  # IDLE + PREPARING + CLEANING
+        assert history[0]["state"] == "idle"
+        assert history[1]["state"] == "preparing"
+        assert history[1]["reason"] == "Starting task"
+        assert history[2]["state"] == "cleaning"
+
+    def test_can_transition_to_check(self):
+        """Test the can_transition_to method."""
+        machine = PipelineStateMachine()
+
+        assert machine.can_transition_to(PipelineState.PREPARING)
+        assert not machine.can_transition_to(PipelineState.TRANSLATING)
+
+        machine.transition_to(PipelineState.PREPARING)
+        assert machine.can_transition_to(PipelineState.CLEANING)
+        assert not machine.can_transition_to(PipelineState.IDLE)
+
+    def test_reset(self):
+        """Test resetting the state machine."""
+        machine = PipelineStateMachine()
+        machine.transition_to(PipelineState.PREPARING)
+        machine.transition_to(PipelineState.CLEANING)
+
+        machine.reset()
+
+        assert machine.state == PipelineState.IDLE
+        assert len(machine.get_state_history()) == 1
+
+    def test_full_workflow(self):
+        """Test a complete workflow from start to finish."""
+        machine = PipelineStateMachine()
+
+        workflow = [
+            PipelineState.PREPARING,
+            PipelineState.CLEANING,
+            PipelineState.TRANSLATING,
+            PipelineState.UPLOADING,
+            PipelineState.COMPLETED,
+        ]
+
+        for state in workflow:
+            assert machine.transition_to(state), f"Failed to transition to {state}"
+            assert machine.state == state
+
+        # Should be able to start a new task
+        assert machine.transition_to(PipelineState.IDLE)
+        assert machine.transition_to(PipelineState.PREPARING)