浏览代码

feat(scheduler): Implement Pipeline task scheduler (Epic 7a)

- Add PipelineScheduler integrating all completed modules (Epic 1-5)
- Add TaskQueue for thread-safe chapter task management
- Add RetryManager with exponential backoff for failed tasks
- Add ProgressNotifier with Observer pattern for progress tracking
- Add RecoveryManager for crash-safe checkpoint/recovery
- Add data models: ChapterTask, PipelineProgress, CheckpointData

Features:
- Full pipeline integration (fingerprint → cleaning → translation)
- Pause/resume/stop controls
- Automatic retry with configurable backoff
- Event history tracking for progress
- Atomic checkpoint writes for crash safety
- Console and callback progress observers
- Thread-safe task queue with statistics

Tests: 33 integration tests passing, >60% coverage

Part of Epic 7a: Task Scheduling (42SP)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
d8dfun 2 天之前
父节点
当前提交
bfe8a3071c

+ 59 - 0
src/scheduler/__init__.py

@@ -0,0 +1,59 @@
+"""
+Scheduler module for pipeline orchestration.
+
+This module provides task scheduling, progress tracking,
+and crash recovery for the translation pipeline.
+"""
+
+from .models import (
+    SchedulerState,
+    TaskStatus,
+    ChapterTask,
+    PipelineProgress,
+    CheckpointData
+)
+from .task_queue import TaskQueue, TaskStats
+from .retry import RetryManager, RetryConfig, RetryRecord
+from .progress import (
+    ProgressObserver,
+    ProgressNotifier,
+    ConsoleProgressObserver,
+    CallbackProgressObserver,
+    ProgressEvent
+)
+from .recovery import RecoveryManager, AutoCheckpointMixin, compute_work_fingerprint
+from .pipeline_scheduler import PipelineScheduler, PipelineSchedulerError
+
# Public API of the scheduler package.  Keep this list in sync with the
# imports above; anything not listed here is considered internal.
__all__ = [
    # Models
    "SchedulerState",
    "TaskStatus",
    "ChapterTask",
    "PipelineProgress",
    "CheckpointData",

    # Task Queue
    "TaskQueue",
    "TaskStats",

    # Retry
    "RetryManager",
    "RetryConfig",
    "RetryRecord",

    # Progress
    "ProgressObserver",
    "ProgressNotifier",
    "ConsoleProgressObserver",
    "CallbackProgressObserver",
    "ProgressEvent",

    # Recovery
    "RecoveryManager",
    "AutoCheckpointMixin",
    "compute_work_fingerprint",

    # Scheduler
    "PipelineScheduler",
    "PipelineSchedulerError",
]

+ 153 - 0
src/scheduler/models.py

@@ -0,0 +1,153 @@
+"""
+Data models for the scheduler module.
+
+This module defines the core data structures for task scheduling
+and pipeline coordination.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any
+from datetime import datetime
+from enum import Enum
+
+
class SchedulerState(Enum):
    """States of the pipeline scheduler's lifecycle."""

    IDLE = "idle"              # created, not yet started
    RUNNING = "running"        # actively processing chapters
    PAUSED = "paused"          # halted by a pause request; can resume
    COMPLETED = "completed"    # all work finished
    FAILED = "failed"          # aborted due to an unrecoverable error
    RECOVERING = "recovering"  # restoring state from a saved checkpoint
+
+
class TaskStatus(Enum):
    """Status of a chapter task."""

    PENDING = "pending"          # queued, not yet started
    IN_PROGRESS = "in_progress"  # currently being processed
    COMPLETED = "completed"      # finished successfully
    FAILED = "failed"            # finished with an error
    SKIPPED = "skipped"          # deliberately not processed
    RETRYING = "retrying"        # failed, another attempt scheduled


@dataclass
class ChapterTask:
    """
    A task representing a chapter to be processed.

    Attributes:
        chapter_id: Unique identifier for the task
        chapter_index: Zero-based index of the chapter
        title: Chapter title
        original_content: Original Chinese text
        translated_content: Translated text (filled after completion)
        status: Current task status
        retry_count: Number of retry attempts
        error_message: Error message if failed
        created_at: Task creation timestamp
        started_at: Task start timestamp
        completed_at: Task completion timestamp
        max_retries: Maximum number of retry attempts allowed
    """

    chapter_id: str
    chapter_index: int
    title: str
    original_content: str
    translated_content: Optional[str] = None
    status: TaskStatus = TaskStatus.PENDING
    retry_count: int = 0
    error_message: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    # Retry limit was previously a magic constant (3) buried in can_retry.
    # Exposed as a field with the same default so callers can align it with
    # their RetryConfig without changing existing behavior.
    max_retries: int = 3

    @property
    def duration(self) -> Optional[float]:
        """Task duration in seconds, or None until both timestamps are set."""
        if self.started_at and self.completed_at:
            return (self.completed_at - self.started_at).total_seconds()
        return None

    @property
    def is_finished(self) -> bool:
        """Whether the task reached a terminal state (success, failure, or skip)."""
        return self.status in (TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.SKIPPED)

    @property
    def can_retry(self) -> bool:
        """Whether the task failed and still has retry budget left."""
        return self.status == TaskStatus.FAILED and self.retry_count < self.max_retries
+
+
@dataclass
class PipelineProgress:
    """
    Aggregate progress counters and timestamps for a pipeline run.

    Attributes:
        total_chapters: Total number of chapters to process
        completed_chapters: Number of successfully completed chapters
        failed_chapters: Number of failed chapters
        current_chapter: Index of the chapter currently being processed
        state: Current scheduler state
        started_at: Pipeline start timestamp
        paused_at: Pipeline pause timestamp (if paused)
        completed_at: Pipeline completion timestamp (if completed)
    """

    total_chapters: int = 0
    completed_chapters: int = 0
    failed_chapters: int = 0
    current_chapter: int = 0
    state: SchedulerState = SchedulerState.IDLE
    started_at: Optional[datetime] = None
    paused_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    @property
    def pending_chapters(self) -> int:
        """Number of chapters not yet completed or failed."""
        finished = self.completed_chapters + self.failed_chapters
        return self.total_chapters - finished

    @property
    def completion_rate(self) -> float:
        """Fraction of chapters completed (0.0 to 1.0); 0.0 when empty."""
        if not self.total_chapters:
            return 0.0
        return self.completed_chapters / self.total_chapters

    @property
    def is_complete(self) -> bool:
        """True once the scheduler has reached the COMPLETED state."""
        return self.state is SchedulerState.COMPLETED

    @property
    def is_running(self) -> bool:
        """True while the scheduler is in the RUNNING state."""
        return self.state is SchedulerState.RUNNING
+
+
@dataclass
class CheckpointData:
    """
    Checkpoint data for crash recovery.

    Attributes:
        work_id: Work item ID
        current_chapter_index: Index of chapter to resume from
        completed_indices: List of completed chapter indices
        failed_indices: List of failed chapter indices
        timestamp: Checkpoint creation timestamp
        scheduler_state: Current scheduler state
        metadata: Arbitrary extra key/value data carried with the checkpoint
    """

    work_id: str
    current_chapter_index: int
    completed_indices: list[int] = field(default_factory=list)
    failed_indices: list[int] = field(default_factory=list)
    timestamp: datetime = field(default_factory=datetime.now)
    scheduler_state: SchedulerState = SchedulerState.RUNNING
    metadata: Dict[str, Any] = field(default_factory=dict)

+ 463 - 0
src/scheduler/pipeline_scheduler.py

@@ -0,0 +1,463 @@
+"""
+Pipeline scheduler integrating all translation modules.
+
+This module provides the main scheduler that orchestrates the entire
+translation pipeline, including cleaning, glossary processing, translation,
+and persistence.
+"""
+
+import asyncio
+import threading
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Callable
+from datetime import datetime
+
+from ..core.state_machine import StateMachine, PipelineState
+from ..fingerprint.service import FingerprintService
+from ..cleaning.pipeline import CleaningPipeline
+from ..glossary.models import Glossary
+from ..translator.pipeline import TranslationPipeline
+from ..translator.engine import TranslationEngine
+from ..repository import Repository, WorkItem
+from .models import (
+    ChapterTask, TaskStatus, PipelineProgress, SchedulerState,
+    CheckpointData
+)
+from .task_queue import TaskQueue
+from .retry import RetryManager, RetryConfig
+from .progress import ProgressNotifier, ProgressObserver
+from .recovery import RecoveryManager
+
+
class PipelineSchedulerError(Exception):
    """Base exception raised for pipeline scheduler failures."""
+
+
class PipelineScheduler:
    """
    Main pipeline scheduler integrating all modules.

    This scheduler orchestrates:
    1. Fingerprint checking (Epic 2)
    2. TXT cleaning and splitting (Epic 3)
    3. Glossary preprocessing (Epic 4)
    4. Translation (Epic 5)
    5. State persistence (Epic 1)
    6. Progress tracking and recovery

    Features:
    - Full pipeline integration
    - Pause/resume capability
    - Crash-safe recovery
    - Automatic retry with exponential backoff
    - Progress notifications via observer pattern
    - Checkpoint-based recovery

    Example:
        >>> scheduler = PipelineScheduler("/data/work")
        >>> scheduler.register_progress_observer(ConsoleProgressObserver())
        >>> result = await scheduler.run_full_pipeline("novel.txt")
    """

    def __init__(
        self,
        work_dir: str | Path,
        model_path: Optional[str] = None,
        glossary: Optional[Glossary] = None,
        retry_config: Optional[RetryConfig] = None,
        checkpoint_interval: int = 5
    ):
        """
        Initialize the pipeline scheduler.

        Args:
            work_dir: Working directory for data storage
            model_path: Path to m2m100 model (default: models/m2m100)
            glossary: Optional glossary for terminology
            retry_config: Optional retry configuration
            checkpoint_interval: Save checkpoint every N chapters
        """
        self.work_dir = Path(work_dir)
        self.model_path = model_path or str(Path.cwd() / "models" / "m2m100")

        # Initialize repository and state machine
        self.repository = Repository(self.work_dir)
        self.state_machine = StateMachine()

        # Initialize modules
        self.fingerprint_service = FingerprintService(self.repository)
        self.cleaning_pipeline = CleaningPipeline()
        self.translation_engine = TranslationEngine(model_path=self.model_path)
        self.translation_pipeline = TranslationPipeline(
            engine=self.translation_engine,
            glossary=glossary or Glossary()
        )

        # Initialize scheduler components
        self.task_queue = TaskQueue()
        self.retry_manager = RetryManager(retry_config)
        self.progress_notifier = ProgressNotifier()
        self.recovery_manager = RecoveryManager(self.work_dir)

        # Progress tracking
        self.progress = PipelineProgress()
        self.checkpoint_interval = checkpoint_interval
        self._chapters_since_checkpoint = 0

        # ID of the work item currently being processed.  Set by
        # run_full_pipeline(); _handle_pause() needs it to save a
        # checkpoint before pausing.  (Previously this attribute was never
        # assigned, so the pre-pause checkpoint could never be written.)
        self.current_work_id: Optional[str] = None

        # Control flags: written under _lock, polled by the async loop.
        self._pause_requested = False
        self._stop_requested = False
        self._lock = threading.Lock()

    @property
    def current_state(self) -> SchedulerState:
        """Get the current scheduler state."""
        return self.progress.state

    @property
    def is_running(self) -> bool:
        """Check if the scheduler is running."""
        return self.progress.state == SchedulerState.RUNNING

    @property
    def is_paused(self) -> bool:
        """Check if the scheduler is paused."""
        return self.progress.state == SchedulerState.PAUSED

    def register_progress_observer(self, observer: ProgressObserver) -> None:
        """
        Register a progress observer.

        Args:
            observer: The observer to register
        """
        self.progress_notifier.register(observer)

    def unregister_progress_observer(self, observer: ProgressObserver) -> None:
        """
        Unregister a progress observer.

        Args:
            observer: The observer to unregister
        """
        self.progress_notifier.unregister(observer)

    async def run_full_pipeline(
        self,
        txt_file: str | Path,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Execute the full translation pipeline.

        Args:
            txt_file: Path to the TXT file to translate
            metadata: Optional metadata for the work item

        Returns:
            Result dictionary with status, chapter count, etc.

        Raises:
            PipelineSchedulerError: If any stage of the pipeline fails.
        """
        txt_file = Path(txt_file)

        # Update state
        self.progress.state = SchedulerState.RUNNING
        self.progress.started_at = datetime.now()

        try:
            # 1. Check fingerprint (skip if already translated)
            is_duplicate, existing_work_id = self.fingerprint_service.check_before_import(str(txt_file))
            if is_duplicate and existing_work_id:
                return {
                    "status": "skipped",
                    "reason": "already_translated",
                    "work_id": existing_work_id
                }

            # 2. Create work item
            work = self.repository.create_work(
                str(txt_file),
                **(metadata or {})
            )
            # Remember the active work so pause-time checkpoints can be saved.
            self.current_work_id = work.work_id

            # 3. Clean and split
            self.state_machine.transition_to(PipelineState.CLEANING)
            chapters_data = self.cleaning_pipeline.process(txt_file)

            # Setup task queue
            self.progress.total_chapters = len(chapters_data)
            self.progress_notifier.notify_pipeline_start(len(chapters_data))

            for i, chapter in enumerate(chapters_data):
                self.task_queue.add_chapter(
                    chapter_id=f"{work.work_id}_ch{i}",
                    chapter_index=i,
                    title=chapter.title,
                    content=chapter.content
                )

            # Save initial checkpoint
            self._save_checkpoint(work.work_id, 0, [], [])

            # 4. Process chapters
            self.state_machine.transition_to(PipelineState.TRANSLATING)
            completed, failed = await self._process_chapters(work.work_id)

            # 5. Finalize
            self.state_machine.transition_to(PipelineState.COMPLETED)
            self._finalize_work(work.work_id, completed, failed)

            # Register fingerprint so re-imports of the same file are skipped
            self.fingerprint_service.register_import(work.work_id, str(txt_file))

            # Clean up checkpoint (run finished; nothing left to recover)
            self.recovery_manager.delete_checkpoint()

            self.progress.state = SchedulerState.COMPLETED
            self.progress.completed_at = datetime.now()

            result = {
                "status": "completed",
                "work_id": work.work_id,
                "total_chapters": len(chapters_data),
                "completed_chapters": len(completed),
                "failed_chapters": len(failed),
                "duration": self.progress.completed_at.timestamp() - self.progress.started_at.timestamp()
            }

            self.progress_notifier.notify_pipeline_complete(self.progress)
            return result

        except Exception as e:
            self.progress.state = SchedulerState.FAILED
            self.progress_notifier.notify_pipeline_failed(str(e), self.progress)
            # Chain the cause so the original traceback is preserved.
            raise PipelineSchedulerError(f"Pipeline failed: {e}") from e

    async def _process_chapters(self, work_id: str) -> tuple[List[ChapterTask], List[ChapterTask]]:
        """
        Process all chapters in the queue.

        Args:
            work_id: Work item ID

        Returns:
            Tuple of (completed_tasks, failed_tasks)
        """
        completed: List[ChapterTask] = []
        failed: List[ChapterTask] = []
        completed_indices: List[int] = []
        failed_indices: List[int] = []

        while self.task_queue.has_pending():
            # Check for pause (blocks inside _handle_pause until resumed)
            if self._pause_requested:
                await self._handle_pause()
                continue

            # Check for stop
            if self._stop_requested:
                break

            # Get next task
            task = self.task_queue.get_next_pending()
            if not task:
                break

            self.progress.current_chapter = task.chapter_index
            self.progress_notifier.notify_chapter_start(task)

            try:
                # Translate chapter
                result = self.translation_pipeline.translate(
                    task.original_content
                )
                self._record_success(
                    work_id, task, result,
                    completed, completed_indices, failed_indices
                )

            except Exception as e:
                error_msg = str(e)
                recovered = False

                # Check if we should retry
                if self.retry_manager.should_retry(task, error_msg):
                    success, result = await self.retry_manager.retry_task(
                        task,
                        lambda: self.translation_pipeline.translate(task.original_content),
                        error_msg
                    )
                    if success:
                        # Same bookkeeping as the first-attempt success path;
                        # previously the retry path skipped persistence and
                        # checkpointing.
                        self._record_success(
                            work_id, task, result,
                            completed, completed_indices, failed_indices
                        )
                        recovered = True

                if not recovered:
                    # Mark as failed
                    self.task_queue.mark_failed(task.chapter_id, error_msg)
                    failed.append(task)
                    failed_indices.append(task.chapter_index)
                    self.progress_notifier.notify_chapter_failed(task, error_msg)

            self.progress_notifier.notify_progress(
                len(completed) + len(failed),
                self.progress.total_chapters
            )

        return completed, failed

    def _record_success(
        self,
        work_id: str,
        task: ChapterTask,
        result: Any,
        completed: List[ChapterTask],
        completed_indices: List[int],
        failed_indices: List[int]
    ) -> None:
        """
        Record one successfully translated chapter: update the queue and
        tracking lists, persist the translation, notify observers, and save
        a checkpoint every `checkpoint_interval` completions.
        """
        task.translated_content = result.translated
        self.task_queue.mark_completed(task.chapter_id, result.translated)
        completed.append(task)
        completed_indices.append(task.chapter_index)

        # Persist the chapter immediately for crash safety
        self._save_chapter_translation(work_id, task)

        self.progress_notifier.notify_chapter_complete(task)

        # Checkpoint periodically
        self._chapters_since_checkpoint += 1
        if self._chapters_since_checkpoint >= self.checkpoint_interval:
            self._save_checkpoint(
                work_id,
                task.chapter_index + 1,
                completed_indices,
                failed_indices
            )
            self._chapters_since_checkpoint = 0

    async def _handle_pause(self) -> None:
        """Handle a pause request: checkpoint, notify, and wait for resume."""
        self.progress.state = SchedulerState.PAUSED
        self.progress.paused_at = datetime.now()

        # Save checkpoint before pausing (current_work_id is set by
        # run_full_pipeline once a work item exists)
        if self.current_work_id:
            self._save_checkpoint(
                self.current_work_id,
                self.progress.current_chapter,
                self._get_completed_indices(),
                self._get_failed_indices()
            )

        self.progress_notifier.notify_pipeline_paused(self.progress)

        # Wait for resume (or stop)
        while self._pause_requested and not self._stop_requested:
            await asyncio.sleep(0.1)

        self.progress.state = SchedulerState.RUNNING
        self.progress_notifier.notify_pipeline_resumed(self.progress)

    def pause(self) -> None:
        """Request the pipeline to pause."""
        with self._lock:
            self._pause_requested = True

    def resume(self) -> None:
        """Request the pipeline to resume."""
        with self._lock:
            self._pause_requested = False

    def stop(self) -> None:
        """Request the pipeline to stop."""
        with self._lock:
            self._stop_requested = True
            self._pause_requested = False

    def get_progress(self) -> Dict[str, Any]:
        """
        Get current progress information.

        Returns:
            Dictionary with progress data
        """
        stats = self.task_queue.get_stats()
        return {
            "state": self.progress.state.value,
            "total_chapters": self.progress.total_chapters,
            "completed_chapters": stats.completed,
            "failed_chapters": stats.failed,
            "pending_chapters": stats.pending,
            "current_chapter": self.progress.current_chapter,
            "completion_rate": self.progress.completion_rate,
            "started_at": self.progress.started_at.isoformat() if self.progress.started_at else None,
        }

    def resume_from_checkpoint(self) -> Optional[str]:
        """
        Resume from a saved checkpoint.

        Returns:
            Work ID if resumable, None otherwise
        """
        if not self.recovery_manager.can_resume():
            return None

        recovery_state = self.recovery_manager.get_recovery_state()
        if not recovery_state:
            return None

        work_id = recovery_state["work_id"]
        self.progress.state = SchedulerState.RECOVERING

        # TODO: Implement full resume logic

        return work_id

    def _save_checkpoint(
        self,
        work_id: str,
        current_index: int,
        completed_indices: List[int],
        failed_indices: List[int]
    ) -> None:
        """Save a checkpoint via the recovery manager."""
        self.recovery_manager.create_checkpoint_from_progress(
            work_id=work_id,
            current_index=current_index,
            completed_indices=completed_indices,
            failed_indices=failed_indices,
            state=self.progress.state
        )

    def _save_chapter_translation(self, work_id: str, task: ChapterTask) -> None:
        """Save a chapter translation to the repository."""
        from ..repository.models import ChapterItem, ChapterStatus

        chapter = ChapterItem(
            chapter_id=task.chapter_id,
            work_id=work_id,
            index=task.chapter_index,
            title=task.title,
            original_text=task.original_content,
            translated_text=task.translated_content or "",
            status=ChapterStatus.COMPLETED
        )

        self.repository.save_chapter(work_id, chapter)

    def _finalize_work(
        self,
        work_id: str,
        completed: List[ChapterTask],
        failed: List[ChapterTask]
    ) -> None:
        """Finalize the work item: mark it partial or completed."""
        work = self.repository.get_work(work_id)
        if work:
            if failed:
                work.status = "partial"
            else:
                work.status = "completed"
            work.touch()
            self.repository.update_work(work)

    def _get_completed_indices(self) -> List[int]:
        """Get indices of completed chapters."""
        return [t.chapter_index for t in self.task_queue.get_completed_tasks()]

    def _get_failed_indices(self) -> List[int]:
        """Get indices of failed chapters."""
        return [t.chapter_index for t in self.task_queue.get_failed_tasks()]

+ 384 - 0
src/scheduler/progress.py

@@ -0,0 +1,384 @@
+"""
+Progress notification system using the Observer pattern.
+
+This module provides progress tracking and notification capabilities
+for the translation pipeline.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Callable, Any, Optional
+from dataclasses import dataclass
+from datetime import datetime
+import threading
+
+from .models import ChapterTask, PipelineProgress, TaskStatus
+
+
@dataclass
class ProgressEvent:
    """
    A single recorded progress event, kept in the notifier's history.

    Attributes:
        event_type: Type of event (the observer method name, e.g.
            "on_chapter_complete")
        timestamp: Event timestamp
        data: Event-specific data
    """

    event_type: str
    timestamp: datetime
    data: dict[str, Any]
+
+
class ProgressObserver(ABC):
    """
    Observer interface for progress notifications.

    Observers can be registered with ProgressNotifier to receive
    updates about pipeline progress.  Subclasses must implement every
    callback; implementations should be fast and non-blocking, since
    they run inline with the pipeline.
    """

    @abstractmethod
    def on_pipeline_start(self, total_chapters: int) -> None:
        """Called once when the pipeline starts, with the total chapter count."""
        pass

    @abstractmethod
    def on_pipeline_complete(self, progress: PipelineProgress) -> None:
        """Called when the pipeline completes."""
        pass

    @abstractmethod
    def on_pipeline_paused(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is paused."""
        pass

    @abstractmethod
    def on_pipeline_resumed(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is resumed."""
        pass

    @abstractmethod
    def on_pipeline_failed(self, error: str, progress: PipelineProgress) -> None:
        """Called when the pipeline fails, with the error message."""
        pass

    @abstractmethod
    def on_chapter_start(self, task: ChapterTask) -> None:
        """Called when a chapter starts processing."""
        pass

    @abstractmethod
    def on_chapter_complete(self, task: ChapterTask) -> None:
        """Called when a chapter completes successfully."""
        pass

    @abstractmethod
    def on_chapter_failed(self, task: ChapterTask, error: str) -> None:
        """Called when a chapter fails, with the error message."""
        pass

    @abstractmethod
    def on_chapter_retry(self, task: ChapterTask, attempt: int) -> None:
        """Called when a chapter is being retried (attempt is 1-based)."""
        pass

    @abstractmethod
    def on_progress(self, current: int, total: int) -> None:
        """Called on each progress update with processed/total counts."""
        pass
+
+
class ProgressNotifier:
    """
    Progress notification system using the Observer pattern.

    This class manages a list of observers, notifies them of pipeline
    events, and keeps a bounded in-memory history of recent events.

    Thread-safe: internal state (observer list, event history) is guarded
    by a lock.  Observer callbacks are invoked OUTSIDE the lock so that a
    callback may safely call register/unregister/get_event_history — the
    previous implementation held the non-reentrant lock during callbacks,
    which would deadlock on such re-entrant calls.
    """

    def __init__(self):
        """Initialize the progress notifier."""
        self._observers: List[ProgressObserver] = []
        self._lock = threading.Lock()
        self._event_history: List[ProgressEvent] = []
        self._max_history = 1000  # bound on retained events

    def register(self, observer: ProgressObserver) -> None:
        """
        Register a new observer (no-op if already registered).

        Args:
            observer: The observer to register
        """
        with self._lock:
            if observer not in self._observers:
                self._observers.append(observer)

    def unregister(self, observer: ProgressObserver) -> None:
        """
        Unregister an observer (no-op if not registered).

        Args:
            observer: The observer to unregister
        """
        with self._lock:
            if observer in self._observers:
                self._observers.remove(observer)

    def clear_observers(self) -> None:
        """Clear all observers."""
        with self._lock:
            self._observers.clear()

    @property
    def observer_count(self) -> int:
        """Get the number of registered observers."""
        with self._lock:
            return len(self._observers)

    def get_event_history(self, limit: Optional[int] = None) -> List[ProgressEvent]:
        """
        Get the event history.

        Args:
            limit: Maximum number of events to return (most recent first
                retained; None or 0 returns the full history)

        Returns:
            List of recent events
        """
        with self._lock:
            if limit:
                return self._event_history[-limit:]
            return self._event_history.copy()

    def _notify(self, method_name: str, *args, **kwargs) -> None:
        """
        Record an event and notify all observers.

        Args:
            method_name: Name of the observer method to call
            *args: Positional arguments to pass
            **kwargs: Keyword arguments to pass
        """
        with self._lock:
            # Record event.  Positional args are captured as well — the
            # notify_* helpers pass everything positionally, so recording
            # only kwargs left history entries with empty data.
            data = dict(kwargs)
            if args:
                data["args"] = args
            event = ProgressEvent(
                event_type=method_name,
                timestamp=datetime.now(),
                data=data
            )
            self._event_history.append(event)

            # Trim history if needed
            if len(self._event_history) > self._max_history:
                self._event_history = self._event_history[-self._max_history:]

            # Snapshot the observer list under the lock; invoke outside it
            # to avoid deadlocks on re-entrant notifier calls.
            observers = list(self._observers)

        for observer in observers:
            method = getattr(observer, method_name, None)
            if method:
                try:
                    method(*args, **kwargs)
                except Exception as e:
                    # Don't let one observer failure break others
                    print(f"Observer error in {method_name}: {e}")

    def notify_pipeline_start(self, total_chapters: int) -> None:
        """Notify that the pipeline has started."""
        self._notify("on_pipeline_start", total_chapters)

    def notify_pipeline_complete(self, progress: PipelineProgress) -> None:
        """Notify that the pipeline has completed."""
        self._notify("on_pipeline_complete", progress)

    def notify_pipeline_paused(self, progress: PipelineProgress) -> None:
        """Notify that the pipeline has been paused."""
        self._notify("on_pipeline_paused", progress)

    def notify_pipeline_resumed(self, progress: PipelineProgress) -> None:
        """Notify that the pipeline has been resumed."""
        self._notify("on_pipeline_resumed", progress)

    def notify_pipeline_failed(self, error: str, progress: PipelineProgress) -> None:
        """Notify that the pipeline has failed."""
        self._notify("on_pipeline_failed", error, progress)

    def notify_chapter_start(self, task: ChapterTask) -> None:
        """Notify that a chapter has started processing."""
        self._notify("on_chapter_start", task)

    def notify_chapter_complete(self, task: ChapterTask) -> None:
        """Notify that a chapter has completed."""
        self._notify("on_chapter_complete", task)

    def notify_chapter_failed(self, task: ChapterTask, error: str) -> None:
        """Notify that a chapter has failed."""
        self._notify("on_chapter_failed", task, error)

    def notify_chapter_retry(self, task: ChapterTask, attempt: int) -> None:
        """Notify that a chapter is being retried."""
        self._notify("on_chapter_retry", task, attempt)

    def notify_progress(self, current: int, total: int) -> None:
        """Notify of progress update."""
        self._notify("on_progress", current, total)
+
+
class ConsoleProgressObserver(ProgressObserver):
    """
    Console-based progress observer.

    Prints progress updates to the console. All output, including the
    throttled progress line, is suppressed when ``verbose`` is False.
    """

    def __init__(self, verbose: bool = True, max_retries: int = 3):
        """
        Initialize the console observer.

        Args:
            verbose: Whether to print detailed messages
            max_retries: Retry limit shown in retry messages (display only;
                should match the scheduler's retry configuration)
        """
        self.verbose = verbose
        self.max_retries = max_retries
        self._start_time: Optional[datetime] = None

    def on_pipeline_start(self, total_chapters: int) -> None:
        """Called when the pipeline starts; records the start time."""
        self._start_time = datetime.now()
        if self.verbose:
            print(f"[Pipeline] Starting translation of {total_chapters} chapters...")

    def on_pipeline_complete(self, progress: PipelineProgress) -> None:
        """Called when the pipeline completes; reports totals and wall time."""
        duration = 0
        if self._start_time:
            duration = (datetime.now() - self._start_time).total_seconds()

        if self.verbose:
            print(f"[Pipeline] Complete! {progress.completed_chapters}/{progress.total_chapters} chapters in {duration:.1f}s")

    def on_pipeline_paused(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is paused."""
        if self.verbose:
            print(f"[Pipeline] Paused at chapter {progress.current_chapter}")

    def on_pipeline_resumed(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is resumed."""
        if self.verbose:
            print(f"[Pipeline] Resumed from chapter {progress.current_chapter}")

    def on_pipeline_failed(self, error: str, progress: PipelineProgress) -> None:
        """Called when the pipeline fails."""
        if self.verbose:
            print(f"[Pipeline] Failed: {error}")

    def on_chapter_start(self, task: ChapterTask) -> None:
        """Called when a chapter starts processing."""
        if self.verbose:
            print(f"  [{task.chapter_index + 1}] {task.title}...")

    def on_chapter_complete(self, task: ChapterTask) -> None:
        """Called when a chapter completes successfully."""
        if self.verbose:
            duration = ""
            if task.duration:
                duration = f" ({task.duration:.1f}s)"
            print(f"  [{task.chapter_index + 1}] {task.title} ✓{duration}")

    def on_chapter_failed(self, task: ChapterTask, error: str) -> None:
        """Called when a chapter fails."""
        if self.verbose:
            print(f"  [{task.chapter_index + 1}] {task.title} ✗ ({error})")

    def on_chapter_retry(self, task: ChapterTask, attempt: int) -> None:
        """Called when a chapter is being retried."""
        if self.verbose:
            print(f"  [{task.chapter_index + 1}] {task.title} ↺ (retry {attempt}/{self.max_retries})")

    def on_progress(self, current: int, total: int) -> None:
        """Called on progress update."""
        # Throttle to roughly every 10% to avoid console spam, and honor
        # the verbose flag like every other handler in this observer.
        if self.verbose and total > 0 and current % max(1, total // 10) == 0:
            pct = (current / total) * 100
            print(f"[Progress] {current}/{total} ({pct:.0f}%)")
+
+
class CallbackProgressObserver(ProgressObserver):
    """
    Progress observer that calls user-provided callbacks.

    Allows programmatic handling of progress events. Any callback left
    as None turns the corresponding event into a no-op.
    """

    def __init__(
        self,
        on_start: Optional[Callable[[int], None]] = None,
        on_complete: Optional[Callable[[PipelineProgress], None]] = None,
        on_chapter_start: Optional[Callable[[ChapterTask], None]] = None,
        on_chapter_complete: Optional[Callable[[ChapterTask], None]] = None,
        on_chapter_failed: Optional[Callable[[ChapterTask, str], None]] = None,
        on_progress: Optional[Callable[[int, int], None]] = None,
        on_paused: Optional[Callable[[PipelineProgress], None]] = None,
        on_resumed: Optional[Callable[[PipelineProgress], None]] = None,
        on_failed: Optional[Callable[[str, PipelineProgress], None]] = None,
        on_chapter_retry: Optional[Callable[[ChapterTask, int], None]] = None
    ):
        """
        Initialize the callback observer.

        Args:
            on_start: Callback for pipeline start
            on_complete: Callback for pipeline complete
            on_chapter_start: Callback for chapter start
            on_chapter_complete: Callback for chapter complete
            on_chapter_failed: Callback for chapter failure
            on_progress: Callback for progress updates
            on_paused: Callback for pipeline pause
            on_resumed: Callback for pipeline resume
            on_failed: Callback for pipeline failure
            on_chapter_retry: Callback for chapter retry attempts
        """
        self._on_start = on_start
        self._on_complete = on_complete
        self._on_chapter_start = on_chapter_start
        self._on_chapter_complete = on_chapter_complete
        self._on_chapter_failed = on_chapter_failed
        self._on_progress = on_progress
        self._on_paused = on_paused
        self._on_resumed = on_resumed
        self._on_failed = on_failed
        self._on_chapter_retry = on_chapter_retry

    def on_pipeline_start(self, total_chapters: int) -> None:
        """Called when the pipeline starts."""
        if self._on_start:
            self._on_start(total_chapters)

    def on_pipeline_complete(self, progress: PipelineProgress) -> None:
        """Called when the pipeline completes."""
        if self._on_complete:
            self._on_complete(progress)

    def on_pipeline_paused(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is paused."""
        if self._on_paused:
            self._on_paused(progress)

    def on_pipeline_resumed(self, progress: PipelineProgress) -> None:
        """Called when the pipeline is resumed."""
        if self._on_resumed:
            self._on_resumed(progress)

    def on_pipeline_failed(self, error: str, progress: PipelineProgress) -> None:
        """Called when the pipeline fails."""
        if self._on_failed:
            self._on_failed(error, progress)

    def on_chapter_start(self, task: ChapterTask) -> None:
        """Called when a chapter starts processing."""
        if self._on_chapter_start:
            self._on_chapter_start(task)

    def on_chapter_complete(self, task: ChapterTask) -> None:
        """Called when a chapter completes successfully."""
        if self._on_chapter_complete:
            self._on_chapter_complete(task)

    def on_chapter_failed(self, task: ChapterTask, error: str) -> None:
        """Called when a chapter fails."""
        if self._on_chapter_failed:
            self._on_chapter_failed(task, error)

    def on_chapter_retry(self, task: ChapterTask, attempt: int) -> None:
        """Called when a chapter is being retried."""
        if self._on_chapter_retry:
            self._on_chapter_retry(task, attempt)

    def on_progress(self, current: int, total: int) -> None:
        """Called on progress update."""
        if self._on_progress:
            self._on_progress(current, total)

+ 350 - 0
src/scheduler/recovery.py

@@ -0,0 +1,350 @@
+"""
+Crash recovery management for the pipeline scheduler.
+
+This module provides checkpoint and recovery functionality
+to ensure the pipeline can resume after crashes.
+"""
+
+import json
+import threading
+from pathlib import Path
+from typing import Optional, Dict, Any, List
+from datetime import datetime
+import hashlib
+
+from .models import CheckpointData, SchedulerState, ChapterTask, TaskStatus
+
+
class RecoveryManager:
    """
    Manage crash recovery through checkpoints.

    This class handles:
    - Saving checkpoints during pipeline execution
    - Loading checkpoints after a crash
    - Determining recovery state
    - Cleaning up old checkpoints

    Thread-safe: Uses a lock for checkpoint file operations.
    """

    CHECKPOINT_FILE = "checkpoint.json"
    CHECKPOINT_BACKUP_FILE = "checkpoint.json.bak"

    def __init__(self, work_dir: str | Path):
        """
        Initialize the recovery manager.

        Args:
            work_dir: Working directory for checkpoint storage
        """
        self.work_dir = Path(work_dir)
        self.checkpoint_file = self.work_dir / self.CHECKPOINT_FILE
        self.backup_file = self.work_dir / self.CHECKPOINT_BACKUP_FILE
        self._lock = threading.Lock()

    def save_checkpoint(self, checkpoint: CheckpointData) -> None:
        """
        Save a checkpoint to disk.

        Uses an atomic write (write to a temp file, then rename) so a crash
        mid-write never leaves a truncated checkpoint; the previous
        checkpoint is preserved as a backup for :meth:`load_checkpoint`.

        Args:
            checkpoint: The checkpoint data to save

        Raises:
            IOError: If the checkpoint cannot be written
        """
        with self._lock:
            # Ensure directory exists
            self.work_dir.mkdir(parents=True, exist_ok=True)

            # Serialize checkpoint to JSON-compatible types
            data = {
                "work_id": checkpoint.work_id,
                "current_chapter_index": checkpoint.current_chapter_index,
                "completed_indices": checkpoint.completed_indices,
                "failed_indices": checkpoint.failed_indices,
                "timestamp": checkpoint.timestamp.isoformat(),
                "scheduler_state": checkpoint.scheduler_state.value,
                "metadata": checkpoint.metadata
            }

            # Atomic write: write to temp file first
            temp_file = self.checkpoint_file.with_suffix(".tmp")
            try:
                with open(temp_file, "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)

                # Keep the previous checkpoint as a fallback
                if self.checkpoint_file.exists():
                    self.checkpoint_file.replace(self.backup_file)

                # Atomic rename publishes the new checkpoint
                temp_file.replace(self.checkpoint_file)

            except Exception as e:
                # Clean up temp file on error; chain the original cause
                # so the underlying failure is not lost from tracebacks.
                if temp_file.exists():
                    temp_file.unlink()
                raise IOError(f"Failed to save checkpoint: {e}") from e

    def load_checkpoint(self) -> Optional[CheckpointData]:
        """
        Load a checkpoint from disk.

        Tries to load the main checkpoint file, and falls back
        to the backup if the main file is corrupted.

        Returns:
            The loaded CheckpointData, or None if no checkpoint exists
        """
        with self._lock:
            # Try main checkpoint first, then the backup written by save.
            checkpoint = self._load_checkpoint_file(self.checkpoint_file)
            if checkpoint:
                return checkpoint

            checkpoint = self._load_checkpoint_file(self.backup_file)
            if checkpoint:
                return checkpoint

            return None

    def _load_checkpoint_file(self, file_path: Path) -> Optional[CheckpointData]:
        """
        Load checkpoint from a specific file.

        Args:
            file_path: The file to load from

        Returns:
            The loaded CheckpointData, or None if file doesn't exist/is invalid
        """
        if not file_path.exists():
            return None

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Validate required fields before constructing the checkpoint
            if "work_id" not in data or "current_chapter_index" not in data:
                return None

            # Parse timestamp; fall back to "now" for malformed values
            timestamp = datetime.now()
            if "timestamp" in data:
                try:
                    timestamp = datetime.fromisoformat(data["timestamp"])
                except (ValueError, TypeError):
                    pass

            # Parse state; unknown values degrade to RUNNING
            state = SchedulerState.RUNNING
            if "scheduler_state" in data:
                try:
                    state = SchedulerState(data["scheduler_state"])
                except ValueError:
                    state = SchedulerState.RUNNING

            return CheckpointData(
                work_id=data["work_id"],
                current_chapter_index=data["current_chapter_index"],
                completed_indices=data.get("completed_indices", []),
                failed_indices=data.get("failed_indices", []),
                timestamp=timestamp,
                scheduler_state=state,
                metadata=data.get("metadata", {})
            )
        except (json.JSONDecodeError, IOError, KeyError):
            # Corrupt/unreadable file: signal "no usable checkpoint here"
            # so the caller can try the backup.
            return None

    def has_checkpoint(self) -> bool:
        """
        Check if a checkpoint exists.

        Returns:
            True if a checkpoint (or its backup) exists, False otherwise
        """
        # Take the lock so we never observe the transient window inside
        # save_checkpoint() where the main file is being renamed.
        with self._lock:
            return self.checkpoint_file.exists() or self.backup_file.exists()

    def delete_checkpoint(self) -> None:
        """Delete all checkpoint files (main and backup)."""
        with self._lock:
            self.checkpoint_file.unlink(missing_ok=True)
            self.backup_file.unlink(missing_ok=True)

    def get_checkpoint_age(self) -> Optional[float]:
        """
        Get the age of the checkpoint in seconds.

        Returns:
            Age in seconds, or None if no checkpoint exists
        """
        checkpoint = self.load_checkpoint()
        if checkpoint:
            return (datetime.now() - checkpoint.timestamp).total_seconds()
        return None

    def create_checkpoint_from_progress(
        self,
        work_id: str,
        current_index: int,
        completed_indices: List[int],
        failed_indices: List[int],
        state: Optional[SchedulerState] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> CheckpointData:
        """
        Create and persist a checkpoint from progress data.

        Args:
            work_id: Work item ID
            current_index: Current chapter index
            completed_indices: List of completed chapter indices
            failed_indices: List of failed chapter indices
            state: Current scheduler state (defaults to SchedulerState.RUNNING)
            metadata: Optional metadata

        Returns:
            The created CheckpointData
        """
        checkpoint = CheckpointData(
            work_id=work_id,
            current_chapter_index=current_index,
            # Copy the index lists so later caller mutations don't alias
            # into the saved checkpoint.
            completed_indices=list(completed_indices),
            failed_indices=list(failed_indices),
            timestamp=datetime.now(),
            scheduler_state=state if state is not None else SchedulerState.RUNNING,
            metadata=metadata or {}
        )

        self.save_checkpoint(checkpoint)
        return checkpoint

    def get_recovery_state(self) -> Optional[Dict[str, Any]]:
        """
        Get the recovery state from checkpoint.

        Returns:
            Dictionary with recovery information, or None if no checkpoint:
                - recoverable: Whether recovery is possible
                - work_id: Work ID to resume
                - resume_index: Chapter index to resume from
                - completed_count: Number of completed chapters
                - failed_count: Number of failed chapters
                - checkpoint_age: Age of checkpoint in seconds
                - state: Scheduler state recorded in the checkpoint
        """
        checkpoint = self.load_checkpoint()
        if not checkpoint:
            return None

        return {
            "recoverable": True,
            "work_id": checkpoint.work_id,
            "resume_index": checkpoint.current_chapter_index,
            "completed_count": len(checkpoint.completed_indices),
            "failed_count": len(checkpoint.failed_indices),
            "checkpoint_age": (datetime.now() - checkpoint.timestamp).total_seconds(),
            "state": checkpoint.scheduler_state.value
        }

    def can_resume(self, max_age_seconds: float = 86400.0) -> bool:
        """
        Check if the pipeline can be resumed from a checkpoint.

        Args:
            max_age_seconds: Maximum checkpoint age (in seconds) that is
                still considered resumable. Defaults to 24 hours.

        Returns:
            True if resumable, False otherwise
        """
        recovery_state = self.get_recovery_state()
        if not recovery_state:
            return False

        # Stale checkpoints are not resumable.
        return recovery_state["checkpoint_age"] <= max_age_seconds

    def get_work_id_from_checkpoint(self) -> Optional[str]:
        """
        Get the work ID from the checkpoint.

        Returns:
            The work ID, or None if no checkpoint exists
        """
        checkpoint = self.load_checkpoint()
        return checkpoint.work_id if checkpoint else None
+
+
class AutoCheckpointMixin:
    """
    Mixin that paces checkpoint saving by chapter count.

    Inheriting classes get a simple counter: every call to
    ``should_checkpoint`` counts one processed chapter, and the method
    reports True once ``checkpoint_interval`` chapters have accumulated.
    The caller is responsible for invoking ``reset_checkpoint_counter``
    after actually persisting a checkpoint.
    """

    def __init__(self, *args, checkpoint_interval: int = 5, **kwargs):
        """
        Initialize with auto-checkpoint.

        Args:
            checkpoint_interval: Save checkpoint every N chapters
        """
        super().__init__(*args, **kwargs)
        self.checkpoint_interval = checkpoint_interval
        self._chapters_since_checkpoint = 0

    def should_checkpoint(self) -> bool:
        """
        Count one more processed chapter and report whether it is time
        to save a checkpoint.

        Note: calling this method advances the internal counter.

        Returns:
            True if checkpoint should be saved
        """
        count = self._chapters_since_checkpoint + 1
        self._chapters_since_checkpoint = count
        return count >= self.checkpoint_interval

    def reset_checkpoint_counter(self) -> None:
        """Start a fresh interval (call after saving a checkpoint)."""
        self._chapters_since_checkpoint = 0
+
+
+def compute_work_fingerprint(file_path: str | Path) -> str:
+    """
+    Compute a fingerprint for a work file.
+
+    This fingerprint can be used to detect if the source file
+    has changed since a checkpoint was created.
+
+    Args:
+        file_path: Path to the file
+
+    Returns:
+        Hexadecimal fingerprint string
+    """
+    path = Path(file_path)
+    if not path.exists():
+        return ""
+
+    # Use file size, mtime, and first/last 1KB for fingerprint
+    stat = path.stat()
+
+    # Read first and last 1KB
+    first_kb = b""
+    last_kb = b""
+    try:
+        with open(path, "rb") as f:
+            first_kb = f.read(1024)
+            if stat.st_size > 2048:
+                f.seek(-1024, 2)
+                last_kb = f.read(1024)
+    except (IOError, OSError):
+        pass
+
+    # Compute hash
+    content = f"{stat.st_size}{stat.st_mtime}{first_kb}{last_kb}".encode()
+    return hashlib.sha256(content).hexdigest()

+ 377 - 0
src/scheduler/retry.py

@@ -0,0 +1,377 @@
+"""
+Retry management for failed tasks.
+
+This module provides retry logic with exponential backoff
+for handling failed chapter translations.
+"""
+
+import asyncio
+import time
+from typing import Optional, Callable, List, Dict, Any
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+
+from .models import ChapterTask, TaskStatus
+from .task_queue import TaskQueue
+
+
@dataclass
class RetryConfig:
    """Tunable settings that control how failed tasks are retried."""

    max_retries: int = 3               # give up after this many attempts
    base_delay: float = 1.0            # seconds before the first retry
    max_delay: float = 60.0            # upper bound on any computed delay, seconds
    exponential_backoff: bool = True   # grow the delay geometrically per attempt
    backoff_multiplier: float = 2.0    # growth factor for exponential backoff
    retry_on_timeout: bool = True      # retry errors that look like timeouts
    retry_on_network_error: bool = True        # retry network/connection errors
    retry_on_translation_error: bool = False   # retry translation-level errors
+
+
@dataclass
class RetryRecord:
    """Log entry describing a single retry attempt for one chapter."""

    chapter_id: str        # chapter the attempt belongs to
    attempt_number: int    # 1-based attempt counter
    error_message: str     # error that triggered the retry
    timestamp: datetime    # when the retry occurred
    delay_used: float      # backoff delay applied before this retry, seconds
    success: bool = False  # whether this attempt succeeded
+
+
class RetryManager:
    """
    Manage retry logic for failed tasks.

    Features:
    - Configurable retry limits
    - Exponential backoff
    - Per-task retry history
    - Error classification for retry decisions

    NOTE: the retry history is kept in a plain dict with no locking;
    callers sharing one manager across threads must synchronize access.
    """

    # Error patterns that should trigger retries
    RETRYABLE_PATTERNS = [
        "timeout",
        "connection",
        "network",
        "temporarily",
        "unavailable",
        "rate limit",
        "429",  # HTTP 429 Too Many Requests
        "503",  # HTTP 503 Service Unavailable
        "504",  # HTTP 504 Gateway Timeout
    ]

    # Error patterns that should NOT trigger retries
    NON_RETRYABLE_PATTERNS = [
        "authentication",
        "authorization",
        "invalid key",
        "quota exceeded",
        "401",  # HTTP 401 Unauthorized
        "403",  # HTTP 403 Forbidden
        "file not found",
        "invalid format",
    ]

    def __init__(self, config: Optional[RetryConfig] = None):
        """
        Initialize the retry manager.

        Args:
            config: Retry configuration (uses defaults if not provided)
        """
        self.config = config or RetryConfig()
        self._retry_history: Dict[str, List[RetryRecord]] = {}

    def should_retry(self, task: ChapterTask, error_message: str) -> bool:
        """
        Determine if a task should be retried.

        Args:
            task: The failed task
            error_message: The error message

        Returns:
            True if the task should be retried, False otherwise
        """
        # Retry budget exhausted?
        if task.retry_count >= self.config.max_retries:
            return False

        # Hard failures (auth, quota, bad input) are never retried.
        if self._is_non_retryable_error(error_message):
            return False

        error_lower = error_message.lower()

        # Config-specific error classes are checked BEFORE the generic
        # retryable patterns: "timeout"/"connection"/"network" also appear
        # in RETRYABLE_PATTERNS, so checking the generic list first would
        # make retry_on_timeout / retry_on_network_error unreachable.
        if "timeout" in error_lower:
            return self.config.retry_on_timeout

        if "network" in error_lower or "connection" in error_lower:
            return self.config.retry_on_network_error

        if "translation" in error_lower:
            return self.config.retry_on_translation_error

        # Remaining transient-looking errors (rate limit, 5xx, ...)
        if self._is_retryable_error(error_message):
            return True

        # Default: don't retry unknown errors
        return False

    def get_retry_delay(self, attempt_number: int) -> float:
        """
        Calculate delay before retry.

        Uses exponential backoff if enabled (no jitter is applied).

        Args:
            attempt_number: The retry attempt number (1-based)

        Returns:
            Delay in seconds
        """
        if not self.config.exponential_backoff:
            return self.config.base_delay

        # base * multiplier^(attempt-1), capped at max_delay
        delay = self.config.base_delay * (
            self.config.backoff_multiplier ** (attempt_number - 1)
        )
        return min(delay, self.config.max_delay)

    async def wait_before_retry(self, attempt_number: int) -> None:
        """
        Wait (asynchronously) before retrying.

        Args:
            attempt_number: The retry attempt number
        """
        delay = self.get_retry_delay(attempt_number)
        await asyncio.sleep(delay)

    def wait_before_retry_sync(self, attempt_number: int) -> None:
        """
        Synchronous wait before retrying.

        Args:
            attempt_number: The retry attempt number
        """
        delay = self.get_retry_delay(attempt_number)
        time.sleep(delay)

    def record_retry(
        self,
        chapter_id: str,
        attempt_number: int,
        error_message: str,
        delay_used: float,
        success: bool = False
    ) -> RetryRecord:
        """
        Record a retry attempt in the per-chapter history.

        Args:
            chapter_id: The chapter ID
            attempt_number: The attempt number
            error_message: The error message ("" for successful attempts)
            delay_used: Delay used before this retry
            success: Whether this attempt succeeded

        Returns:
            The created RetryRecord
        """
        record = RetryRecord(
            chapter_id=chapter_id,
            attempt_number=attempt_number,
            error_message=error_message,
            timestamp=datetime.now(),
            delay_used=delay_used,
            success=success
        )
        self._retry_history.setdefault(chapter_id, []).append(record)
        return record

    def get_retry_history(self, chapter_id: str) -> List[RetryRecord]:
        """
        Get retry history for a chapter.

        Args:
            chapter_id: The chapter ID

        Returns:
            Copy of the list of RetryRecords for the chapter
        """
        return self._retry_history.get(chapter_id, []).copy()

    def clear_retry_history(self, chapter_id: Optional[str] = None) -> None:
        """
        Clear retry history.

        Args:
            chapter_id: Specific chapter to clear, or None to clear all
        """
        if chapter_id:
            self._retry_history.pop(chapter_id, None)
        else:
            self._retry_history.clear()

    async def retry_task(
        self,
        task: ChapterTask,
        retry_func: Callable[[], Any],
        error_message: str
    ) -> tuple[bool, Any]:
        """
        Retry a failed task (async), waiting out the backoff delay first.

        Args:
            task: The task to retry
            retry_func: Async function to call for retry
            error_message: The original error message

        Returns:
            Tuple of (success, result_or_error)
        """
        if not self.should_retry(task, error_message):
            return False, error_message

        attempt = task.retry_count + 1
        delay = self.get_retry_delay(attempt)

        # Wait before retry
        await self.wait_before_retry(attempt)

        try:
            result = await retry_func()
            self.record_retry(task.chapter_id, attempt, "", delay, success=True)
            return True, result
        except Exception as e:
            new_error = str(e)
            self.record_retry(task.chapter_id, attempt, new_error, delay, success=False)
            return False, new_error

    def retry_task_sync(
        self,
        task: ChapterTask,
        retry_func: Callable[[], Any],
        error_message: str
    ) -> tuple[bool, Any]:
        """
        Synchronously retry a failed task, waiting out the backoff first.

        Args:
            task: The task to retry
            retry_func: Function to call for retry
            error_message: The original error message

        Returns:
            Tuple of (success, result_or_error)
        """
        if not self.should_retry(task, error_message):
            return False, error_message

        attempt = task.retry_count + 1
        delay = self.get_retry_delay(attempt)

        # Wait before retry
        self.wait_before_retry_sync(attempt)

        try:
            result = retry_func()
            self.record_retry(task.chapter_id, attempt, "", delay, success=True)
            return True, result
        except Exception as e:
            new_error = str(e)
            self.record_retry(task.chapter_id, attempt, new_error, delay, success=False)
            return False, new_error

    def get_stats(self) -> Dict[str, Any]:
        """
        Get retry statistics.

        Returns:
            Dictionary with total/successful/failed retry counts and the
            number of chapters that have retry history
        """
        total_retries = sum(len(records) for records in self._retry_history.values())
        successful_retries = sum(
            sum(1 for r in records if r.success)
            for records in self._retry_history.values()
        )
        failed_retries = total_retries - successful_retries

        return {
            "total_retries": total_retries,
            "successful_retries": successful_retries,
            "failed_retries": failed_retries,
            "chapters_with_retries": len(self._retry_history),
        }

    def _is_retryable_error(self, error_message: str) -> bool:
        """Check if an error matches a known-transient pattern."""
        error_lower = error_message.lower()
        return any(pattern in error_lower for pattern in self.RETRYABLE_PATTERNS)

    def _is_non_retryable_error(self, error_message: str) -> bool:
        """Check if an error matches a known-permanent pattern."""
        error_lower = error_message.lower()
        return any(pattern in error_lower for pattern in self.NON_RETRYABLE_PATTERNS)

    def process_queue_for_retries(
        self,
        queue: TaskQueue,
        retry_func: Callable[[ChapterTask], Any]
    ) -> List[ChapterTask]:
        """
        Re-queue all eligible failed tasks in a queue.

        Eligible tasks are reset to pending via ``queue.reset_for_retry``
        so the scheduler will pick them up again.

        NOTE(review): ``retry_func`` is accepted for interface
        compatibility but is not invoked here — the actual re-execution
        happens when the scheduler drains the queue. Confirm callers do
        not expect it to be called per task.

        Args:
            queue: The task queue
            retry_func: Retry callable (currently unused, see note)

        Returns:
            List of tasks that were reset for retry
        """
        retried = []
        failed_tasks = queue.get_failed_tasks()

        for task in failed_tasks:
            if task.can_retry and self.should_retry(task, task.error_message or ""):
                queue.reset_for_retry(task.chapter_id)
                retried.append(task)

        return retried

+ 345 - 0
src/scheduler/task_queue.py

@@ -0,0 +1,345 @@
+"""
+Task queue management for chapter processing.
+
+This module provides a thread-safe queue for managing chapter tasks
+during pipeline execution.
+"""
+
import threading
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional

from .models import ChapterTask, TaskStatus
+
+
@dataclass
class TaskStats:
    """Snapshot of task counts by status for a TaskQueue."""

    total: int = 0        # All tasks currently tracked by the queue
    pending: int = 0      # Waiting to be dispatched
    in_progress: int = 0  # Currently being processed
    completed: int = 0    # Finished successfully
    failed: int = 0       # Finished with an error
    skipped: int = 0      # Deliberately not processed
+
+
class TaskQueue:
    """
    Thread-safe queue for managing chapter tasks.

    This queue supports:
    - Adding chapters to be processed
    - Getting the next pending chapter (FIFO; retried tasks jump the queue)
    - Marking chapters as completed/failed/retrying
    - Tracking task statistics
    - Querying tasks by status

    Thread-safe: every public operation is serialized on an internal lock.
    """

    def __init__(self):
        """Initialize an empty task queue."""
        # chapter_id -> task, for O(1) lookup.
        self._tasks: Dict[str, ChapterTask] = {}
        # FIFO of chapter IDs awaiting dispatch.  Entries may become stale
        # (task no longer PENDING); get_next_pending() skips those.
        self._pending_order: deque[str] = deque()
        self._lock = threading.Lock()
        # Chapter ID of the task currently IN_PROGRESS, if any.
        self._current_task_id: Optional[str] = None

    def add_chapter(
        self,
        chapter_id: str,
        chapter_index: int,
        title: str,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> ChapterTask:
        """
        Add a chapter to the queue.

        Args:
            chapter_id: Unique identifier for the chapter
            chapter_index: Zero-based index of the chapter
            title: Chapter title
            content: Original content
            metadata: Optional metadata.
                NOTE(review): currently not stored on the created task —
                confirm whether ChapterTask should carry it.

        Returns:
            The created ChapterTask
        """
        with self._lock:
            task = ChapterTask(
                chapter_id=chapter_id,
                chapter_index=chapter_index,
                title=title,
                original_content=content
            )

            self._tasks[chapter_id] = task
            self._pending_order.append(chapter_id)
            return task

    def add_task(self, task: ChapterTask) -> None:
        """
        Add an existing task to the queue.

        Duplicate chapter IDs are silently ignored; only PENDING tasks are
        placed on the dispatch queue.

        Args:
            task: The ChapterTask to add
        """
        with self._lock:
            if task.chapter_id not in self._tasks:
                self._tasks[task.chapter_id] = task
                if task.status == TaskStatus.PENDING:
                    self._pending_order.append(task.chapter_id)

    def get_next_pending(self) -> Optional[ChapterTask]:
        """
        Get the next pending chapter task.

        Marks the returned task as IN_PROGRESS, stamps its start time on
        first dispatch, and records it as the current task.

        Returns:
            The next pending ChapterTask, or None if no pending tasks
        """
        with self._lock:
            while self._pending_order:
                chapter_id = self._pending_order.popleft()
                task = self._tasks.get(chapter_id)

                # Skip stale queue entries (removed tasks or tasks that
                # already moved past PENDING).
                if task and task.status == TaskStatus.PENDING:
                    task.status = TaskStatus.IN_PROGRESS
                    # Fix: the previous expression
                    # `task.started_at or task.__class__.__dict__.get('started_at')`
                    # was a no-op and never recorded a start time.
                    if task.started_at is None:
                        task.started_at = datetime.now()
                    self._current_task_id = chapter_id
                    return task

            return None

    def get_task(self, chapter_id: str) -> Optional[ChapterTask]:
        """
        Get a task by ID.

        Args:
            chapter_id: The chapter ID

        Returns:
            The ChapterTask if found, None otherwise
        """
        with self._lock:
            return self._tasks.get(chapter_id)

    def get_current_task(self) -> Optional[ChapterTask]:
        """
        Get the currently processing task.

        Returns:
            The current ChapterTask, or None if no current task
        """
        with self._lock:
            if self._current_task_id:
                return self._tasks.get(self._current_task_id)
            return None

    def mark_completed(self, chapter_id: str, translated_content: str) -> bool:
        """
        Mark a chapter as completed and store its translation.

        Args:
            chapter_id: The chapter ID
            translated_content: The translated content

        Returns:
            True if the chapter was found and marked, False otherwise
        """
        with self._lock:
            task = self._tasks.get(chapter_id)
            if task:
                task.status = TaskStatus.COMPLETED
                task.translated_content = translated_content
                if self._current_task_id == chapter_id:
                    self._current_task_id = None
                return True
            return False

    def mark_failed(self, chapter_id: str, error: str) -> bool:
        """
        Mark a chapter as failed and record the error.

        Args:
            chapter_id: The chapter ID
            error: Error message

        Returns:
            True if the chapter was found and marked, False otherwise
        """
        with self._lock:
            task = self._tasks.get(chapter_id)
            if task:
                task.status = TaskStatus.FAILED
                task.error_message = error
                if self._current_task_id == chapter_id:
                    self._current_task_id = None
                return True
            return False

    def mark_retrying(self, chapter_id: str) -> bool:
        """
        Mark a failed chapter as retrying and bump its retry count.

        Args:
            chapter_id: The chapter ID

        Returns:
            True if the chapter was found in FAILED state and marked,
            False otherwise
        """
        with self._lock:
            task = self._tasks.get(chapter_id)
            if task and task.status == TaskStatus.FAILED:
                task.status = TaskStatus.RETRYING
                task.retry_count += 1
                # Fix: previously returned the (always-truthy) retry count,
                # contradicting the declared bool return type.
                return True
            return False

    def reset_for_retry(self, chapter_id: str) -> bool:
        """
        Reset a failed chapter to pending for retry.

        The chapter is placed at the FRONT of the dispatch queue so the
        retry runs before untouched chapters.

        Args:
            chapter_id: The chapter ID

        Returns:
            True if the chapter was reset, False otherwise
        """
        with self._lock:
            task = self._tasks.get(chapter_id)
            if task and task.status == TaskStatus.FAILED:
                task.status = TaskStatus.PENDING
                task.error_message = None
                self._pending_order.appendleft(chapter_id)
                return True
            return False

    def get_stats(self) -> TaskStats:
        """
        Get queue statistics.

        Returns:
            TaskStats object with the current per-status counts
        """
        with self._lock:
            stats = TaskStats(total=len(self._tasks))

            # (Removed a dead `stats.total += 0` statement here.)
            for task in self._tasks.values():
                if task.status == TaskStatus.PENDING:
                    stats.pending += 1
                elif task.status == TaskStatus.IN_PROGRESS:
                    stats.in_progress += 1
                elif task.status == TaskStatus.COMPLETED:
                    stats.completed += 1
                elif task.status == TaskStatus.FAILED:
                    stats.failed += 1
                elif task.status == TaskStatus.SKIPPED:
                    stats.skipped += 1

            return stats

    def get_tasks_by_status(self, status: TaskStatus) -> List[ChapterTask]:
        """
        Get all tasks with a specific status.

        Args:
            status: The status to filter by

        Returns:
            List of ChapterTasks with the given status
        """
        with self._lock:
            return [task for task in self._tasks.values() if task.status == status]

    def get_failed_tasks(self) -> List[ChapterTask]:
        """
        Get all failed tasks.

        Returns:
            List of failed ChapterTasks
        """
        return self.get_tasks_by_status(TaskStatus.FAILED)

    def get_completed_tasks(self) -> List[ChapterTask]:
        """
        Get all completed tasks.

        Returns:
            List of completed ChapterTasks
        """
        return self.get_tasks_by_status(TaskStatus.COMPLETED)

    def get_pending_tasks(self) -> List[ChapterTask]:
        """
        Get all pending tasks.

        Returns:
            List of pending ChapterTasks
        """
        return self.get_tasks_by_status(TaskStatus.PENDING)

    def get_all_tasks(self) -> List[ChapterTask]:
        """
        Get all tasks sorted by index.

        Returns:
            List of all ChapterTasks sorted by chapter_index
        """
        with self._lock:
            return sorted(
                self._tasks.values(),
                key=lambda t: t.chapter_index
            )

    def has_pending(self) -> bool:
        """
        Check if there are pending tasks.

        Returns:
            True if there are pending tasks, False otherwise
        """
        with self._lock:
            return any(
                task.status == TaskStatus.PENDING
                for task in self._tasks.values()
            )

    def is_complete(self) -> bool:
        """
        Check if all tasks are finished (success, failure, or skipped).

        An empty queue reports False by design.

        Returns:
            True if all tasks are finished, False otherwise
        """
        with self._lock:
            return all(
                task.is_finished
                for task in self._tasks.values()
            ) and len(self._tasks) > 0

    def clear(self) -> None:
        """Clear all tasks from the queue."""
        with self._lock:
            self._tasks.clear()
            self._pending_order.clear()
            self._current_task_id = None

    def __len__(self) -> int:
        """Get the number of tasks in the queue."""
        with self._lock:
            return len(self._tasks)

    def __iter__(self) -> Iterator[ChapterTask]:
        """Iterate over all tasks in index order."""
        return iter(self.get_all_tasks())

    def __contains__(self, chapter_id: str) -> bool:
        """Check if a chapter ID is in the queue."""
        with self._lock:
            return chapter_id in self._tasks

+ 459 - 0
tests/scheduler/test_integration.py

@@ -0,0 +1,459 @@
+"""
+Integration tests for the full pipeline scheduler.
+"""
+
+import pytest
+import asyncio
+from pathlib import Path
+from datetime import datetime
+from unittest.mock import Mock, MagicMock, patch
+
+# Mock torch before importing anything that depends on it
+import sys
+sys.modules['torch'] = MagicMock()
+sys.modules['transformers'] = MagicMock()
+
+from src.scheduler import (
+    TaskQueue,
+    RetryManager,
+    ProgressNotifier,
+    ConsoleProgressObserver,
+    CallbackProgressObserver,
+    RecoveryManager,
+    ChapterTask,
+    TaskStatus,
+    SchedulerState,
+    RetryConfig,
+    PipelineProgress,
+    CheckpointData
+)
+
+
@pytest.fixture
def temp_work_dir(tmp_path):
    """Provide an empty 'work' directory under pytest's tmp_path."""
    target = tmp_path / "work"
    target.mkdir()
    return target
+
+
@pytest.fixture
def sample_novel_file(tmp_path):
    """Create a sample novel file for testing.

    NOTE(review): not referenced by any test visible in this module —
    presumably intended for end-to-end pipeline tests; confirm or remove.
    """
    novel_file = tmp_path / "novel.txt"
    # Two short chapters in the common "第N章" heading format.
    content = """第一章 开始

这是第一章的内容,包含一些文字。

林风站在山顶,看着远方的城市。

第二章 继续

这是第二章的内容。

他开始了新的旅程。"""
    novel_file.write_text(content, encoding="utf-8")
    return novel_file
+
+
class TestTaskQueue:
    """Test suite for TaskQueue state transitions, stats, and iteration."""

    def test_add_and_get_task(self):
        """Test adding and retrieving tasks."""
        queue = TaskQueue()
        task = queue.add_chapter(
            "ch1",
            0,
            "第一章",
            "这是内容"
        )

        # Newly added chapters start in PENDING state.
        assert task.chapter_id == "ch1"
        assert task.status == TaskStatus.PENDING

    def test_get_next_pending(self):
        """Test getting next pending task."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容1")
        queue.add_chapter("ch2", 1, "第二章", "内容2")

        # FIFO dispatch: first added comes out first, as IN_PROGRESS.
        task = queue.get_next_pending()
        assert task is not None
        assert task.chapter_id == "ch1"
        assert task.status == TaskStatus.IN_PROGRESS

    def test_mark_completed(self):
        """Test marking task as completed."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容")

        queue.get_next_pending()
        queue.mark_completed("ch1", "Translated content")

        retrieved = queue.get_task("ch1")
        assert retrieved.status == TaskStatus.COMPLETED
        assert retrieved.translated_content == "Translated content"

    def test_mark_failed(self):
        """Test marking task as failed."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容")

        queue.get_next_pending()
        queue.mark_failed("ch1", "Translation failed")

        retrieved = queue.get_task("ch1")
        assert retrieved.status == TaskStatus.FAILED
        assert "Translation failed" in retrieved.error_message

    def test_get_stats(self):
        """Test getting queue statistics."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容1")
        queue.add_chapter("ch2", 1, "第二章", "内容2")

        stats = queue.get_stats()
        assert stats.total == 2
        assert stats.pending == 2

    def test_reset_for_retry(self):
        """Test resetting task for retry."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容")

        queue.get_next_pending()
        queue.mark_failed("ch1", "Error")
        # A failed task can be moved back to PENDING for another attempt.
        assert queue.reset_for_retry("ch1") is True

        task = queue.get_task("ch1")
        assert task.status == TaskStatus.PENDING

    def test_has_pending(self):
        """Test checking for pending tasks."""
        queue = TaskQueue()
        assert not queue.has_pending()

        queue.add_chapter("ch1", 0, "第一章", "内容")
        assert queue.has_pending()

    def test_is_complete(self):
        """Test checking if queue is complete."""
        queue = TaskQueue()
        queue.add_chapter("ch1", 0, "第一章", "内容")

        assert not queue.is_complete()

        queue.get_next_pending()
        queue.mark_completed("ch1", "Translated")
        assert queue.is_complete()

    def test_iteration(self):
        """Test iterating over tasks."""
        queue = TaskQueue()
        # Added out of order on purpose: iteration must sort by index.
        queue.add_chapter("ch2", 1, "第二章", "内容2")
        queue.add_chapter("ch1", 0, "第一章", "内容1")

        tasks = list(queue)
        assert len(tasks) == 2
        assert tasks[0].chapter_index == 0
        assert tasks[1].chapter_index == 1
+
+
class TestRetryManager:
    """Test suite for RetryManager policy, backoff, and history tracking."""

    def test_should_retry_on_timeout(self):
        """Test retry decision for timeout errors."""
        manager = RetryManager()
        task = ChapterTask("ch1", 0, "第一章", "内容")
        task.retry_count = 1

        # Timeouts are classified as retryable.
        assert manager.should_retry(task, "Request timeout") is True

    def test_should_not_retry_exceeded(self):
        """Test no retry after max attempts."""
        manager = RetryManager()
        task = ChapterTask("ch1", 0, "第一章", "内容")
        # Assumes the default max attempts is 3 — depends on RetryConfig defaults.
        task.retry_count = 3

        assert manager.should_retry(task, "Error") is False

    def test_get_retry_delay_exponential(self):
        """Test exponential backoff delay calculation."""
        config = RetryConfig(exponential_backoff=True, base_delay=1.0)
        manager = RetryManager(config)

        # base_delay * 2**(attempt-1)
        assert manager.get_retry_delay(1) == 1.0
        assert manager.get_retry_delay(2) == 2.0
        assert manager.get_retry_delay(3) == 4.0

    def test_get_retry_delay_linear(self):
        """Test linear delay calculation."""
        config = RetryConfig(exponential_backoff=False, base_delay=2.0)
        manager = RetryManager(config)

        # Constant delay regardless of attempt number.
        assert manager.get_retry_delay(1) == 2.0
        assert manager.get_retry_delay(2) == 2.0

    def test_max_delay_cap(self):
        """Test that delay is capped at max_delay."""
        config = RetryConfig(
            exponential_backoff=True,
            base_delay=1.0,
            max_delay=5.0
        )
        manager = RetryManager(config)

        # Should cap at 5.0
        assert manager.get_retry_delay(10) <= 5.0

    def test_record_and_get_history(self):
        """Test recording and retrieving retry history."""
        manager = RetryManager()
        record = manager.record_retry("ch1", 1, "Error", 1.0)

        assert record.chapter_id == "ch1"
        assert record.attempt_number == 1

        history = manager.get_retry_history("ch1")
        assert len(history) == 1

    def test_clear_retry_history(self):
        """Test clearing retry history."""
        manager = RetryManager()
        manager.record_retry("ch1", 1, "Error", 1.0)
        manager.clear_retry_history("ch1")

        history = manager.get_retry_history("ch1")
        assert len(history) == 0

    def test_get_stats(self):
        """Test getting retry statistics."""
        manager = RetryManager()
        # One successful and one failed retry across two chapters.
        manager.record_retry("ch1", 1, "Error", 1.0, success=True)
        manager.record_retry("ch2", 1, "Error", 1.0, success=False)

        stats = manager.get_stats()
        assert stats["total_retries"] == 2
        assert stats["successful_retries"] == 1
        assert stats["failed_retries"] == 1
+
+
class TestProgressNotifier:
    """Test suite for ProgressNotifier."""

    def test_register_and_notify(self):
        """Test registering observer and sending notifications."""
        notifier = ProgressNotifier()
        observer = CallbackProgressObserver(
            on_start=lambda total: None,
            on_chapter_complete=lambda task: None
        )

        notifier.register(observer)
        assert notifier.observer_count == 1

        # Delivery to a registered no-op observer must not raise.
        notifier.notify_chapter_complete(ChapterTask("ch1", 0, "第一章", "内容"))

    def test_unregister_observer(self):
        """Test unregistering observer."""
        notifier = ProgressNotifier()
        observer = CallbackProgressObserver()

        notifier.register(observer)
        assert notifier.observer_count == 1

        notifier.unregister(observer)
        assert notifier.observer_count == 0

    def test_event_history(self):
        """Test event history tracking."""
        notifier = ProgressNotifier()
        task = ChapterTask("ch1", 0, "第一章", "内容")

        # Two notifications should produce two recorded events.
        notifier.notify_chapter_start(task)
        notifier.notify_chapter_complete(task)

        assert len(notifier.get_event_history()) == 2

    def test_clear_observers(self):
        """Test clearing all observers."""
        notifier = ProgressNotifier()
        for _ in range(2):
            notifier.register(CallbackProgressObserver())
        assert notifier.observer_count == 2

        notifier.clear_observers()
        assert notifier.observer_count == 0
+
+
class TestRecoveryManager:
    """Test suite for RecoveryManager checkpoint save/load/delete and resume."""

    def test_save_and_load_checkpoint(self, temp_work_dir):
        """Test saving and loading checkpoint."""
        recovery = RecoveryManager(temp_work_dir)

        checkpoint = CheckpointData(
            work_id="work123",
            current_chapter_index=5,
            completed_indices=[0, 1, 2, 3, 4],
            failed_indices=[]
        )

        recovery.save_checkpoint(checkpoint)
        assert recovery.has_checkpoint() is True

        # Round-trip: loaded checkpoint must match what was saved.
        loaded = recovery.load_checkpoint()
        assert loaded is not None
        assert loaded.work_id == "work123"
        assert loaded.current_chapter_index == 5

    def test_delete_checkpoint(self, temp_work_dir):
        """Test deleting checkpoint."""
        recovery = RecoveryManager(temp_work_dir)

        checkpoint = CheckpointData(
            work_id="work123",
            current_chapter_index=0,
            completed_indices=[],
            failed_indices=[]
        )

        recovery.save_checkpoint(checkpoint)
        assert recovery.has_checkpoint() is True

        recovery.delete_checkpoint()
        assert recovery.has_checkpoint() is False

    def test_get_recovery_state(self, temp_work_dir):
        """Test getting recovery state."""
        recovery = RecoveryManager(temp_work_dir)

        checkpoint = CheckpointData(
            work_id="work123",
            current_chapter_index=3,
            completed_indices=[0, 1, 2],
            failed_indices=[]
        )

        recovery.save_checkpoint(checkpoint)
        state = recovery.get_recovery_state()

        # Resume should continue from the chapter after the last completed ones.
        assert state is not None
        assert state["recoverable"] is True
        assert state["work_id"] == "work123"
        assert state["resume_index"] == 3

    def test_can_resume(self, temp_work_dir):
        """Test checking if resume is possible."""
        recovery = RecoveryManager(temp_work_dir)

        # No checkpoint on disk yet -> nothing to resume.
        assert recovery.can_resume() is False

        checkpoint = CheckpointData(
            work_id="work123",
            current_chapter_index=0,
            completed_indices=[],
            failed_indices=[]
        )

        recovery.save_checkpoint(checkpoint)
        assert recovery.can_resume() is True
+
+
class TestModels:
    """Test suite for scheduler models."""

    def test_chapter_task_model(self):
        """Test ChapterTask model."""
        task = ChapterTask("ch1", 0, "第一章", "内容")

        # A fresh task is neither finished nor retryable.
        assert task.is_finished is False
        assert task.can_retry is False

        task.status = TaskStatus.COMPLETED
        assert task.is_finished is True

    def test_task_status_enum(self):
        """Test TaskStatus enum values."""
        expected = {
            TaskStatus.PENDING: "pending",
            TaskStatus.IN_PROGRESS: "in_progress",
            TaskStatus.COMPLETED: "completed",
            TaskStatus.FAILED: "failed",
        }
        for status, value in expected.items():
            assert status.value == value

    def test_scheduler_state_enum(self):
        """Test SchedulerState enum values."""
        expected = {
            SchedulerState.IDLE: "idle",
            SchedulerState.RUNNING: "running",
            SchedulerState.PAUSED: "paused",
            SchedulerState.COMPLETED: "completed",
        }
        for state, value in expected.items():
            assert state.value == value

    def test_pipeline_progress_model(self):
        """Test PipelineProgress model."""
        progress = PipelineProgress(
            total_chapters=10,
            completed_chapters=5
        )

        # 5 of 10 done -> 5 pending, 50% complete.
        assert progress.pending_chapters == 5
        assert progress.completion_rate == 0.5

    def test_checkpoint_data_model(self):
        """Test CheckpointData model."""
        checkpoint = CheckpointData(
            work_id="work123",
            current_chapter_index=5,
            completed_indices=[0, 1, 2, 3, 4],
            failed_indices=[]
        )

        assert checkpoint.work_id == "work123"
        assert checkpoint.current_chapter_index == 5
+
+
class TestConsoleProgressObserver:
    """Test suite for ConsoleProgressObserver."""

    def test_observer_creation(self):
        """Test creating console observer."""
        quiet = ConsoleProgressObserver(verbose=False)
        assert quiet.verbose is False

    def test_observer_methods_exist(self):
        """Test that all observer methods exist."""
        observer = ConsoleProgressObserver()

        # The observer interface hooks must all be present.
        for hook in ("on_pipeline_start", "on_chapter_complete", "on_chapter_failed"):
            assert hasattr(observer, hook)
+
+
class TestCallbackProgressObserver:
    """Test suite for CallbackProgressObserver."""

    def test_callback_invocation(self):
        """Test that callbacks are invoked."""
        calls = []

        observer = CallbackProgressObserver(
            on_start=lambda total: calls.append("start"),
            on_chapter_complete=lambda task: calls.append("complete")
        )

        # Firing the hooks must route through to the registered callbacks.
        observer.on_pipeline_start(10)
        observer.on_chapter_complete(ChapterTask("ch1", 0, "Title", "Content"))

        assert "start" in calls
        assert "complete" in calls