| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- """
- Data models for the glossary module.
- This module defines the core data structures for terminology management.
- """
- from dataclasses import dataclass
- from typing import Dict, List, Optional
- from enum import Enum
class TermCategory(Enum):
    """Closed set of kinds a glossary term can be filed under.

    Each member's value is the lowercase string form of its name.
    """

    # Names of characters, e.g. 林风.
    CHARACTER = "character"
    # Names of skills, e.g. 火球术.
    SKILL = "skill"
    # Names of places, e.g. 东方大陆.
    LOCATION = "location"
    # Names of items, e.g. 龙剑.
    ITEM = "item"
    # Names of organizations, e.g. 魔法学院.
    ORGANIZATION = "organization"
    # Any term that fits none of the categories above.
    OTHER = "other"
@dataclass
class GlossaryEntry:
    """
    One source-to-target term mapping stored in a glossary.

    Construction raises ``ValueError`` when the source or the target term
    is empty or consists only of whitespace.

    Attributes:
        source: The term as written in the source language
        target: The translation of the term in the target language
        category: The TermCategory the term is filed under
        context: Optional free-form context for the term (defaults to "")
    """

    source: str
    target: str
    category: TermCategory
    context: str = ""

    def __post_init__(self):
        """Reject entries whose source or target term is blank."""
        # Source is listed first, so its error wins when both terms are blank.
        checks = (
            (self.source, "Source term cannot be empty"),
            (self.target, "Target term cannot be empty"),
        )
        for term, message in checks:
            if not (term and term.strip()):
                raise ValueError(message)

    @property
    def length(self) -> int:
        """Number of characters in the source term."""
        term = self.source
        return len(term)
class Glossary:
    """
    In-memory collection of terminology translations.

    Entries are indexed by their source term, so each source term maps to
    at most one entry. Keeping a single mapping per term is what allows a
    document to be translated with consistent terminology throughout.
    """

    def __init__(self):
        """Create a glossary that holds no terms yet."""
        # Maps a source term to the entry registered for it.
        self._terms: Dict[str, GlossaryEntry] = {}

    def add(self, entry: GlossaryEntry) -> None:
        """
        Register an entry under its source term.

        An entry already stored for the same source term is replaced.

        Args:
            entry: The GlossaryEntry to store
        """
        key = entry.source
        self._terms[key] = entry

    def get(self, source: str) -> Optional[GlossaryEntry]:
        """
        Look up the entry registered for a source term.

        Args:
            source: The source term to look up

        Returns:
            The matching GlossaryEntry, or None when the term is unknown
        """
        try:
            return self._terms[source]
        except KeyError:
            return None

    def remove(self, source: str) -> bool:
        """
        Delete the entry registered for a source term.

        Args:
            source: The source term to delete

        Returns:
            True when an entry was deleted, False when the term was unknown
        """
        try:
            del self._terms[source]
        except KeyError:
            return False
        return True

    def get_all(self) -> List[GlossaryEntry]:
        """
        Return every stored entry.

        Returns:
            A new list holding all GlossaryEntry objects
        """
        return [*self._terms.values()]

    def sort_by_length_desc(self) -> List[str]:
        """
        Return the source terms ordered from longest to shortest.

        Longest-match processing relies on this order: trying longer terms
        first keeps a shorter term from matching inside a longer one.

        Returns:
            A new list of source terms, longest first
        """
        ordered = list(self._terms)
        ordered.sort(key=len, reverse=True)
        return ordered

    def __len__(self) -> int:
        """Return how many terms the glossary holds."""
        count = len(self._terms)
        return count

    def __contains__(self, source: str) -> bool:
        """Report whether an entry exists for the given source term."""
        return source in self._terms.keys()
|