Update translator.py

translator.py (CHANGED: +687, −32)
Diff hunks (old side of the split view; removed lines are marked with "-", several of them cut off by the viewer):

@@ -30,6 +30,7 @@ def check_library(name, import_stmt):
         return False

 HAS_DOCX = check_library("python-docx", "from docx import Document; from docx.shared import Pt, RGBColor; from docx.text.paragraph import Paragraph; from docx.oxml.shared import OxmlElement; from docx.oxml.ns import qn")
 HAS_TORCH = check_library("torch", "import torch")
 HAS_CT2 = check_library("CTranslate2", "import ctranslate2; from huggingface_hub import snapshot_download")
 HAS_TRANSFORMERS = check_library("Transformers", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM")

@@ -967,6 +968,33 @@ class MultiAligner:
         logger.debug("Using heuristic fallback (no quality alignments found)")
         return []


 # ============================================================================
 # DOCUMENT TRANSLATOR

@@ -985,14 +1013,19 @@ class UltimateDocumentTranslator:
         aligner: Optional[str] = None,
         nllb_model_size: str = "600M"
     ):
         self.src_lang, self.tgt_lang, self.mode = src_lang, tgt_lang, mode
-        self.ct2 = None
-        self.nllb = None
         self.llm = None
         self.aligner = None
         self.opus = None
         self.madlad = None

         logger.info(f"INIT | Starting Translator ({src_lang}→{tgt_lang})")
         logger.info(f"INIT | Mode: {mode.value} | NMT: {nmt_backend} | Aligner: {aligner or 'auto'}")
         self.log_memory("Initialization Start")

@@ -1061,6 +1094,36 @@ class UltimateDocumentTranslator:
         except Exception as e:
             logger.debug(f"Memory log failed: {e}")

     async def translate_text(self, text: str) -> str:
         """Routes text through the active neural engine chain."""
         if not text.strip(): return text

@@ -1466,7 +1529,10 @@ class UltimateDocumentTranslator:
             logger.debug(f" > Indent-L: {pf.left_indent.pt if pf.left_indent else 0:.1f}pt | Spacing-A: {pf.space_after.pt if pf.space_after else 0:.1f}pt")

     async def translate_document(self, input_path: Path, output_path: Path):
-        """
         self.log_memory("Initialization")
         doc = Document(str(input_path))


@@ -1504,6 +1570,581 @@ class UltimateDocumentTranslator:
         self.log_document_info(Document(str(output_path)), "OUTPUT")
         logger.info("✓ Document Translation Complete.")


 # ============================================================================
 # CLI

@@ -1520,36 +2161,35 @@ Backends Comparison:
   opus      - Specialized bilingual models. Tiny (~200MB), extremely fast, literal.
   ct2 (wmt) - Dense Facebook models. Peak German/European quality (~6GB RAM).

 Examples:
-  #
   %(prog)s input.docx output.docx -s en -t de

-  #
-  %(prog)s
-
-  # Maximum speed for EN-DE specialized pair (Opus-MT)
-  %(prog)s input.docx output.docx -s en -t de --nmt opus

-  #
-  %(prog)s

-  #
-  %(prog)s

 Environment Variables:
   OPENAI_API_KEY, ANTHROPIC_API_KEY - Required for LLM backends.
 """
     )

-    # 1. POSITIONAL ARGUMENTS
-    parser.add_argument('input', help='Input .docx
-    parser.add_argument('output', help='Output .docx

-    # 2. LANGUAGE ARGUMENTS
     parser.add_argument('-s', '--source', default='en', help='Source language code (default: en)')
     parser.add_argument('-t', '--target', default='de', help='Target language code (default: de)')

-    # 3. TRANSLATION MODE
     parser.add_argument(
         '--mode',
         choices=['nmt', 'llm-align', 'llm-plain', 'hybrid'],

@@ -1557,7 +2197,6 @@ Environment Variables:
         help='Translation strategy (default: hybrid)'
     )

-    # 4. NMT ENGINE SELECTION
     parser.add_argument(
         '--nmt',
         choices=['nllb', 'madlad', 'opus', 'ct2', 'auto'],

@@ -1572,14 +2211,12 @@ Environment Variables:
         help='NLLB variant only: 600M (fastest), 1.3B (balanced), 3.3B (heavy)'
     )

-    # 5. LLM PROVIDER
     parser.add_argument(
         '--llm',
         choices=['openai', 'anthropic', 'ollama'],
         help='LLM provider for hybrid/llm modes'
     )

-    # 6. ALIGNER SELECTION
     parser.add_argument(
         '--aligner',
         choices=['awesome', 'simalign', 'lindat', 'fast_align', 'heuristic', 'auto'],

@@ -1591,17 +2228,32 @@ Environment Variables:

     args = parser.parse_args()

-    # Set
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)

-    #
     input_path = Path(args.input)
     if not input_path.exists():
         logger.error(f"File not found: {input_path}")
         sys.exit(1)

-    #
     mode_map = {
         'nmt': TranslationMode.NMT_ONLY,
         'llm-align': TranslationMode.LLM_WITH_ALIGN,

@@ -1609,12 +2261,15 @@ Environment Variables:
         'hybrid': TranslationMode.HYBRID
     }

-    #
     print(f"\n{'='*60}")
     print(f"🌍 DOCUMENT TRANSLATOR - PRODUCTION v12")
     print(f"{'='*60}")
     print(f"Input: {input_path.name}")
-    print(f"Output: {
     print(f"Direction: {args.source.upper()} → {args.target.upper()}")
     print(f"Mode: {args.mode.upper()}")
     print(f"NMT Engine: {args.nmt.upper()} {'('+args.nllb_size+')' if args.nmt=='nllb' else ''}")

@@ -1623,7 +2278,7 @@ Environment Variables:
     print(f"LLM: {args.llm.upper()}")
     print(f"{'='*60}\n")

-    # Initialize
     translator = UltimateDocumentTranslator(
         src_lang=args.source,
         tgt_lang=args.target,

@@ -1634,16 +2289,16 @@ Environment Variables:
         nllb_model_size=args.nllb_size
     )

-    #
     try:
-        await translator.

         print(f"\n{'='*60}")
-        print(f"✅ Success!
-        print(f"💾 File saved to: {
         print(f"{'='*60}\n")
     except Exception as e:
-        logger.error(f"FAILED |
         sys.exit(1)
Updated file (new side of the diff; added lines are marked with "+"):

[new lines 30-36]
         return False

 HAS_DOCX = check_library("python-docx", "from docx import Document; from docx.shared import Pt, RGBColor; from docx.text.paragraph import Paragraph; from docx.oxml.shared import OxmlElement; from docx.oxml.ns import qn")
+HAS_PPTX = check_library("python-pptx", "from pptx import Presentation; from pptx.util import Pt, Inches; from pptx.enum.text import PP_ALIGN, MSO_VERTICAL_ANCHOR; from pptx.dml.color import RGBColor")
 HAS_TORCH = check_library("torch", "import torch")
 HAS_CT2 = check_library("CTranslate2", "import ctranslate2; from huggingface_hub import snapshot_download")
 HAS_TRANSFORMERS = check_library("Transformers", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM")
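Not part of the commit: check_library() only records availability (it returns False on a failed import), so callers are expected to test the flag before touching python-pptx. A hypothetical guard, shown purely as a sketch:

    # Assumed usage of the HAS_PPTX flag - fail fast with an actionable message.
    if not HAS_PPTX:
        raise RuntimeError(
            "python-pptx is required for .pptx input. Install with: pip install python-pptx"
        )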
[new lines 968-1000]
         logger.debug("Using heuristic fallback (no quality alignments found)")
         return []

+# ============================================================================
+# PPTX HANDLING
+# ============================================================================
+
+# ============================================================================
+# POWERPOINT-SPECIFIC STRUCTURES
+# ============================================================================
+
+@dataclass
+class TranslatableTextFrame:
+    """Text frame with positioning and shape metadata"""
+    paragraphs: List[TranslatableParagraph] = field(default_factory=list)
+    shape_metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def get_all_text(self) -> str:
+        return '\n'.join(p.get_text() for p in self.paragraphs)
+
+
+@dataclass
+class SlideMetadata:
+    """Slide-level properties"""
+    layout: Any = None
+    background: Any = None
+    notes: Optional[str] = None
+
+
+

 # ============================================================================
 # DOCUMENT TRANSLATOR
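A small usage sketch (not part of the commit) of the new container: FormatRun and TranslatableParagraph are the existing Word-side structures, and their constructors are inferred from how extract_ppt_paragraph() uses them later in this diff.

    # Build a one-paragraph frame by hand and read it back.
    run = FormatRun(text="Hello world", bold=True, italic=None, underline=None,
                    font_name="Calibri", font_size=18.0, font_color=(0, 0, 0))
    para = TranslatableParagraph(runs=[run])

    frame = TranslatableTextFrame()
    frame.paragraphs.append(para)
    frame.shape_metadata['name'] = "Title 1"   # same key extract_text_frame() records

    print(frame.get_all_text())   # "Hello world", assuming get_text() joins run texts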
[new lines 1013-1031]
         aligner: Optional[str] = None,
         nllb_model_size: str = "600M"
     ):
+        # Main initialization code
         self.src_lang, self.tgt_lang, self.mode = src_lang, tgt_lang, mode
+        self.ct2 = None
+        self.nllb = None
         self.llm = None
         self.aligner = None
         self.opus = None
         self.madlad = None
+
+        # File type tracking
+        self.current_file_type = None

+        # Rest of initialization unchanged...
         logger.info(f"INIT | Starting Translator ({src_lang}→{tgt_lang})")
         logger.info(f"INIT | Mode: {mode.value} | NMT: {nmt_backend} | Aligner: {aligner or 'auto'}")
         self.log_memory("Initialization Start")
[new lines 1094-1129]
         except Exception as e:
             logger.debug(f"Memory log failed: {e}")

+    # with format specific error messages
+    async def translate_file(self, input_path: Path, output_path: Path):
+        """Main entry point with enhanced error handling"""
+        try:
+            is_valid, file_type, error = self.validate_file(input_path)
+            if not is_valid:
+                raise ValueError(error)
+
+            self.current_file_type = file_type
+            logger.info(f"Processing {file_type.upper()} file: {input_path.name}")
+
+            if file_type == 'docx':
+                await self.translate_document(input_path, output_path)
+            elif file_type == 'pptx':
+                await self.translate_presentation(input_path, output_path)
+            else:
+                raise ValueError(f"Unsupported file type: {file_type}")
+
+        except ImportError as e:
+            if 'docx' in str(e) and self.current_file_type == 'docx':
+                logger.error("python-docx not installed. Install with: pip install python-docx")
+            elif 'pptx' in str(e) and self.current_file_type == 'pptx':
+                logger.error("python-pptx not installed. Install with: pip install python-pptx")
+            raise
+
+        except Exception as e:
+            format_name = "Word document" if self.current_file_type == 'docx' else "PowerPoint presentation"
+            logger.error(f"Failed to translate {format_name}: {e}")
+            raise
+
     async def translate_text(self, text: str) -> str:
         """Routes text through the active neural engine chain."""
         if not text.strip(): return text
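Not part of the commit: a minimal usage sketch for the new translate_file() entry point added above. The constructor keywords (src_lang, tgt_lang, mode, nmt_backend, aligner, nllb_model_size) and TranslationMode.NMT_ONLY are taken from elsewhere in this diff; their defaults and exact behaviour are assumptions.

    import asyncio
    from pathlib import Path

    async def main():
        translator = UltimateDocumentTranslator(
            src_lang="en",
            tgt_lang="de",
            mode=TranslationMode.NMT_ONLY,   # enum member used in mode_map further below
            nmt_backend="opus",
            aligner=None,
            nllb_model_size="600M",
        )
        # translate_file() validates the input, detects .docx vs .pptx,
        # and dispatches to translate_document() or translate_presentation().
        await translator.translate_file(Path("deck.pptx"), Path("deck_de.pptx"))

    asyncio.run(main())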
[new lines 1529-1538]
             logger.debug(f" > Indent-L: {pf.left_indent.pt if pf.left_indent else 0:.1f}pt | Spacing-A: {pf.space_after.pt if pf.space_after else 0:.1f}pt")

     async def translate_document(self, input_path: Path, output_path: Path):
+        """
+        Full Word document translation lifecycle with robust XML commitment and verification logs.
+        Called internally by translate_file().
+        """
         self.log_memory("Initialization")
         doc = Document(str(input_path))

[new lines 1570-1703]
         self.log_document_info(Document(str(output_path)), "OUTPUT")
         logger.info("✓ Document Translation Complete.")

+    # ============================================================================
+    # PPTX EXTRACTION METHODS
+    # ============================================================================
+
+    def extract_text_frame(self, shape) -> TranslatableTextFrame:
+        """
+        Extract text frame with shape positioning and paragraph hierarchy.
+        PPT equivalent of extract_paragraph().
+        """
+        from pptx.util import Pt, Inches
+
+        trans_frame = TranslatableTextFrame()
+
+        # Capture shape-level metadata (positioning, size, rotation)
+        trans_frame.shape_metadata = {
+            'left': shape.left,
+            'top': shape.top,
+            'width': shape.width,
+            'height': shape.height,
+            'rotation': shape.rotation,
+            'shape_type': shape.shape_type,
+            'name': shape.name
+        }
+
+        # Capture text frame properties
+        if hasattr(shape, 'text_frame'):
+            tf = shape.text_frame
+            trans_frame.shape_metadata['text_frame'] = {
+                'margin_left': tf.margin_left,
+                'margin_right': tf.margin_right,
+                'margin_top': tf.margin_top,
+                'margin_bottom': tf.margin_bottom,
+                'vertical_anchor': tf.vertical_anchor,
+                'word_wrap': tf.word_wrap,
+                'auto_size': tf.auto_size
+            }
+
+            # Extract each paragraph with runs
+            for para in tf.paragraphs:
+                trans_para = self.extract_ppt_paragraph(para)
+                trans_frame.paragraphs.append(trans_para)
+
+        return trans_frame
+
+
+    def extract_ppt_paragraph(self, para) -> TranslatableParagraph:
+        """
+        Extract PowerPoint paragraph with run-level formatting.
+        Similar to Word's extract_paragraph but for PPT-specific properties.
+        """
+        from pptx.util import Pt
+
+        # Resolve base font from theme/master
+        def get_resolved_ppt_font(p):
+            # Check runs first
+            for r in p.runs:
+                if r.font.name:
+                    return r.font.name
+            # Check theme defaults
+            try:
+                if hasattr(p, '_element') and hasattr(p._element, 'pPr'):
+                    # Theme font resolution logic here
+                    pass
+            except:
+                pass
+            return "Calibri"  # PPT default
+
+        resolved_font = get_resolved_ppt_font(para)
+
+        runs = []
+        for run in para.runs:
+            f_color = None
+            try:
+                if run.font.color and run.font.color.rgb:
+                    rgb = run.font.color.rgb
+                    f_color = (rgb[0], rgb[1], rgb[2])
+            except:
+                pass
+
+            runs.append(FormatRun(
+                text=run.text,
+                bold=run.font.bold,
+                italic=run.font.italic,
+                underline=run.font.underline,
+                font_name=run.font.name if run.font.name else resolved_font,
+                font_size=run.font.size.pt if run.font.size else 18.0,
+                font_color=f_color
+            ))
+
+        trans_para = TranslatableParagraph(runs=runs)
+
+        # Capture paragraph-level properties
+        trans_para.metadata['alignment'] = para.alignment
+        trans_para.metadata['level'] = para.level  # Indentation level
+        trans_para.metadata['line_spacing'] = para.line_spacing
+        trans_para.metadata['space_before'] = para.space_before
+        trans_para.metadata['space_after'] = para.space_after
+
+        return trans_para
+
+
+    def extract_table_from_slide(self, table) -> List[List[TranslatableTextFrame]]:
+        """
+        Extract table structure with cell-level text frames.
+        PPT tables are similar to Word but stored differently.
+        """
+        table_data = []
+
+        for row in table.rows:
+            row_data = []
+            for cell in row.cells:
+                cell_frame = self.extract_text_frame(cell)
+                row_data.append(cell_frame)
+            table_data.append(row_data)
+
+        return table_data
+
+
+    def get_speaker_notes(self, slide) -> Optional[str]:
+        """
+        Extract speaker notes from slide.
+        PPT equivalent of footnotes in Word.
+        """
+        try:
+            if slide.has_notes_slide:
+                notes_slide = slide.notes_slide
+                text_frame = notes_slide.notes_text_frame
+                return text_frame.text if text_frame.text.strip() else None
+        except:
+            return None
+
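A quick sketch (not part of the commit) of the extraction pass in isolation; Presentation, slide.shapes and shape.has_text_frame are standard python-pptx, while extract_text_frame() and get_speaker_notes() are the methods added above, called here on an already-initialized UltimateDocumentTranslator named translator (an assumption for the example).

    from pptx import Presentation

    prs = Presentation("deck.pptx")
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            frame = translator.extract_text_frame(shape)
            # shape_metadata keeps position/size; get_all_text() joins paragraph texts
            print(shape.name, "->", frame.get_all_text()[:60])
        print("notes:", translator.get_speaker_notes(slide))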
[new lines 1704-1759]
+    # ============================================================================
+    # PPTX SAFETY CHECKS
+    # ============================================================================
+
+    def is_shape_safe_to_translate(self, shape) -> bool:
+        """
+        Check if PowerPoint shape can be safely translated.
+        Equivalent to is_paragraph_safe_to_translate() for Word.
+        """
+        # Skip shapes without text frames
+        if not hasattr(shape, 'text_frame'):
+            return False
+
+        try:
+            text_frame = shape.text_frame
+
+            # Skip empty text frames
+            if not text_frame.text or not text_frame.text.strip():
+                return False
+
+            # Skip placeholder shapes with no actual content
+            if len(text_frame.text.strip()) <= 1:
+                return False
+
+            # Skip shapes that are likely logos or decorative
+            if shape.name and any(keyword in shape.name.lower()
+                                  for keyword in ['logo', 'watermark', 'decoration', 'icon']):
+                logger.debug(f"Skipping decorative shape: {shape.name}")
+                return False
+
+            # Skip very small shapes (likely decorative)
+            if shape.width < 100000 or shape.height < 100000:  # Less than ~0.14 inches
+                logger.debug(f"Skipping tiny shape: {shape.width}x{shape.height}")
+                return False
+
+            return True
+
+        except Exception as e:
+            logger.debug(f"Shape safety check failed: {e}")
+            return False
+
+
+    def is_slide_master_or_layout(self, slide) -> bool:
+        """
+        Detect if this is a master slide or layout (should not be translated).
+        """
+        try:
+            # Master slides don't have a slide_id in the normal sense
+            if not hasattr(slide, 'slide_id'):
+                return True
+            # Additional checks could go here
+            return False
+        except:
+            return True
+
+
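A side note (not part of the commit): shape.width and shape.height are EMU values, and python-pptx's unit helpers make the 100000-EMU cutoff above concrete. At 914400 EMU per inch it is roughly 0.11 in, slightly smaller than the "~0.14 inches" the inline comment suggests.

    from pptx.util import Emu, Inches

    threshold = Emu(100000)
    print(threshold.inches)    # ~0.109 inch - the "tiny shape" cutoff used above
    print(int(Inches(0.11)))   # 100584 EMU - an equivalent way to express it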
[new lines 1760-1949]
+    # ============================================================================
+    # PPTX RECONSTRUCTION METHODS
+    # ============================================================================
+
+    def apply_text_frame_formatting(
+        self,
+        shape,
+        trans_frame: TranslatableTextFrame,
+        translated_paragraphs: List[Tuple[str, List[Tuple[int, int]]]]
+    ):
+        """
+        Reconstruct text frame with aligned formatting.
+        PPT equivalent of apply_aligned_formatting().
+
+        Args:
+            shape: PowerPoint shape object
+            trans_frame: Original extracted text frame
+            translated_paragraphs: List of (translated_text, alignment) tuples
+        """
+        from pptx.util import Pt
+        from pptx.enum.text import PP_ALIGN, MSO_VERTICAL_ANCHOR
+
+        tf = shape.text_frame
+
+        # Restore text frame properties
+        tf_meta = trans_frame.shape_metadata.get('text_frame', {})
+        if tf_meta:
+            try:
+                tf.margin_left = tf_meta.get('margin_left', tf.margin_left)
+                tf.margin_right = tf_meta.get('margin_right', tf.margin_right)
+                tf.margin_top = tf_meta.get('margin_top', tf.margin_top)
+                tf.margin_bottom = tf_meta.get('margin_bottom', tf.margin_bottom)
+                tf.vertical_anchor = tf_meta.get('vertical_anchor', tf.vertical_anchor)
+                tf.word_wrap = tf_meta.get('word_wrap', tf.word_wrap)
+                tf.auto_size = tf_meta.get('auto_size', tf.auto_size)
+            except Exception as e:
+                logger.debug(f"Text frame property restoration failed: {e}")
+
+        # Clear existing paragraphs
+        for _ in range(len(tf.paragraphs)):
+            tf._element.remove(tf.paragraphs[0]._element)
+
+        # Reconstruct paragraphs
+        for i, (trans_para, (translated_text, alignment)) in enumerate(
+            zip(trans_frame.paragraphs, translated_paragraphs)
+        ):
+            para = tf.add_paragraph()
+
+            # Restore paragraph properties
+            para.alignment = trans_para.metadata.get('alignment')
+            para.level = trans_para.metadata.get('level', 0)
+
+            if trans_para.metadata.get('line_spacing'):
+                para.line_spacing = trans_para.metadata['line_spacing']
+            if trans_para.metadata.get('space_before'):
+                para.space_before = trans_para.metadata['space_before']
+            if trans_para.metadata.get('space_after'):
+                para.space_after = trans_para.metadata['space_after']
+
+            # Apply aligned formatting to runs
+            self.apply_ppt_paragraph_formatting(
+                para, trans_para, translated_text, alignment
+            )
+
+
+    def apply_ppt_paragraph_formatting(
+        self,
+        para,
+        trans_para: TranslatableParagraph,
+        translated_text: str,
+        alignment: List[Tuple[int, int]]
+    ):
+        """
+        Apply aligned formatting to PowerPoint paragraph runs.
+        Core formatting transfer logic - same as Word but for PPT runs.
+        """
+        from pptx.util import Pt
+        from pptx.dml.color import RGBColor
+
+        src_clean_words = trans_para.get_words()
+        tgt_raw_units = translated_text.split()
+        formatted_indices = trans_para.get_formatted_word_indices()
+
+        # Map clean indices to raw units
+        clean_to_raw_tgt = {}
+        clean_idx = 0
+        for raw_idx, unit in enumerate(tgt_raw_units):
+            if re.search(r'\w', unit):
+                clean_to_raw_tgt[clean_idx] = raw_idx
+                clean_idx += 1
+
+        # Get font template
+        font_template = trans_para.runs[0] if trans_para.runs else None
+
+        # Reconstruct runs
+        for i, unit in enumerate(tgt_raw_units):
+            run_text = unit + (" " if i < len(tgt_raw_units)-1 else "")
+            run = para.add_run()
+            run.text = run_text
+
+            # Determine style from alignment
+            style_type = None
+            matched_src = [s for s, t in alignment if clean_to_raw_tgt.get(t) == i]
+
+            if matched_src:
+                for s_idx in matched_src:
+                    if s_idx in formatted_indices['italic_bold']:
+                        style_type = 'italic_bold'
+                        break
+                    elif s_idx in formatted_indices['bold']:
+                        style_type = 'bold'
+                    elif s_idx in formatted_indices['italic'] and style_type != 'bold':
+                        style_type = 'italic'
+
+            # Apply inline styles
+            if style_type == 'italic_bold':
+                run.font.bold = run.font.italic = True
+            elif style_type == 'bold':
+                run.font.bold = True
+            elif style_type == 'italic':
+                run.font.italic = True
+
+            # Apply baseline aesthetics
+            if font_template:
+                self.copy_ppt_font_properties(run, font_template)
+
+
+    def copy_ppt_font_properties(self, target_run, source_run: FormatRun):
+        """
+        Force font properties in PowerPoint run.
+        PPT equivalent of copy_font_properties().
+        """
+        from pptx.util import Pt
+        from pptx.dml.color import RGBColor
+
+        try:
+            if source_run.font_name:
+                target_run.font.name = source_run.font_name
+
+            if source_run.font_size:
+                target_run.font.size = Pt(source_run.font_size)
+
+            if source_run.font_color:
+                target_run.font.color.rgb = RGBColor(*source_run.font_color)
+
+            if source_run.underline is not None:
+                target_run.font.underline = source_run.underline
+
+        except Exception as e:
+            logger.debug(f"PPT font property copy failed: {e}")
+
+
+    def restore_table_to_slide(
+        self,
+        table,
+        table_data: List[List[TranslatableTextFrame]],
+        translated_cells: List[List[Tuple[str, List[Tuple[int, int]]]]]
+    ):
+        """
+        Restore translated content to PowerPoint table.
+        """
+        for i, row in enumerate(table.rows):
+            for j, cell in enumerate(row.cells):
+                if i < len(table_data) and j < len(table_data[i]):
+                    trans_frame = table_data[i][j]
+                    trans_paragraphs = translated_cells[i][j]
+
+                    # Treat cell as a shape with text frame
+                    self.apply_text_frame_formatting(
+                        cell, trans_frame, trans_paragraphs
+                    )
+
+
+    def set_speaker_notes(self, slide, translated_notes: str):
+        """
+        Set translated speaker notes.
+        """
+        try:
+            if not slide.has_notes_slide:
+                notes_slide = slide.notes_slide  # Creates if doesn't exist
+            else:
+                notes_slide = slide.notes_slide
+
+            text_frame = notes_slide.notes_text_frame
+            text_frame.clear()
+            text_frame.text = translated_notes
+        except Exception as e:
+            logger.warning(f"Could not set speaker notes: {e}")
+
+
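A tiny worked example (not part of the commit) of the index bookkeeping inside apply_ppt_paragraph_formatting(): clean word indices on the target side are mapped back to raw whitespace-split units, so alignment pairs (src_idx, tgt_idx) can decide which reconstructed runs receive bold or italic. The word lists and alignment below are made up for illustration.

    import re

    src_words = ["Press", "Save", "now"]                  # "Save" is bold in the source
    tgt_raw_units = "Jetzt auf Speichern drücken".split()

    # Same mapping the method builds: clean target index -> raw unit index
    clean_to_raw_tgt = {}
    clean_idx = 0
    for raw_idx, unit in enumerate(tgt_raw_units):
        if re.search(r"\w", unit):
            clean_to_raw_tgt[clean_idx] = raw_idx
            clean_idx += 1

    alignment = [(0, 3), (1, 2), (2, 0)]                  # hypothetical aligner output
    bold_src = {1}                                         # index of "Save"

    for src_idx, tgt_idx in alignment:
        if src_idx in bold_src:
            raw_idx = clean_to_raw_tgt[tgt_idx]
            print(f"run {raw_idx} ({tgt_raw_units[raw_idx]!r}) gets bold")
    # -> run 2 ('Speichern') gets bold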
[new lines 1950-2066]
+    # ============================================================================
+    # PPTX SLIDE PROCESSING
+    # ============================================================================
+
+    async def translate_shape(self, shape):
+        """
+        Translate a single shape (text box, placeholder, etc.)
+        """
+        if not shape.has_text_frame:
+            return
+
+        try:
+            # Extract
+            trans_frame = self.extract_text_frame(shape)
+
+            if not trans_frame.paragraphs:
+                return
+
+            # Translate each paragraph
+            translated_paragraphs = []
+            for trans_para in trans_frame.paragraphs:
+                original_text = trans_para.get_text()
+                if not original_text.strip():
+                    translated_paragraphs.append(("", []))
+                    continue
+
+                # Translate
+                translated_text = await self.translate_text(original_text)
+
+                # Align
+                src_words = trans_para.get_words()
+                tgt_words = re.findall(r"\w+", translated_text)
+                alignment = []
+                if self.aligner and src_words and tgt_words:
+                    alignment = self.aligner.align(src_words, tgt_words)
+
+                translated_paragraphs.append((translated_text, alignment))
+
+            # Reconstruct
+            self.apply_text_frame_formatting(shape, trans_frame, translated_paragraphs)
+
+        except Exception as e:
+            logger.error(f"Shape translation failed: {e}", exc_info=True)
+
+
+    async def translate_slide(self, slide):
+        """
+        Translate all content in a slide.
+        """
+        # Process shapes
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                await self.translate_shape(shape)
+
+            # Handle tables
+            if shape.has_table:
+                await self.translate_table_in_slide(shape.table)
+
+            # Handle groups (recursive)
+            if shape.shape_type == 6:  # MSO_SHAPE_TYPE.GROUP
+                for sub_shape in shape.shapes:
+                    if sub_shape.has_text_frame:
+                        await self.translate_shape(sub_shape)
+
+        # Process speaker notes
+        notes_text = self.get_speaker_notes(slide)
+        if notes_text:
+            translated_notes = await self.translate_text(notes_text)
+            self.set_speaker_notes(slide, translated_notes)
+
+
+    async def translate_table_in_slide(self, table):
+        """
+        Translate table content in slide.
+        """
+        table_data = self.extract_table_from_slide(table)
+        translated_cells = []
+
+        for row_data in table_data:
+            translated_row = []
+            for cell_frame in row_data:
+                cell_paragraphs = []
+                for trans_para in cell_frame.paragraphs:
+                    text = trans_para.get_text()
+                    if text.strip():
+                        translated = await self.translate_text(text)
+                        src_words = trans_para.get_words()
+                        tgt_words = re.findall(r"\w+", translated)
+                        alignment = self.aligner.align(src_words, tgt_words) if self.aligner else []
+                        cell_paragraphs.append((translated, alignment))
+                    else:
+                        cell_paragraphs.append(("", []))
+                translated_row.append(cell_paragraphs)
+            translated_cells.append(translated_row)
+
+        self.restore_table_to_slide(table, table_data, translated_cells)
+
+
+    async def translate_presentation(self, input_path: Path, output_path: Path):
+        """
+        Main presentation translation lifecycle.
+        PPT equivalent of translate_document().
+        """
+        from pptx import Presentation
+
+        prs = Presentation(str(input_path))
+
+        logger.info(f"Processing {len(prs.slides)} slides")
+
+        for slide_num, slide in enumerate(tqdm(prs.slides, desc="Translating slides"), 1):
+            logger.info(f"Processing slide {slide_num}")
+            await self.translate_slide(slide)
+
+        logger.info(f"Saving presentation to {output_path}")
+        prs.save(str(output_path))
+        logger.info("✓ Presentation Translation Complete.")
+
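One aside (not part of the commit): the literal 6 checked in translate_slide() is the value of python-pptx's MSO_SHAPE_TYPE.GROUP, so the same test can be written against the enum:

    from pptx.enum.shapes import MSO_SHAPE_TYPE

    def is_group(shape) -> bool:
        # Equivalent to the `shape.shape_type == 6` check above
        return shape.shape_type == MSO_SHAPE_TYPE.GROUP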
[new lines 2067-2148]
+    # ============================================================================
+    # FILE TYPE DETECTION
+    # ============================================================================
+
+    def detect_file_type(self, file_path: Path) -> str:
+        """
+        Detect if file is Word (.docx) or PowerPoint (.pptx).
+        Returns: 'docx', 'pptx', or 'unknown'
+        """
+        suffix = file_path.suffix.lower()
+
+        if suffix == '.docx':
+            return 'docx'
+        elif suffix == '.pptx':
+            return 'pptx'
+        else:
+            # Try to detect by magic bytes
+            try:
+                with open(file_path, 'rb') as f:
+                    header = f.read(4)
+                # Both formats are ZIP files (PK header)
+                if header[:2] == b'PK':
+                    # Try to open as docx first
+                    try:
+                        from docx import Document
+                        Document(str(file_path))
+                        return 'docx'
+                    except:
+                        pass
+                    # Try as pptx
+                    try:
+                        from pptx import Presentation
+                        Presentation(str(file_path))
+                        return 'pptx'
+                    except:
+                        pass
+            except:
+                pass
+
+        return 'unknown'
+
+
+    def validate_file(self, file_path: Path) -> Tuple[bool, str, str]:
+        """
+        Validate input file and return (is_valid, file_type, error_message).
+
+        Returns:
+            Tuple of (success, file_type, error_msg)
+        """
+        if not file_path.exists():
+            return False, 'unknown', f"File not found: {file_path}"
+
+        file_type = self.detect_file_type(file_path)
+
+        if file_type == 'unknown':
+            return False, 'unknown', f"Unsupported file format. Only .docx and .pptx are supported."
+
+        # Verify we can actually open it
+        try:
+            if file_type == 'docx':
+                from docx import Document
+                doc = Document(str(file_path))
+                # Basic sanity check
+                if not hasattr(doc, 'paragraphs'):
+                    return False, file_type, "Invalid .docx file structure"
+
+            elif file_type == 'pptx':
+                from pptx import Presentation
+                prs = Presentation(str(file_path))
+                # Basic sanity check
+                if not hasattr(prs, 'slides'):
+                    return False, file_type, "Invalid .pptx file structure"
+
+            return True, file_type, ""
+
+        except Exception as e:
+            return False, file_type, f"Cannot open file: {str(e)}"
+
+
+

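A short usage sketch (not part of the commit) for the validation helpers above, assuming translator is an already-constructed UltimateDocumentTranslator:

    from pathlib import Path

    path = Path("slides.pptx")
    ok, file_type, err = translator.validate_file(path)
    if not ok:
        raise SystemExit(f"Cannot translate {path.name}: {err}")
    print(f"Detected {file_type} - safe to hand to translate_file()")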
[new lines 2149-2150]
 # ============================================================================
 # CLI

[new lines 2161-2195]
   opus      - Specialized bilingual models. Tiny (~200MB), extremely fast, literal.
   ct2 (wmt) - Dense Facebook models. Peak German/European quality (~6GB RAM).

+Supported Formats:
+  .docx - Microsoft Word documents (paragraphs, tables, footnotes, headers/footers)
+  .pptx - Microsoft PowerPoint presentations (slides, text boxes, tables, notes)
+
 Examples:
+  # Translate Word document (NLLB-600M)
   %(prog)s input.docx output.docx -s en -t de

+  # Translate PowerPoint presentation
+  %(prog)s presentation.pptx translated.pptx -s en -t es

+  # High-quality academic translation (Madlad-400)
+  %(prog)s thesis.docx thesis_de.docx -s en -t de --nmt madlad

+  # Translate slides with LLM (Claude)
+  %(prog)s slides.pptx slides_fr.pptx -s en -t fr --mode llm-align --llm anthropic

 Environment Variables:
   OPENAI_API_KEY, ANTHROPIC_API_KEY - Required for LLM backends.
 """
     )

+    # 1. POSITIONAL ARGUMENTS, HELP TEXT
+    parser.add_argument('input', help='Input file (.docx or .pptx)')
+    parser.add_argument('output', help='Output file (.docx or .pptx)')

     parser.add_argument('-s', '--source', default='en', help='Source language code (default: en)')
     parser.add_argument('-t', '--target', default='de', help='Target language code (default: de)')

     parser.add_argument(
         '--mode',
         choices=['nmt', 'llm-align', 'llm-plain', 'hybrid'],

[new lines 2197-2202]
         help='Translation strategy (default: hybrid)'
     )

     parser.add_argument(
         '--nmt',
         choices=['nllb', 'madlad', 'opus', 'ct2', 'auto'],

[new lines 2211-2222]
         help='NLLB variant only: 600M (fastest), 1.3B (balanced), 3.3B (heavy)'
     )

     parser.add_argument(
         '--llm',
         choices=['openai', 'anthropic', 'ollama'],
         help='LLM provider for hybrid/llm modes'
     )

     parser.add_argument(
         '--aligner',
         choices=['awesome', 'simalign', 'lindat', 'fast_align', 'heuristic', 'auto'],

[new lines 2228-2259]

     args = parser.parse_args()

+    # Set logging level
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)

+    # Enhanced validation
     input_path = Path(args.input)
+    output_path = Path(args.output)
+
     if not input_path.exists():
         logger.error(f"File not found: {input_path}")
         sys.exit(1)

+    # Validate file type compatibility
+    input_type = input_path.suffix.lower()
+    output_type = output_path.suffix.lower()
+
+    if input_type not in ['.docx', '.pptx']:
+        logger.error(f"Unsupported input format: {input_type}. Only .docx and .pptx are supported.")
+        sys.exit(1)
+
+    # Warn if output extension doesn't match input
+    if output_type != input_type:
+        logger.warning(f"Output extension ({output_type}) doesn't match input ({input_type}). Using {input_type}.")
+        output_path = output_path.with_suffix(input_type)
+
+    # Mode mapping unchanged
     mode_map = {
         'nmt': TranslationMode.NMT_ONLY,
         'llm-align': TranslationMode.LLM_WITH_ALIGN,

[new lines 2261-2275]
         'hybrid': TranslationMode.HYBRID
     }

+    # Updated status header
+    file_type_name = "Word Document" if input_type == '.docx' else "PowerPoint Presentation"
+
     print(f"\n{'='*60}")
     print(f"🌍 DOCUMENT TRANSLATOR - PRODUCTION v12")
     print(f"{'='*60}")
+    print(f"Format: {file_type_name}")
     print(f"Input: {input_path.name}")
+    print(f"Output: {output_path.name}")
     print(f"Direction: {args.source.upper()} → {args.target.upper()}")
     print(f"Mode: {args.mode.upper()}")
     print(f"NMT Engine: {args.nmt.upper()} {'('+args.nllb_size+')' if args.nmt=='nllb' else ''}")

[new lines 2278-2284]
     print(f"LLM: {args.llm.upper()}")
     print(f"{'='*60}\n")

+    # Initialize translator
     translator = UltimateDocumentTranslator(
         src_lang=args.source,
         tgt_lang=args.target,

[new lines 2289-2304]
         nllb_model_size=args.nllb_size
     )

+    # Use unified translate_file method
     try:
+        await translator.translate_file(input_path, output_path)

         print(f"\n{'='*60}")
+        print(f"✅ Success! {file_type_name} processed in {args.mode} mode.")
+        print(f"💾 File saved to: {output_path}")
         print(f"{'='*60}\n")
     except Exception as e:
+        logger.error(f"FAILED | Translation aborted: {e}", exc_info=args.verbose)
         sys.exit(1)