cstr committed on
Commit
94bccfa
·
verified ·
1 Parent(s): 20199a1

Update translator.py

Browse files
Files changed (1) hide show
  1. translator.py +687 -32
translator.py CHANGED
@@ -30,6 +30,7 @@ def check_library(name, import_stmt):
30
  return False
31
 
32
  HAS_DOCX = check_library("python-docx", "from docx import Document; from docx.shared import Pt, RGBColor; from docx.text.paragraph import Paragraph; from docx.oxml.shared import OxmlElement; from docx.oxml.ns import qn")
 
33
  HAS_TORCH = check_library("torch", "import torch")
34
  HAS_CT2 = check_library("CTranslate2", "import ctranslate2; from huggingface_hub import snapshot_download")
35
  HAS_TRANSFORMERS = check_library("Transformers", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM")
@@ -967,6 +968,33 @@ class MultiAligner:
967
  logger.debug("Using heuristic fallback (no quality alignments found)")
968
  return []
969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
 
971
  # ============================================================================
972
  # DOCUMENT TRANSLATOR
@@ -985,14 +1013,19 @@ class UltimateDocumentTranslator:
985
  aligner: Optional[str] = None,
986
  nllb_model_size: str = "600M"
987
  ):
 
988
  self.src_lang, self.tgt_lang, self.mode = src_lang, tgt_lang, mode
989
- self.ct2 = None # WMT translator
990
- self.nllb = None # NLLB translator
991
  self.llm = None
992
  self.aligner = None
993
  self.opus = None
994
  self.madlad = None
 
 
 
995
 
 
996
  logger.info(f"INIT | Starting Translator ({src_lang}→{tgt_lang})")
997
  logger.info(f"INIT | Mode: {mode.value} | NMT: {nmt_backend} | Aligner: {aligner or 'auto'}")
998
  self.log_memory("Initialization Start")
@@ -1061,6 +1094,36 @@ class UltimateDocumentTranslator:
1061
  except Exception as e:
1062
  logger.debug(f"Memory log failed: {e}")
1063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1064
  async def translate_text(self, text: str) -> str:
1065
  """Routes text through the active neural engine chain."""
1066
  if not text.strip(): return text
@@ -1466,7 +1529,10 @@ class UltimateDocumentTranslator:
1466
  logger.debug(f" > Indent-L: {pf.left_indent.pt if pf.left_indent else 0:.1f}pt | Spacing-A: {pf.space_after.pt if pf.space_after else 0:.1f}pt")
1467
 
1468
  async def translate_document(self, input_path: Path, output_path: Path):
1469
- """Full document lifecycle with robust XML commitment and verification logs."""
 
 
 
1470
  self.log_memory("Initialization")
1471
  doc = Document(str(input_path))
1472
 
@@ -1504,6 +1570,581 @@ class UltimateDocumentTranslator:
1504
  self.log_document_info(Document(str(output_path)), "OUTPUT")
1505
  logger.info("✓ Document Translation Complete.")
1506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1507
 
1508
  # ============================================================================
1509
  # CLI
@@ -1520,36 +2161,35 @@ Backends Comparison:
1520
  opus - Specialized bilingual models. Tiny (~200MB), extremely fast, literal.
1521
  ct2 (wmt) - Dense Facebook models. Peak German/European quality (~6GB RAM).
1522
 
 
 
 
 
1523
  Examples:
1524
- # Standard use (NLLB-600M)
1525
  %(prog)s input.docx output.docx -s en -t de
1526
 
1527
- # High-quality academic translation (Madlad-400)
1528
- %(prog)s input.docx output.docx -s en -t de --nmt madlad
1529
-
1530
- # Maximum speed for EN-DE specialized pair (Opus-MT)
1531
- %(prog)s input.docx output.docx -s en -t de --nmt opus
1532
 
1533
- # Use LLM (Claude) with local neural alignment
1534
- %(prog)s input.docx output.docx -s en -t es --mode llm-align --llm anthropic
1535
 
1536
- # Larger NLLB model for rare languages
1537
- %(prog)s input.docx output.docx -s en -t ja --nmt nllb --nllb-size 1.3B
1538
 
1539
  Environment Variables:
1540
  OPENAI_API_KEY, ANTHROPIC_API_KEY - Required for LLM backends.
1541
  """
1542
  )
1543
 
1544
- # 1. POSITIONAL ARGUMENTS
1545
- parser.add_argument('input', help='Input .docx file path')
1546
- parser.add_argument('output', help='Output .docx file path')
1547
 
1548
- # 2. LANGUAGE ARGUMENTS
1549
  parser.add_argument('-s', '--source', default='en', help='Source language code (default: en)')
1550
  parser.add_argument('-t', '--target', default='de', help='Target language code (default: de)')
1551
 
1552
- # 3. TRANSLATION MODE
1553
  parser.add_argument(
1554
  '--mode',
1555
  choices=['nmt', 'llm-align', 'llm-plain', 'hybrid'],
@@ -1557,7 +2197,6 @@ Environment Variables:
1557
  help='Translation strategy (default: hybrid)'
1558
  )
1559
 
1560
- # 4. NMT ENGINE SELECTION
1561
  parser.add_argument(
1562
  '--nmt',
1563
  choices=['nllb', 'madlad', 'opus', 'ct2', 'auto'],
@@ -1572,14 +2211,12 @@ Environment Variables:
1572
  help='NLLB variant only: 600M (fastest), 1.3B (balanced), 3.3B (heavy)'
1573
  )
1574
 
1575
- # 5. LLM PROVIDER
1576
  parser.add_argument(
1577
  '--llm',
1578
  choices=['openai', 'anthropic', 'ollama'],
1579
  help='LLM provider for hybrid/llm modes'
1580
  )
1581
 
1582
- # 6. ALIGNER SELECTION
1583
  parser.add_argument(
1584
  '--aligner',
1585
  choices=['awesome', 'simalign', 'lindat', 'fast_align', 'heuristic', 'auto'],
@@ -1591,17 +2228,32 @@ Environment Variables:
1591
 
1592
  args = parser.parse_args()
1593
 
1594
- # Set global logging level based on verbose flag
1595
  if args.verbose:
1596
  logging.getLogger().setLevel(logging.DEBUG)
1597
 
1598
- # Path validation
1599
  input_path = Path(args.input)
 
 
1600
  if not input_path.exists():
1601
  logger.error(f"File not found: {input_path}")
1602
  sys.exit(1)
1603
 
1604
- # Mapping CLI strings to Enums
 
 
 
 
 
 
 
 
 
 
 
 
 
1605
  mode_map = {
1606
  'nmt': TranslationMode.NMT_ONLY,
1607
  'llm-align': TranslationMode.LLM_WITH_ALIGN,
@@ -1609,12 +2261,15 @@ Environment Variables:
1609
  'hybrid': TranslationMode.HYBRID
1610
  }
1611
 
1612
- # --- LOGO & STATUS HEADER ---
 
 
1613
  print(f"\n{'='*60}")
1614
  print(f"🌍 DOCUMENT TRANSLATOR - PRODUCTION v12")
1615
  print(f"{'='*60}")
 
1616
  print(f"Input: {input_path.name}")
1617
- print(f"Output: {args.output}")
1618
  print(f"Direction: {args.source.upper()} → {args.target.upper()}")
1619
  print(f"Mode: {args.mode.upper()}")
1620
  print(f"NMT Engine: {args.nmt.upper()} {'('+args.nllb_size+')' if args.nmt=='nllb' else ''}")
@@ -1623,7 +2278,7 @@ Environment Variables:
1623
  print(f"LLM: {args.llm.upper()}")
1624
  print(f"{'='*60}\n")
1625
 
1626
- # Initialize the engine
1627
  translator = UltimateDocumentTranslator(
1628
  src_lang=args.source,
1629
  tgt_lang=args.target,
@@ -1634,16 +2289,16 @@ Environment Variables:
1634
  nllb_model_size=args.nllb_size
1635
  )
1636
 
1637
- # Execute lifecycle
1638
  try:
1639
- await translator.translate_document(input_path, Path(args.output))
1640
 
1641
  print(f"\n{'='*60}")
1642
- print(f"✅ Success! Document processed in {args.mode} mode.")
1643
- print(f"💾 File saved to: {args.output}")
1644
  print(f"{'='*60}\n")
1645
  except Exception as e:
1646
- logger.error(f"FAILED | Document translation aborted: {e}", exc_info=args.verbose)
1647
  sys.exit(1)
1648
 
1649
 
 
30
  return False
31
 
32
  HAS_DOCX = check_library("python-docx", "from docx import Document; from docx.shared import Pt, RGBColor; from docx.text.paragraph import Paragraph; from docx.oxml.shared import OxmlElement; from docx.oxml.ns import qn")
33
+ HAS_PPTX = check_library("python-pptx", "from pptx import Presentation; from pptx.util import Pt, Inches; from pptx.enum.text import PP_ALIGN, MSO_VERTICAL_ANCHOR; from pptx.dml.color import RGBColor")
34
  HAS_TORCH = check_library("torch", "import torch")
35
  HAS_CT2 = check_library("CTranslate2", "import ctranslate2; from huggingface_hub import snapshot_download")
36
  HAS_TRANSFORMERS = check_library("Transformers", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM")
 
968
  logger.debug("Using heuristic fallback (no quality alignments found)")
969
  return []
970
 
971
+ # ============================================================================
972
+ # PPTX HANDLING
973
+ # ============================================================================
974
+
975
+ # ============================================================================
976
+ # POWERPOINT-SPECIFIC STRUCTURES
977
+ # ============================================================================
978
+
979
@dataclass
class TranslatableTextFrame:
    """Extracted text frame: its paragraphs plus metadata about the owning shape."""
    # Extracted paragraphs, in the order they appear in the frame.
    paragraphs: List[TranslatableParagraph] = field(default_factory=list)
    # Shape / text-frame properties recorded at extraction time.
    shape_metadata: Dict[str, Any] = field(default_factory=dict)

    def get_all_text(self) -> str:
        """Return the text of every paragraph, joined with newlines."""
        texts = [paragraph.get_text() for paragraph in self.paragraphs]
        return '\n'.join(texts)
987
+
988
+
989
@dataclass
class SlideMetadata:
    """Holder for slide-level properties; every field defaults to None."""
    # Slide layout reference.
    layout: Any = None
    # Slide background reference.
    background: Any = None
    # Speaker-notes text, when there is any.
    notes: Optional[str] = None
995
+
996
+
997
+
998
 
999
  # ============================================================================
1000
  # DOCUMENT TRANSLATOR
 
1013
  aligner: Optional[str] = None,
1014
  nllb_model_size: str = "600M"
1015
  ):
1016
+ # Main initialization code
1017
  self.src_lang, self.tgt_lang, self.mode = src_lang, tgt_lang, mode
1018
+ self.ct2 = None
1019
+ self.nllb = None
1020
  self.llm = None
1021
  self.aligner = None
1022
  self.opus = None
1023
  self.madlad = None
1024
+
1025
+ # File type tracking
1026
+ self.current_file_type = None
1027
 
1028
+ # Rest of initialization unchanged...
1029
  logger.info(f"INIT | Starting Translator ({src_lang}→{tgt_lang})")
1030
  logger.info(f"INIT | Mode: {mode.value} | NMT: {nmt_backend} | Aligner: {aligner or 'auto'}")
1031
  self.log_memory("Initialization Start")
 
1094
  except Exception as e:
1095
  logger.debug(f"Memory log failed: {e}")
1096
 
1097
+ # with format specific error messages
1098
async def translate_file(self, input_path: Path, output_path: Path):
    """
    Validate the input file and dispatch it to the matching translator.

    Args:
        input_path: File to translate (.docx or .pptx).
        output_path: Destination for the translated file.

    Raises:
        ValueError: If validation fails or the detected type is unsupported.
        ImportError: If a required document library is missing.
        Exception: Any error from the format-specific translator is logged
            and re-raised unchanged.
    """
    # Reset first: if validation fails, the error must not be reported using
    # the file type left over from a previous call.
    self.current_file_type = None
    format_names = {
        'docx': "Word document",
        'pptx': "PowerPoint presentation",
    }

    try:
        is_valid, file_type, error = self.validate_file(input_path)
        if not is_valid:
            raise ValueError(error)

        self.current_file_type = file_type
        logger.info(f"Processing {file_type.upper()} file: {input_path.name}")

        if file_type == 'docx':
            await self.translate_document(input_path, output_path)
        elif file_type == 'pptx':
            await self.translate_presentation(input_path, output_path)
        else:
            raise ValueError(f"Unsupported file type: {file_type}")

    except ImportError as e:
        if 'docx' in str(e) and self.current_file_type == 'docx':
            logger.error("python-docx not installed. Install with: pip install python-docx")
        elif 'pptx' in str(e) and self.current_file_type == 'pptx':
            logger.error("python-pptx not installed. Install with: pip install python-pptx")
        else:
            # Any other missing module: still leave a trace before re-raising.
            logger.error(f"Missing dependency: {e}")
        raise

    except Exception as e:
        # Use a neutral noun when the type was never established, rather than
        # mislabelling the input as a PowerPoint presentation.
        format_name = format_names.get(self.current_file_type, "file")
        logger.error(f"Failed to translate {format_name}: {e}")
        raise
1126
+
1127
  async def translate_text(self, text: str) -> str:
1128
  """Routes text through the active neural engine chain."""
1129
  if not text.strip(): return text
 
1529
  logger.debug(f" > Indent-L: {pf.left_indent.pt if pf.left_indent else 0:.1f}pt | Spacing-A: {pf.space_after.pt if pf.space_after else 0:.1f}pt")
1530
 
1531
  async def translate_document(self, input_path: Path, output_path: Path):
1532
+ """
1533
+ Full Word document translation lifecycle with robust XML commitment and verification logs.
1534
+ Called internally by translate_file().
1535
+ """
1536
  self.log_memory("Initialization")
1537
  doc = Document(str(input_path))
1538
 
 
1570
  self.log_document_info(Document(str(output_path)), "OUTPUT")
1571
  logger.info("✓ Document Translation Complete.")
1572
 
1573
+ # ============================================================================
1574
+ # PPTX EXTRACTION METHODS
1575
+ # ============================================================================
1576
+
1577
def extract_text_frame(self, shape) -> TranslatableTextFrame:
    """
    Extract a shape's (or table cell's) text frame with its paragraphs.

    Records geometry and identity of the owner under ``shape_metadata``, the
    text-frame margins/anchor/wrap/auto-size under
    ``shape_metadata['text_frame']``, and one extracted paragraph per
    paragraph in the frame.

    Args:
        shape: Object exposing ``text_frame``; also called with table cells
            by ``extract_table_from_slide``.

    Returns:
        The populated TranslatableTextFrame. It has no paragraphs when the
        object exposes no ``text_frame``.
    """
    def read(attr):
        # Table cells are passed in here as well and do not carry shape
        # geometry or a name, so a missing attribute is recorded as None
        # instead of aborting the extraction.
        try:
            return getattr(shape, attr, None)
        except NotImplementedError:
            # NOTE(review): some python-pptx shapes appear to raise this for
            # shape_type — confirm against the installed version.
            return None

    trans_frame = TranslatableTextFrame()

    # Shape-level metadata (positioning, size, rotation, identity).
    trans_frame.shape_metadata = {
        key: read(key)
        for key in ('left', 'top', 'width', 'height', 'rotation', 'shape_type', 'name')
    }

    tf = getattr(shape, 'text_frame', None)
    if tf is None:
        return trans_frame

    # Text-frame properties, kept so reconstruction can re-apply them.
    trans_frame.shape_metadata['text_frame'] = {
        'margin_left': tf.margin_left,
        'margin_right': tf.margin_right,
        'margin_top': tf.margin_top,
        'margin_bottom': tf.margin_bottom,
        'vertical_anchor': tf.vertical_anchor,
        'word_wrap': tf.word_wrap,
        'auto_size': tf.auto_size,
    }

    # One extracted paragraph (with its runs) per source paragraph.
    for para in tf.paragraphs:
        trans_frame.paragraphs.append(self.extract_ppt_paragraph(para))

    return trans_frame
1616
+
1617
+
1618
def extract_ppt_paragraph(self, para) -> TranslatableParagraph:
    """
    Extract a PowerPoint paragraph with run-level formatting.

    Each run becomes a FormatRun carrying text, bold/italic/underline, font
    name, font size in points and an optional (r, g, b) color. Paragraph
    alignment, level, line spacing and spacing before/after are stored in
    the returned paragraph's ``metadata``.

    Args:
        para: python-pptx paragraph object.

    Returns:
        TranslatableParagraph built from the paragraph's runs.
    """
    # Base font: the first explicitly named run font, else "Calibri".
    # NOTE(review): theme/master font resolution was only a placeholder in
    # the original code; runs without an explicit font get this fallback.
    resolved_font = next(
        (r.font.name for r in para.runs if r.font.name),
        "Calibri",
    )

    runs = []
    for run in para.runs:
        font = run.font

        f_color = None
        try:
            if font.color and font.color.rgb:
                rgb = font.color.rgb
                f_color = (rgb[0], rgb[1], rgb[2])
        except Exception:
            # Best effort: colors with no readable RGB value are left unset.
            # NOTE(review): reading Font.color may itself switch the run to
            # a solid fill in python-pptx — confirm before relying on the
            # source run being untouched.
            pass

        runs.append(FormatRun(
            text=run.text,
            bold=font.bold,
            italic=font.italic,
            underline=font.underline,
            font_name=font.name if font.name else resolved_font,
            # NOTE(review): 18.0 is a hard-coded fallback for runs without
            # an explicit size; it is written back on reconstruction.
            font_size=font.size.pt if font.size else 18.0,
            font_color=f_color
        ))

    trans_para = TranslatableParagraph(runs=runs)

    # Paragraph-level properties needed to rebuild the paragraph later.
    trans_para.metadata['alignment'] = para.alignment
    trans_para.metadata['level'] = para.level  # Indentation level
    trans_para.metadata['line_spacing'] = para.line_spacing
    trans_para.metadata['space_before'] = para.space_before
    trans_para.metadata['space_after'] = para.space_after

    return trans_para
1672
+
1673
+
1674
def extract_table_from_slide(self, table) -> List[List[TranslatableTextFrame]]:
    """
    Extract every cell of a slide table as a text frame.

    Returns:
        A list of rows, each a list holding the result of
        ``extract_text_frame`` for every cell in that row.
    """
    return [
        [self.extract_text_frame(cell) for cell in row.cells]
        for row in table.rows
    ]
1689
+
1690
+
1691
def get_speaker_notes(self, slide) -> Optional[str]:
    """
    Return the slide's speaker-notes text.

    Returns:
        The notes text, or None when the slide has no notes slide, the notes
        slide has no text frame, the text is blank, or reading fails.
    """
    try:
        if not slide.has_notes_slide:
            return None

        text_frame = slide.notes_slide.notes_text_frame
        if text_frame is None:
            return None

        text = text_frame.text
        return text if text.strip() else None
    except Exception:
        # Best effort: unreadable notes are treated as absent.
        return None
1703
+
1704
+ # ============================================================================
1705
+ # PPTX SAFETY CHECKS
1706
+ # ============================================================================
1707
+
1708
def is_shape_safe_to_translate(self, shape) -> bool:
    """
    Decide whether a PowerPoint shape should be translated.

    A shape is rejected when it has no text frame, its text is empty or at
    most one character after stripping, its name contains a decorative
    keyword, or a known dimension is below the minimum size.

    Returns:
        True when the shape passes every check, False otherwise. Also False
        if a check raises.
    """
    decorative_keywords = ('logo', 'watermark', 'decoration', 'icon')
    min_size_emu = 100000  # ~0.11 inch (914400 EMU per inch)

    # Skip shapes without text frames
    if not hasattr(shape, 'text_frame'):
        return False

    try:
        text = shape.text_frame.text

        # Skip empty text and single-character content
        if not text or len(text.strip()) <= 1:
            return False

        # Skip shapes that are likely logos or decorative
        name = shape.name
        if name and any(keyword in name.lower() for keyword in decorative_keywords):
            logger.debug(f"Skipping decorative shape: {shape.name}")
            return False

        # Skip very small shapes (likely decorative). A dimension of None
        # means the size is not known here, which says nothing about the
        # shape being tiny, so it is not compared.
        width, height = shape.width, shape.height
        if any(dim is not None and dim < min_size_emu for dim in (width, height)):
            logger.debug(f"Skipping tiny shape: {shape.width}x{shape.height}")
            return False

        return True

    except Exception as e:
        logger.debug(f"Shape safety check failed: {e}")
        return False
1744
+
1745
+
1746
def is_slide_master_or_layout(self, slide) -> bool:
    """
    Report whether the object looks like a master slide or layout.

    Any object without a ``slide_id`` attribute is treated as a master or
    layout (which should not be translated).

    Returns:
        True when ``slide_id`` is missing or checking for it raises,
        False otherwise.
    """
    try:
        return not hasattr(slide, 'slide_id')
    except Exception:
        # hasattr only absorbs AttributeError; anything else raised by the
        # attribute lookup is treated as "do not translate".
        return True
1758
+
1759
+
1760
+ # ============================================================================
1761
+ # PPTX RECONSTRUCTION METHODS
1762
+ # ============================================================================
1763
+
1764
def apply_text_frame_formatting(
    self,
    shape,
    trans_frame: TranslatableTextFrame,
    translated_paragraphs: List[Tuple[str, List[Tuple[int, int]]]]
):
    """
    Rebuild a shape's text frame from translated paragraphs.

    Re-applies the recorded text-frame properties, removes every existing
    paragraph, then adds one paragraph per (original, translated) pair with
    the original paragraph properties and word-aligned run formatting.

    Args:
        shape: Object exposing ``text_frame`` (a shape or a table cell).
        trans_frame: Original extracted text frame.
        translated_paragraphs: One (translated_text, alignment) tuple per
            paragraph of ``trans_frame``.
    """
    tf = shape.text_frame

    # Restore text frame properties
    tf_meta = trans_frame.shape_metadata.get('text_frame', {})
    if tf_meta:
        try:
            for key in (
                'margin_left', 'margin_right', 'margin_top', 'margin_bottom',
                'vertical_anchor', 'word_wrap', 'auto_size',
            ):
                setattr(tf, key, tf_meta.get(key, getattr(tf, key)))
        except Exception as e:
            logger.debug(f"Text frame property restoration failed: {e}")

    if len(trans_frame.paragraphs) != len(translated_paragraphs):
        # zip() below stops at the shorter list; record that so missing
        # paragraphs can be diagnosed.
        logger.debug(
            f"Paragraph count mismatch: {len(trans_frame.paragraphs)} source vs "
            f"{len(translated_paragraphs)} translated"
        )

    # Clear existing paragraphs
    for existing in list(tf.paragraphs):
        tf._element.remove(existing._element)

    # Reconstruct paragraphs
    rebuilt = 0
    for trans_para, (translated_text, alignment) in zip(
        trans_frame.paragraphs, translated_paragraphs
    ):
        para = tf.add_paragraph()
        rebuilt += 1

        # Restore paragraph properties
        meta = trans_para.metadata
        para.alignment = meta.get('alignment')
        para.level = meta.get('level', 0)
        for key in ('line_spacing', 'space_before', 'space_after'):
            value = meta.get(key)
            if value:
                setattr(para, key, value)

        # Apply aligned formatting to runs
        self.apply_ppt_paragraph_formatting(
            para, trans_para, translated_text, alignment
        )

    if not rebuilt:
        # A DrawingML text body must contain at least one paragraph; do not
        # leave it empty after the clear above.
        tf.add_paragraph()
1823
+
1824
+
1825
def apply_ppt_paragraph_formatting(
    self,
    para,
    trans_para: TranslatableParagraph,
    translated_text: str,
    alignment: List[Tuple[int, int]]
):
    """
    Write the translated text into ``para`` as one run per whitespace unit.

    Bold/italic is carried over through the word alignment: a target word
    takes the style of the source words aligned to it. The first source run
    is the template for font name, size, color and underline.

    Args:
        para: python-pptx paragraph that receives the runs.
        trans_para: Original extracted paragraph.
        translated_text: Translated paragraph text.
        alignment: (source_word_index, target_word_index) pairs. Target
            indices count only units that contain a word character.
    """
    tgt_raw_units = translated_text.split()
    formatted_indices = trans_para.get_formatted_word_indices()

    # Map clean target indices (word-bearing units) to raw unit indices.
    clean_to_raw_tgt = {}
    for raw_idx, unit in enumerate(tgt_raw_units):
        if re.search(r'\w', unit):
            clean_to_raw_tgt[len(clean_to_raw_tgt)] = raw_idx

    # Invert the alignment once (raw target index -> source indices, in
    # alignment order) instead of rescanning it for every target unit.
    raw_to_src = {}
    for s_idx, t_idx in alignment:
        raw_idx = clean_to_raw_tgt.get(t_idx)
        if raw_idx is not None:
            raw_to_src.setdefault(raw_idx, []).append(s_idx)

    # Get font template
    font_template = trans_para.runs[0] if trans_para.runs else None

    # Reconstruct runs
    last_index = len(tgt_raw_units) - 1
    for i, unit in enumerate(tgt_raw_units):
        run = para.add_run()
        run.text = unit + (" " if i < last_index else "")

        # Determine style from alignment: italic_bold wins outright, and
        # bold takes precedence over italic.
        style_type = None
        for s_idx in raw_to_src.get(i, ()):
            if s_idx in formatted_indices['italic_bold']:
                style_type = 'italic_bold'
                break
            elif s_idx in formatted_indices['bold']:
                style_type = 'bold'
            elif s_idx in formatted_indices['italic'] and style_type != 'bold':
                style_type = 'italic'

        # Apply inline styles
        if style_type == 'italic_bold':
            run.font.bold = run.font.italic = True
        elif style_type == 'bold':
            run.font.bold = True
        elif style_type == 'italic':
            run.font.italic = True

        # Apply baseline aesthetics
        if font_template:
            self.copy_ppt_font_properties(run, font_template)
1885
+
1886
+
1887
def copy_ppt_font_properties(self, target_run, source_run: FormatRun):
    """
    Copy font name, size, color and underline onto a PowerPoint run.

    Each property is applied on its own, so a failure on one (logged at
    debug level) does not stop the remaining ones from being copied.

    Args:
        target_run: python-pptx run to modify.
        source_run: Extracted run that supplies the values. Falsy
            name/size/color and a None underline are skipped.
    """
    from pptx.util import Pt
    from pptx.dml.color import RGBColor

    font = target_run.font

    def set_color():
        font.color.rgb = RGBColor(*source_run.font_color)

    steps = []
    if source_run.font_name:
        steps.append(('name', lambda: setattr(font, 'name', source_run.font_name)))
    if source_run.font_size:
        steps.append(('size', lambda: setattr(font, 'size', Pt(source_run.font_size))))
    if source_run.font_color:
        steps.append(('color', set_color))
    if source_run.underline is not None:
        steps.append(('underline', lambda: setattr(font, 'underline', source_run.underline)))

    for label, apply_step in steps:
        try:
            apply_step()
        except Exception as e:
            logger.debug(f"PPT font property copy failed ({label}): {e}")
1910
+
1911
+
1912
def restore_table_to_slide(
    self,
    table,
    table_data: List[List[TranslatableTextFrame]],
    translated_cells: List[List[Tuple[str, List[Tuple[int, int]]]]]
):
    """
    Write translated content back into a PowerPoint table.

    Rows and cells are paired positionally with ``table_data`` and
    ``translated_cells``. A cell with no counterpart in either structure is
    left untouched.

    Args:
        table: python-pptx table.
        table_data: Extracted text frames, one list per row.
        translated_cells: Per cell, the list of (translated_text, alignment)
            tuples for its paragraphs.
    """
    # zip() stops at the shortest input, which bounds-checks table_data and
    # translated_cells alike.
    for row, row_frames, row_translations in zip(table.rows, table_data, translated_cells):
        for cell, trans_frame, trans_paragraphs in zip(row.cells, row_frames, row_translations):
            # A cell exposes text_frame, so it is handled like a shape.
            self.apply_text_frame_formatting(cell, trans_frame, trans_paragraphs)
1931
+
1932
+
1933
def set_speaker_notes(self, slide, translated_notes: str):
    """
    Replace the slide's speaker notes with the translated text.

    Failures are logged as warnings and never raised.

    Args:
        slide: python-pptx slide.
        translated_notes: Text to store in the notes text frame.
    """
    try:
        # The original comment notes that accessing notes_slide creates the
        # notes slide when it does not exist, so no has_notes_slide branch
        # is needed.
        text_frame = slide.notes_slide.notes_text_frame
        if text_frame is None:
            logger.warning("Could not set speaker notes: notes slide has no text frame")
            return

        text_frame.clear()
        text_frame.text = translated_notes
    except Exception as e:
        logger.warning(f"Could not set speaker notes: {e}")
1948
+
1949
+
1950
+ # ============================================================================
1951
+ # PPTX SLIDE PROCESSING
1952
+ # ============================================================================
1953
+
1954
async def translate_shape(self, shape):
    """
    Translate one shape's text frame in place.

    Extracts the frame, translates each non-blank paragraph, word-aligns it
    when an aligner and words on both sides are available, and rebuilds the
    frame. Any error is logged and swallowed.
    """
    if not shape.has_text_frame:
        return

    try:
        trans_frame = self.extract_text_frame(shape)
        if not trans_frame.paragraphs:
            return

        results = []
        for paragraph in trans_frame.paragraphs:
            source_text = paragraph.get_text()

            if source_text.strip():
                target_text = await self.translate_text(source_text)

                source_words = paragraph.get_words()
                target_words = re.findall(r"\w+", target_text)
                if self.aligner and source_words and target_words:
                    pairs = self.aligner.align(source_words, target_words)
                else:
                    pairs = []

                results.append((target_text, pairs))
            else:
                # Blank paragraphs are kept as empty placeholders.
                results.append(("", []))

        self.apply_text_frame_formatting(shape, trans_frame, results)

    except Exception as e:
        logger.error(f"Shape translation failed: {e}", exc_info=True)
1993
+
1994
+
1995
async def translate_slide(self, slide):
    """
    Translate all content in a slide.

    Walks the shape tree, translating text frames and tables and descending
    into grouped shapes at any nesting depth, then translates the speaker
    notes if there are any.
    """
    async def translate_shapes(shapes):
        for shape in shapes:
            if shape.has_text_frame:
                await self.translate_shape(shape)

            # Handle tables
            if shape.has_table:
                await self.translate_table_in_slide(shape.table)

            # Handle groups recursively. A group is recognised by its
            # `shapes` collection rather than a shape_type code, so groups
            # nested inside groups are walked too.
            members = getattr(shape, 'shapes', None)
            if members is not None:
                await translate_shapes(members)

    # Process shapes
    await translate_shapes(slide.shapes)

    # Process speaker notes
    notes_text = self.get_speaker_notes(slide)
    if notes_text:
        translated_notes = await self.translate_text(notes_text)
        self.set_speaker_notes(slide, translated_notes)
2019
+
2020
+
2021
async def translate_table_in_slide(self, table):
    """
    Translate every cell of a slide table in place.

    Each non-blank paragraph is translated and, when an aligner and words on
    both sides are available, word-aligned. Blank paragraphs are kept as
    empty placeholders so paragraph positions stay in step with the source.
    """
    table_data = self.extract_table_from_slide(table)
    translated_cells = []

    for row_data in table_data:
        translated_row = []
        for cell_frame in row_data:
            cell_paragraphs = []
            for trans_para in cell_frame.paragraphs:
                text = trans_para.get_text()
                if not text.strip():
                    cell_paragraphs.append(("", []))
                    continue

                translated = await self.translate_text(text)
                src_words = trans_para.get_words()
                tgt_words = re.findall(r"\w+", translated)

                # Same guard as translate_shape: never hand the aligner an
                # empty word list (e.g. punctuation-only cells).
                alignment = []
                if self.aligner and src_words and tgt_words:
                    alignment = self.aligner.align(src_words, tgt_words)

                cell_paragraphs.append((translated, alignment))
            translated_row.append(cell_paragraphs)
        translated_cells.append(translated_row)

    self.restore_table_to_slide(table, table_data, translated_cells)
2046
+
2047
+
2048
async def translate_presentation(self, input_path: Path, output_path: Path):
    """
    Run the full presentation lifecycle: load, translate each slide, save.

    Args:
        input_path: Source .pptx file.
        output_path: Where the translated presentation is written.
    """
    from pptx import Presentation

    presentation = Presentation(str(input_path))

    logger.info(f"Processing {len(presentation.slides)} slides")

    progress = tqdm(presentation.slides, desc="Translating slides")
    for slide_num, slide in enumerate(progress, start=1):
        logger.info(f"Processing slide {slide_num}")
        await self.translate_slide(slide)

    logger.info(f"Saving presentation to {output_path}")
    presentation.save(str(output_path))
    logger.info("✓ Presentation Translation Complete.")
2066
+
2067
+ # ============================================================================
2068
+ # FILE TYPE DETECTION
2069
+ # ============================================================================
2070
+
2071
def detect_file_type(self, file_path: Path) -> str:
    """
    Detect whether a file is Word (.docx) or PowerPoint (.pptx).

    The extension decides when it is .docx or .pptx (case-insensitive).
    Otherwise the file is opened as a ZIP package and the content types it
    declares in ``[Content_Types].xml`` are inspected, so neither
    python-docx nor python-pptx has to parse the whole document.

    Returns:
        'docx', 'pptx', or 'unknown' (also for missing, unreadable or
        non-ZIP files).
    """
    import zipfile

    suffix = file_path.suffix.lower()
    if suffix == '.docx':
        return 'docx'
    if suffix == '.pptx':
        return 'pptx'

    # Unknown extension: read the package manifest of the OOXML container.
    try:
        with zipfile.ZipFile(file_path) as package:
            manifest = package.read('[Content_Types].xml')
    except (OSError, KeyError, RuntimeError, zipfile.BadZipFile, zipfile.LargeZipFile):
        # Missing file, directory, not a ZIP, or no manifest member.
        return 'unknown'

    content_types = manifest.decode('utf-8', errors='replace')

    # Main-part content types of the two supported package kinds.
    if 'wordprocessingml.document.main+xml' in content_types:
        return 'docx'
    if (
        'presentationml.presentation.main+xml' in content_types
        or 'ms-powerpoint.presentation.macroEnabled.main+xml' in content_types
    ):
        return 'pptx'

    return 'unknown'
2107
+
2108
+
2109
def validate_file(self, file_path: Path) -> Tuple[bool, str, str]:
    """
    Check that the input file exists, has a supported type and can be opened.

    Returns:
        (is_valid, file_type, error_message). The message is empty on
        success.
    """
    if not file_path.exists():
        return False, 'unknown', f"File not found: {file_path}"

    kind = self.detect_file_type(file_path)
    if kind == 'unknown':
        return False, 'unknown', "Unsupported file format. Only .docx and .pptx are supported."

    # Confirm the matching library can actually load the file.
    try:
        if kind == 'docx':
            from docx import Document
            loaded = Document(str(file_path))
            # Basic sanity check
            if not hasattr(loaded, 'paragraphs'):
                return False, kind, "Invalid .docx file structure"
        elif kind == 'pptx':
            from pptx import Presentation
            loaded = Presentation(str(file_path))
            # Basic sanity check
            if not hasattr(loaded, 'slides'):
                return False, kind, "Invalid .pptx file structure"
    except Exception as e:
        return False, kind, f"Cannot open file: {e!s}"

    return True, kind, ""
2144
+
2145
+
2146
+
2147
+
2148
 
2149
  # ============================================================================
2150
  # CLI
 
2161
  opus - Specialized bilingual models. Tiny (~200MB), extremely fast, literal.
2162
  ct2 (wmt) - Dense Facebook models. Peak German/European quality (~6GB RAM).
2163
 
2164
+ Supported Formats:
2165
+ .docx - Microsoft Word documents (paragraphs, tables, footnotes, headers/footers)
2166
+ .pptx - Microsoft PowerPoint presentations (slides, text boxes, tables, notes)
2167
+
2168
  Examples:
2169
+ # Translate Word document (NLLB-600M)
2170
  %(prog)s input.docx output.docx -s en -t de
2171
 
2172
+ # Translate PowerPoint presentation
2173
+ %(prog)s presentation.pptx translated.pptx -s en -t es
 
 
 
2174
 
2175
+ # High-quality academic translation (Madlad-400)
2176
+ %(prog)s thesis.docx thesis_de.docx -s en -t de --nmt madlad
2177
 
2178
+ # Translate slides with LLM (Claude)
2179
+ %(prog)s slides.pptx slides_fr.pptx -s en -t fr --mode llm-align --llm anthropic
2180
 
2181
  Environment Variables:
2182
  OPENAI_API_KEY, ANTHROPIC_API_KEY - Required for LLM backends.
2183
  """
2184
  )
2185
 
2186
+ # 1. POSITIONAL ARGUMENTS, HELP TEXT
2187
+ parser.add_argument('input', help='Input file (.docx or .pptx)')
2188
+ parser.add_argument('output', help='Output file (.docx or .pptx)')
2189
 
 
2190
  parser.add_argument('-s', '--source', default='en', help='Source language code (default: en)')
2191
  parser.add_argument('-t', '--target', default='de', help='Target language code (default: de)')
2192
 
 
2193
  parser.add_argument(
2194
  '--mode',
2195
  choices=['nmt', 'llm-align', 'llm-plain', 'hybrid'],
 
2197
  help='Translation strategy (default: hybrid)'
2198
  )
2199
 
 
2200
  parser.add_argument(
2201
  '--nmt',
2202
  choices=['nllb', 'madlad', 'opus', 'ct2', 'auto'],
 
2211
  help='NLLB variant only: 600M (fastest), 1.3B (balanced), 3.3B (heavy)'
2212
  )
2213
 
 
2214
  parser.add_argument(
2215
  '--llm',
2216
  choices=['openai', 'anthropic', 'ollama'],
2217
  help='LLM provider for hybrid/llm modes'
2218
  )
2219
 
 
2220
  parser.add_argument(
2221
  '--aligner',
2222
  choices=['awesome', 'simalign', 'lindat', 'fast_align', 'heuristic', 'auto'],
 
2228
 
2229
  args = parser.parse_args()
2230
 
2231
+ # Set logging level
2232
  if args.verbose:
2233
  logging.getLogger().setLevel(logging.DEBUG)
2234
 
2235
+ # Enhanced validation
2236
  input_path = Path(args.input)
2237
+ output_path = Path(args.output)
2238
+
2239
  if not input_path.exists():
2240
  logger.error(f"File not found: {input_path}")
2241
  sys.exit(1)
2242
 
2243
+ # Validate file type compatibility
2244
+ input_type = input_path.suffix.lower()
2245
+ output_type = output_path.suffix.lower()
2246
+
2247
+ if input_type not in ['.docx', '.pptx']:
2248
+ logger.error(f"Unsupported input format: {input_type}. Only .docx and .pptx are supported.")
2249
+ sys.exit(1)
2250
+
2251
+ # Warn if output extension doesn't match input
2252
+ if output_type != input_type:
2253
+ logger.warning(f"Output extension ({output_type}) doesn't match input ({input_type}). Using {input_type}.")
2254
+ output_path = output_path.with_suffix(input_type)
2255
+
2256
+ # Mode mapping unchanged
2257
  mode_map = {
2258
  'nmt': TranslationMode.NMT_ONLY,
2259
  'llm-align': TranslationMode.LLM_WITH_ALIGN,
 
2261
  'hybrid': TranslationMode.HYBRID
2262
  }
2263
 
2264
+ # Updated status header
2265
+ file_type_name = "Word Document" if input_type == '.docx' else "PowerPoint Presentation"
2266
+
2267
  print(f"\n{'='*60}")
2268
  print(f"🌍 DOCUMENT TRANSLATOR - PRODUCTION v12")
2269
  print(f"{'='*60}")
2270
+ print(f"Format: {file_type_name}")
2271
  print(f"Input: {input_path.name}")
2272
+ print(f"Output: {output_path.name}")
2273
  print(f"Direction: {args.source.upper()} → {args.target.upper()}")
2274
  print(f"Mode: {args.mode.upper()}")
2275
  print(f"NMT Engine: {args.nmt.upper()} {'('+args.nllb_size+')' if args.nmt=='nllb' else ''}")
 
2278
  print(f"LLM: {args.llm.upper()}")
2279
  print(f"{'='*60}\n")
2280
 
2281
+ # Initialize translator
2282
  translator = UltimateDocumentTranslator(
2283
  src_lang=args.source,
2284
  tgt_lang=args.target,
 
2289
  nllb_model_size=args.nllb_size
2290
  )
2291
 
2292
+ # Use unified translate_file method
2293
  try:
2294
+ await translator.translate_file(input_path, output_path)
2295
 
2296
  print(f"\n{'='*60}")
2297
+ print(f"✅ Success! {file_type_name} processed in {args.mode} mode.")
2298
+ print(f"💾 File saved to: {output_path}")
2299
  print(f"{'='*60}\n")
2300
  except Exception as e:
2301
+ logger.error(f"FAILED | Translation aborted: {e}", exc_info=args.verbose)
2302
  sys.exit(1)
2303
 
2304