| |
|
| | """
|
| | DOCX to PDF Converter with Perfect Formatting Preservation
|
| | Optimized for Hugging Face Spaces with LibreOffice headless mode
|
| | Supports Arabic RTL text and preserves all original formatting
|
| | """
|
| |
|
| | import subprocess
|
| | import tempfile
|
| | import shutil
|
| | import os
|
| | from pathlib import Path
|
| | import zipfile
|
| | import re
|
| | import json
|
| | import xml.etree.ElementTree as ET
|
| | from xml.dom import minidom
|
| |
|
| | import threading
|
| | import time
|
| |
|
def internal_keepalive():
    """Background heartbeat: print a liveness message every 5 minutes.

    Runs forever; intended to be started on a daemon thread so hosting
    platforms (e.g. Hugging Face Spaces) see periodic stdout activity and do
    not mark the process as idle.
    """
    while True:
        print("[KeepAlive] ✅ Still alive and running...")
        time.sleep(300)  # 5 minutes between heartbeats
|
| |
|
| |
|
# Start the heartbeat on a daemon thread so it never blocks interpreter exit.
threading.Thread(target=internal_keepalive, daemon=True).start()
|
| |
|
| |
|
| |
|
def setup_libreoffice():
    """Ensure LibreOffice is properly configured for headless operation with optimal font setup"""
    try:
        # Stage fonts (local Arial + Arabic) before probing the binary.
        setup_font_environment()

        # Probe LibreOffice; a non-zero exit code means absent or broken.
        version_check = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if version_check.returncode != 0:
            raise Exception("LibreOffice not found or not working")

        print(f"LibreOffice version: {version_check.stdout.strip()}")
        return True
    except Exception as e:
        # Best-effort: report the problem and signal failure to the caller.
        print(f"LibreOffice setup error: {e}")
        return False
|
| |
|
| |
|
def setup_font_environment():
    """Setup optimal font environment using local Arial font and Arabic RTL support"""
    try:
        # Stage the bundled Arial and the downloadable Arabic font set first.
        setup_local_arial_font()
        install_arabic_fonts()

        # Refresh fontconfig's cache so the new files become visible.
        print("Updating font cache...")
        cache_proc = subprocess.run(["fc-cache", "-fv"], capture_output=True, timeout=30)
        if cache_proc.returncode != 0:
            print(f"Font cache update warning: {cache_proc.stderr.decode('utf-8', errors='ignore')}")
        else:
            print("Font cache updated successfully")

        # Enumerate installed fonts once; reuse the lowered text for lookups.
        listing = subprocess.run(["fc-list"], capture_output=True, text=True, timeout=10)
        available_fonts = listing.stdout
        lowered = available_fonts.lower()

        critical_fonts = ["Arial", "Liberation Sans", "Carlito", "Caladea", "DejaVu Sans", "Noto Sans",
                          "Noto Naskh Arabic", "Noto Kufi Arabic", "Amiri", "Scheherazade New"]
        missing_fonts = [name for name in critical_fonts if name.lower() not in lowered]

        if missing_fonts:
            print(f"Warning: Missing critical fonts: {missing_fonts}")
        else:
            print("All critical fonts including local Arial and Arabic fonts are available")

        # Report which of the preferred Arabic faces actually resolved.
        arabic_fonts = ["Noto Naskh Arabic", "Noto Kufi Arabic", "Amiri", "Scheherazade New", "Traditional Arabic"]
        available_arabic = [name for name in arabic_fonts if name.lower() in lowered]
        print(f"Available Arabic fonts: {available_arabic}")

        if "arial" in lowered:
            print("✅ Local Arial font is available and ready for use")
        else:
            print("⚠️ Local Arial font not detected - will use fallback fonts")

        print(f"Total fonts available: {len(available_fonts.splitlines())}")

    except Exception as e:
        # Non-fatal: conversion can proceed with whatever fonts exist.
        print(f"Font environment setup warning: {e}")
|
| |
|
| |
|
def _install_font_copy(source_path):
    """Copy *source_path* into /tmp/fonts/truetype/local-arial/arial.ttf (idempotent).

    Shared by the bundled-font and system-fallback paths of
    setup_local_arial_font(); previously this logic was duplicated verbatim.
    """
    fonts_dir = Path("/tmp/fonts/truetype/local-arial")
    fonts_dir.mkdir(parents=True, exist_ok=True)
    try:
        fonts_dir.chmod(0o777)
    except PermissionError:
        # Read-only or restricted mount — fontconfig can still read the files.
        pass

    target = fonts_dir / "arial.ttf"
    if not target.exists():
        print("📥 Installing local Arial font...")
        shutil.copy2(source_path, target)
        try:
            target.chmod(0o644)
        except PermissionError:
            pass
        print("✅ Local Arial font installed successfully")
    else:
        print("✅ Local Arial font already installed")


def setup_local_arial_font():
    """Setup local Arial font from same directory as this Python file.

    Looks for ``arial.ttf`` next to this script; if absent, falls back to the
    first available metric-compatible system sans font.

    Returns:
        True when a font was installed (or already present), False otherwise.
    """
    try:
        script_dir = Path(__file__).parent.absolute()
        arial_font_path = script_dir / "arial.ttf"

        if not arial_font_path.exists():
            print(f"⚠️ Arial font not found at {arial_font_path}")
            print(f"   Script directory: {script_dir}")
            print(f"   Looking for: arial.ttf")

            # Metric-compatible substitutes commonly present on Debian images.
            system_arial_paths = [
                "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
            ]
            for path in system_arial_paths:
                if os.path.exists(path):
                    print(f"✅ Using system font as Arial fallback: {path}")
                    _install_font_copy(path)
                    return True
            return False

        _install_font_copy(arial_font_path)
        return True

    except Exception as e:
        print(f"❌ Local Arial font setup failed: {e}")
        return False
|
| |
|
| |
|
def _install_zip_font(fonts_dir, name, url, subdir):
    """Download a zip release of font *name*, extract it, and copy its .ttf files.

    Args:
        fonts_dir: Path of the destination font directory.
        name: display name used in progress messages.
        url: zip archive URL.
        subdir: directory name the archive extracts into.
    """
    import urllib.request

    print(f"📥 Installing {name} font...")
    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            archive = os.path.join(tmp_dir, "font.zip")
            urllib.request.urlretrieve(url, archive)
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(tmp_dir)

            extracted = os.path.join(tmp_dir, subdir)
            if os.path.exists(extracted):
                for file in os.listdir(extracted):
                    if file.endswith('.ttf'):
                        dst = fonts_dir / file
                        shutil.copy2(os.path.join(extracted, file), dst)
                        try:
                            dst.chmod(0o644)
                        except PermissionError:
                            pass
                print(f"✅ {name} font installed successfully")
            else:
                print(f"❌ {name} font directory not found")
    except Exception as e:
        print(f"❌ {name} font installation failed: {e}")


def _install_ttf_font(fonts_dir, name, url, filename):
    """Download a single .ttf for font *name* into *fonts_dir*; raises on failure."""
    import urllib.request

    with tempfile.TemporaryDirectory() as tmp_dir:
        local = os.path.join(tmp_dir, filename)
        urllib.request.urlretrieve(url, local)
        dst = fonts_dir / filename
        shutil.copy2(local, dst)
        try:
            dst.chmod(0o644)
        except PermissionError:
            pass
    print(f"✅ {name} font installed successfully")


def install_arabic_fonts():
    """Install additional Arabic fonts for better RTL support.

    Downloads Amiri, Scheherazade New, Noto Sans Arabic and Cairo into
    /tmp/fonts/truetype/arabic-custom and refreshes the fontconfig cache.
    Every step is best-effort: failures are logged and skipped.
    """
    try:
        import urllib.request

        fonts_dir = Path("/tmp/fonts/truetype/arabic-custom")
        fonts_dir.mkdir(parents=True, exist_ok=True)
        try:
            fonts_dir.chmod(0o777)
        except PermissionError:
            pass

        print("🔤 Installing Arabic fonts for RTL support...")

        # Zip-packaged releases share one install path.
        _install_zip_font(fonts_dir, "Amiri",
                          "https://github.com/aliftype/amiri/releases/download/0.117/Amiri-0.117.zip",
                          "Amiri-0.117")
        _install_zip_font(fonts_dir, "Scheherazade New",
                          "https://github.com/silnrsi/font-scheherazade/releases/download/v3.300/ScheherazadeNew-3.300.zip",
                          "ScheherazadeNew-3.300")

        print("📥 Installing Noto Sans Arabic font...")
        try:
            _install_ttf_font(fonts_dir, "Noto Sans Arabic",
                              "https://github.com/notofonts/notofonts.github.io/raw/main/fonts/NotoSansArabic/hinted/ttf/NotoSansArabic-Regular.ttf",
                              "NotoSansArabic-Regular.ttf")
        except Exception as e:
            print(f"❌ Noto Sans Arabic font installation failed: {e}")

        print("📥 Installing Cairo font...")
        try:
            _install_ttf_font(fonts_dir, "Cairo",
                              "https://github.com/Gue3bara/Cairo/raw/master/fonts/Cairo-Regular.ttf",
                              "Cairo-Regular.ttf")
        except Exception as e:
            print(f"❌ Cairo font installation failed: {e}")
            print("⚠️ Continuing without Cairo font - using alternative Arabic fonts")
            # Fallback: fetch the Google Fonts woff2 build of Cairo.
            try:
                with tempfile.TemporaryDirectory() as tmp_dir:
                    cairo_url = "https://fonts.gstatic.com/s/cairo/v21/SLXgc14kyrzQ6fYy3Q60fTh5Tf44DXYvbqo6vPQ3ZyM.woff2"
                    cairo_file = os.path.join(tmp_dir, "Cairo-Regular.woff2")
                    urllib.request.urlretrieve(cairo_url, cairo_file)
                    try:
                        from fontTools.ttLib import TTFont
                        print("✅ Cairo font (alternative source) downloaded successfully")
                    except ImportError:
                        # woff2 cannot be converted without fontTools; report anyway.
                        print("ℹ️ Cairo font downloaded but font conversion tools not available")
                        print("✅ Cairo font installed successfully (alternative source)")
            except Exception as e2:
                print(f"❌ Cairo font installation failed (alternative source): {e2}")

        print("🔄 Updating font cache...")
        try:
            subprocess.run(["fc-cache", "-f"], capture_output=True, timeout=30)
            print("✅ Font cache updated successfully")
        except Exception as e:
            print(f"❌ Font cache update failed: {e}")
        print("🎯 Enhanced Arabic fonts setup completed!")

    except Exception as e:
        print(f"Arabic fonts installation warning: {e}")
|
| |
|
| |
|
def create_fontconfig(temp_path):
    """Create fontconfig configuration for optimal font matching with local Arial and Arabic RTL support.

    Writes a fonts.conf under ``temp_path/.config/fontconfig`` and returns the
    path of the ``.config`` directory (suitable for use as XDG_CONFIG_HOME so
    LibreOffice picks the configuration up).
    """
    # Config lives under the caller-provided (temporary) home directory.
    fontconfig_dir = temp_path / ".config" / "fontconfig"
    fontconfig_dir.mkdir(parents=True, exist_ok=True)

    fonts_conf = fontconfig_dir / "fonts.conf"

    # The directory holding this script is registered as a font dir so a
    # bundled arial.ttf next to the script becomes visible to fontconfig.
    script_dir = Path(__file__).parent.absolute()

    fontconfig_content = f'''<?xml version="1.0"?>
<!DOCTYPE fontconfig SYSTEM "fonts.dtd">
<fontconfig>
    <!-- Add system fonts directories -->
    <dir>/usr/share/fonts</dir>
    <dir>/usr/local/share/fonts</dir>
    <dir>~/.fonts</dir>

    <!-- Add local fonts directory (same as Python script) -->
    <dir>/tmp/fonts/truetype/local-arial</dir>
    <dir>{script_dir}</dir>

    <!-- Font substitution rules with local Arial as priority -->
    <alias>
        <family>Arial</family>
        <prefer>
            <family>Arial</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
            <family>Noto Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Calibri</family>
        <prefer>
            <family>Liberation Sans</family>
            <family>Arimo</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Cambria</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>Tinos</family>
            <family>DejaVu Serif</family>
        </prefer>
    </alias>

    <alias>
        <family>Times New Roman</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>DejaVu Serif</family>
            <family>Noto Serif</family>
        </prefer>
    </alias>

    <alias>
        <family>Courier New</family>
        <prefer>
            <family>Liberation Mono</family>
            <family>DejaVu Sans Mono</family>
            <family>Noto Sans Mono</family>
        </prefer>
    </alias>

    <!-- Enhanced Arabic font substitution rules for perfect RTL support -->
    <alias>
        <family>Traditional Arabic</family>
        <prefer>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
            <family>Scheherazade New</family>
            <family>Cairo</family>
            <family>Noto Sans Arabic</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Arabic Typesetting</family>
        <prefer>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
            <family>Scheherazade New</family>
            <family>Cairo</family>
            <family>Noto Sans Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>Simplified Arabic</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Noto Naskh Arabic</family>
            <family>Amiri</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <!-- Additional Arabic font mappings for maximum compatibility -->
    <alias>
        <family>Arial Unicode MS</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Microsoft Sans Serif</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Segoe UI</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Tahoma</family>
        <prefer>
            <family>DejaVu Sans</family>
            <family>Liberation Sans</family>
            <family>Noto Sans</family>
        </prefer>
    </alias>

    <!-- Generic Arabic font fallback -->
    <alias>
        <family>serif</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>DejaVu Serif</family>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>sans-serif</family>
        <prefer>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
            <family>Noto Sans</family>
            <family>Noto Naskh Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>monospace</family>
        <prefer>
            <family>Liberation Mono</family>
            <family>DejaVu Sans Mono</family>
            <family>Noto Sans Mono</family>
        </prefer>
    </alias>

    <!-- Ensure consistent font rendering with Arabic support -->
    <match target="font">
        <edit name="antialias" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hinting" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hintstyle" mode="assign">
            <const>hintslight</const>
        </edit>
        <edit name="rgba" mode="assign">
            <const>rgb</const>
        </edit>
        <edit name="lcdfilter" mode="assign">
            <const>lcddefault</const>
        </edit>
    </match>

    <!-- Enhanced Arabic script handling with strong binding -->
    <match target="pattern">
        <test name="lang" compare="contains">
            <string>ar</string>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
            <string>Noto Sans Arabic</string>
        </edit>
    </match>

    <!-- Force Arabic fonts for any Arabic-containing text -->
    <match target="pattern">
        <test name="family" compare="contains">
            <string>Arabic</string>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
        </edit>
    </match>

    <!-- Ensure proper spacing and kerning for Arabic -->
    <match target="font">
        <test name="family" compare="contains">
            <string>Arabic</string>
        </test>
        <edit name="spacing" mode="assign">
            <const>proportional</const>
        </edit>
        <edit name="antialias" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hinting" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hintstyle" mode="assign">
            <const>hintslight</const>
        </edit>
    </match>

    <!-- Specific handling for RTL text -->
    <match target="pattern">
        <test name="charset">
            <charset>
                <range>
                    <int>0x0600</int>
                    <int>0x06FF</int>
                </range>
            </charset>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
        </edit>
    </match>
</fontconfig>'''

    with open(fonts_conf, 'w', encoding='utf-8') as f:
        f.write(fontconfig_content)

    # Return the .config directory (the parent of fontconfig/).
    return str(fontconfig_dir.parent)
|
| |
|
| |
|
def analyze_template_font_sizes(docx_path):
    """Analyze template.docx to extract specific font size requirements.

    Walks every run in word/document.xml and maps each non-empty text to a
    target point size: 9pt for serial/time/date placeholders, 10pt for
    name/id/location/phone placeholders, 11pt for the party headers, and
    otherwise the run's own size capped at 10pt.

    Args:
        docx_path: path to the .docx archive.

    Returns:
        dict mapping stripped text content -> point size; empty dict on error.
    """
    try:
        font_size_mapping = {}

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')

                # Reuse the module-level xml.etree.ElementTree import; the old
                # redundant function-local import has been removed.
                root = ET.fromstring(doc_content)

                namespaces = {
                    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
                }

                for run in root.findall('.//w:r', namespaces):
                    rpr = run.find('w:rPr', namespaces)
                    if rpr is not None:
                        sz_elem = rpr.find('w:sz', namespaces)
                        if sz_elem is not None:
                            # w:sz values are in half-points.
                            font_size = int(sz_elem.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '20')) // 2
                        else:
                            font_size = 10
                    else:
                        font_size = 10

                    for text_elem in run.findall('.//w:t', namespaces):
                        text_content = text_elem.text
                        if text_content and text_content.strip():
                            text_content = text_content.strip()

                            if any(pattern in text_content for pattern in ['{{serial_number}}', '{{t_11}}', '{{t_}}', '{{date}}']):
                                font_size_mapping[text_content] = 9
                            elif any(pattern in text_content for pattern in ['{{name_1}}', '{{name_2}}', '{{id_1}}', '{{name_3}}', '{{id_2}}']):
                                font_size_mapping[text_content] = 10
                            elif any(pattern in text_content for pattern in ['{{location_1}}', '{{location_2}}', '{{phone_1}}', '{{location_3}}', '{{phone_2}}']):
                                font_size_mapping[text_content] = 10
                            elif any(pattern in text_content for pattern in ['الطرف البائع', 'الطرف المشتري']):
                                font_size_mapping[text_content] = 11
                            else:
                                # Cap free text at 10pt while honouring smaller sizes.
                                font_size_mapping[text_content] = min(font_size, 10)

        print(f"📏 Font size analysis completed: {len(font_size_mapping)} text patterns mapped")
        return font_size_mapping

    except Exception as e:
        print(f"❌ Font size analysis failed: {e}")
        return {}
|
| |
|
| |
|
def validate_docx_structure(docx_path):
    """Advanced DOCX structure analysis and preprocessing for perfect formatting preservation.

    Inspects word/document.xml for tables (including genuinely nested ones),
    images, textboxes, RTL/Arabic content, {{placeholder}} counts and the
    fonts referenced by runs.

    Args:
        docx_path: path to the .docx archive; a path containing
            'template.docx' additionally triggers font-size mapping analysis.

    Returns:
        dict of analysis results; on failure a dict of the same shape with
        'error' set (now including 'font_size_mapping' for key consistency
        with the success path).
    """
    try:
        validation_info = {
            'page_count': 1,
            'has_tables': False,
            'has_images': False,
            'text_content_length': 0,
            'font_families': set(),
            'has_textboxes': False,
            'has_smartart': False,
            'has_complex_shapes': False,
            'table_structure_issues': [],
            'rtl_content_detected': False,
            'placeholder_count': 0,
            'font_size_mapping': {},
            'error': None
        }

        if 'template.docx' in docx_path:
            validation_info['font_size_mapping'] = analyze_template_font_sizes(docx_path)

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')

                table_count = doc_content.count('<w:tbl>')
                validation_info['has_tables'] = table_count > 0

                if validation_info['has_tables']:
                    # FIX: the old check compared open vs close tag counts,
                    # which are equal even for nested tables, so it never fired.
                    # Track nesting depth across the tag stream instead.
                    depth = 0
                    nested = False
                    for tag in re.findall(r'</?w:tbl>', doc_content):
                        if tag == '<w:tbl>':
                            depth += 1
                            nested = nested or depth > 1
                        else:
                            depth = max(depth - 1, 0)
                    if nested:
                        validation_info['table_structure_issues'].append("Nested tables detected")

                    if '<w:gridSpan' in doc_content or '<w:vMerge' in doc_content:
                        validation_info['table_structure_issues'].append("Complex cell merging detected")

                validation_info['has_textboxes'] = '<w:textbox>' in doc_content or '<w:txbxContent>' in doc_content
                validation_info['has_smartart'] = '<w:smartTag>' in doc_content or 'smartart' in doc_content.lower()
                validation_info['has_complex_shapes'] = '<w:shape>' in doc_content or '<w:group>' in doc_content

                validation_info['has_images'] = ('<w:drawing>' in doc_content or
                                                 '<w:pict>' in doc_content or
                                                 '<w:object>' in doc_content)

                # Arabic (and presentation-form) Unicode blocks signal RTL text.
                arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
                validation_info['rtl_content_detected'] = bool(re.search(arabic_pattern, doc_content))

                placeholder_pattern = r'\{\{[^}]+\}\}'
                validation_info['placeholder_count'] = len(re.findall(placeholder_pattern, doc_content))

                # Strip markup to approximate the visible text length.
                text_content = re.sub(r'<[^>]+>', '', doc_content)
                validation_info['text_content_length'] = len(text_content.strip())

                font_matches = re.findall(r'w:ascii="([^"]+)"', doc_content)
                eastasia_fonts = re.findall(r'w:eastAsia="([^"]+)"', doc_content)
                cs_fonts = re.findall(r'w:cs="([^"]+)"', doc_content)
                validation_info['font_families'] = set(font_matches + eastasia_fonts + cs_fonts)

                print(f"🔍 Advanced DOCX Analysis:")
                print(f"   • Tables: {table_count} (Issues: {len(validation_info['table_structure_issues'])})")
                print(f"   • Images: {validation_info['has_images']}")
                print(f"   • TextBoxes: {validation_info['has_textboxes']}")
                print(f"   • SmartArt: {validation_info['has_smartart']}")
                print(f"   • Complex Shapes: {validation_info['has_complex_shapes']}")
                print(f"   • RTL Content: {validation_info['rtl_content_detected']}")
                print(f"   • Placeholders: {validation_info['placeholder_count']}")
                print(f"   • Text Length: {validation_info['text_content_length']}")
                print(f"   • Fonts: {list(validation_info['font_families'])[:5]}...")

        return validation_info

    except Exception as e:
        print(f"❌ DOCX validation error: {e}")
        # Same keys as the success path (font_size_mapping added for consistency).
        return {'page_count': 1, 'has_tables': False, 'has_images': False,
                'text_content_length': 0, 'font_families': set(), 'has_textboxes': False,
                'has_smartart': False, 'has_complex_shapes': False, 'table_structure_issues': [],
                'rtl_content_detected': False, 'placeholder_count': 0,
                'font_size_mapping': {}, 'error': str(e)}
|
| |
|
| |
|
def calculate_optimal_font_size(text_content, max_width_chars=20, base_font_size=10):
    """
    Calculate optimal font size based on text length to maintain position.
    This ensures that longer names don't break the layout.
    """
    # Empty / falsy text keeps the base size untouched.
    if not text_content:
        return base_font_size

    # Measure without the mustache braces surrounding a placeholder.
    visible = text_content.replace('{{', '').replace('}}', '').strip()
    visible_length = len(visible)

    # Anything that already fits needs no scaling.
    if visible_length <= max_width_chars:
        return base_font_size

    # Scale proportionally to the overflow, never dropping below 7pt.
    shrink = max_width_chars / visible_length
    return int(max(base_font_size * shrink, 7))
|
| |
|
| |
|
def extract_placeholder_contexts(doc_content):
    """
    Extract placeholders with their surrounding context to understand layout constraints.
    """
    contexts = {}

    # Capture every run (<w:r>...</w:r>) whose text contains a {{placeholder}}.
    run_pattern = r'(<w:r[^>]*>.*?<w:t[^>]*>.*?\{\{[^}]+\}\}.*?</w:t>.*?</w:r>)'
    for fragment in re.findall(run_pattern, doc_content, re.DOTALL):
        name_match = re.search(r'\{\{([^}]+)\}\}', fragment)
        if not name_match:
            continue
        name = name_match.group(1)

        # w:sz is expressed in half-points; default to 10pt when absent.
        size_match = re.search(r'<w:sz w:val="(\d+)"/>', fragment)
        current_font_size = int(size_match.group(1)) // 2 if size_match else 10

        # Table cells leave less horizontal room than free paragraphs.
        in_table = '<w:tc>' in fragment or 'w:tcPr' in fragment
        contexts[name] = {
            'current_font_size': current_font_size,
            'max_width_chars': 15 if in_table else 25,
            'is_in_table': in_table,
            'xml_context': fragment,
        }

    return contexts
|
| |
|
| |
|
def apply_template_font_settings(docx_path, validation_info):
    """Apply specific font sizes and Arial font to template.docx content with smart sizing.

    Args:
        docx_path: path to the source .docx.
        validation_info: dict from validate_docx_structure(); the step only
            runs when it carries a non-empty 'font_size_mapping'.

    Returns:
        Path to a new temporary .docx with adjusted fonts, or the original
        ``docx_path`` when nothing was applied or an error occurred.
    """
    try:
        if not validation_info.get('font_size_mapping'):
            print("ℹ️ No font size mapping found - skipping font optimization")
            return docx_path

        print("🔤 Applying template-specific font settings with smart sizing...")

        # FIX: tempfile.mktemp is deprecated/race-prone; mkstemp creates the
        # file atomically so no other process can claim the name.
        fd, temp_docx = tempfile.mkstemp(suffix='.docx')
        os.close(fd)

        with zipfile.ZipFile(docx_path, 'r') as src_zip:
            if 'word/document.xml' not in src_zip.namelist():
                # Nothing to edit — hand back an untouched copy.
                shutil.copy2(docx_path, temp_docx)
                return temp_docx

            doc_content = src_zip.read('word/document.xml').decode('utf-8')

            # Force Arial for Latin script attributes throughout the document.
            doc_content = re.sub(r'w:ascii="[^"]*"', 'w:ascii="Arial"', doc_content)
            doc_content = re.sub(r'w:hAnsi="[^"]*"', 'w:hAnsi="Arial"', doc_content)

            placeholder_contexts = extract_placeholder_contexts(doc_content)
            print(f"📍 Found {len(placeholder_contexts)} placeholders with context")

            # Smart-size the name placeholders using a worst-case long name.
            name_placeholders = ['name_1', 'name_2', 'name_3']
            for placeholder in name_placeholders:
                if placeholder in placeholder_contexts:
                    context = placeholder_contexts[placeholder]
                    optimal_size = calculate_optimal_font_size(
                        "محمد عبدالله أحمد الخالدي",
                        max_width_chars=context['max_width_chars'],
                        base_font_size=context['current_font_size']
                    )
                    optimal_size_half_points = int(optimal_size * 2)  # w:sz uses half-points

                    pattern = f'{{{{{placeholder}}}}}'
                    if pattern in doc_content:
                        doc_content = re.sub(
                            r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                            f'\\g<1>{optimal_size_half_points}\\g<2>',
                            doc_content,
                            flags=re.DOTALL
                        )
                        print(f"🎯 Applied smart sizing to {placeholder}: {optimal_size}pt")

            # Fixed 9pt (18 half-points) for serial number / time / date fields.
            for pattern in ['{{serial_number}}', '{{t_11}}', '{{t_}}', '{{date}}', 'الرقم التسلسلي', 'الساعة', 'التاريخ']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>18\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            # Fixed 10pt (20 half-points) for id / location / phone fields.
            for pattern in ['{{id_1}}', '{{id_2}}',
                            '{{location_1}}', '{{location_2}}', '{{phone_1}}', '{{location_3}}', '{{phone_2}}',
                            'رقم الهوية', 'يسكن', 'رقم الهاتف']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>20\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            # Fixed 11pt (22 half-points) for the party headers.
            for pattern in ['الطرف البائع', 'الطرف المشتري']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>22\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            print("🔤 Applying general font size optimization...")

            font_size_pattern = r'<w:sz w:val="(\d+)"/>'

            def reduce_font_size(match):
                """Shrink oversized runs: >12pt scaled ×0.8 (capped at 12pt), >10pt scaled ×0.9."""
                size_in_points = int(match.group(1)) // 2
                if size_in_points > 12:
                    new_size_points = min(size_in_points * 0.8, 12)
                    return f'<w:sz w:val="{int(new_size_points * 2)}"/>'
                elif size_in_points > 10:
                    new_size_points = size_in_points * 0.9
                    return f'<w:sz w:val="{int(new_size_points * 2)}"/>'
                else:
                    return match.group(0)

            doc_content = re.sub(font_size_pattern, reduce_font_size, doc_content)

            # FIX: the old code appended to the zip with mode 'a', creating a
            # duplicate 'word/document.xml' entry. Rewrite the archive so it
            # carries exactly one, updated copy of every member.
            with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as dst_zip:
                for item in src_zip.infolist():
                    if item.filename == 'word/document.xml':
                        dst_zip.writestr(item, doc_content.encode('utf-8'))
                    else:
                        dst_zip.writestr(item, src_zip.read(item.filename))
            print("✅ Template font settings with smart sizing applied successfully")

        return temp_docx

    except Exception as e:
        print(f"❌ Font settings application failed: {e}")
        return docx_path
|
| |
|
| |
|
def create_dynamic_font_sizing_rules(docx_path):
    """
    Create dynamic font sizing rules based on actual content analysis.
    This function analyzes the document to create smart sizing rules.

    Args:
        docx_path: path to the .docx archive.

    Returns:
        dict mapping placeholder name -> sizing rule ('max_chars', 'context',
        'base_font_size', 'min_font_size'); empty dict on error.
    """
    try:
        dynamic_rules = {}

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')

                placeholder_pattern = r'\{\{([^}]+)\}\}'
                placeholders = re.findall(placeholder_pattern, doc_content)

                for placeholder in placeholders:
                    # FIX: the old suffix r'\\}}}}' matched a literal backslash
                    # before the closing braces, so '{{name}}' inside a table
                    # cell could never match and every placeholder fell through
                    # to the paragraph defaults.
                    context_pattern = (r'(<w:tc[^>]*>.*?\{\{' +
                                       re.escape(placeholder) +
                                       r'\}\}.*?</w:tc>)')
                    table_cell_match = re.search(context_pattern, doc_content, re.DOTALL)

                    if table_cell_match:
                        cell_content = table_cell_match.group(1)

                        # Estimate usable width from the cell's twip width
                        # (~144 twips per character), with a sane floor.
                        width_match = re.search(r'w:w="(\d+)"', cell_content)
                        if width_match:
                            cell_width = int(width_match.group(1))
                            estimated_chars = max(cell_width // 144, 10)
                        else:
                            estimated_chars = 15

                        # Subtract fixed text already occupying the cell.
                        text_elements = re.findall(r'<w:t[^>]*>([^<]+)</w:t>', cell_content)
                        total_text_length = sum(len(text.replace(f'{{{{{placeholder}}}}}', '')) for text in text_elements)

                        available_chars = max(estimated_chars - total_text_length, 8)

                        dynamic_rules[placeholder] = {
                            'max_chars': available_chars,
                            'context': 'table_cell',
                            'base_font_size': 10,
                            'min_font_size': 7
                        }
                    else:
                        # Free paragraph: more room, slightly larger base size.
                        dynamic_rules[placeholder] = {
                            'max_chars': 25,
                            'context': 'paragraph',
                            'base_font_size': 11,
                            'min_font_size': 8
                        }

        print(f"📏 Created dynamic sizing rules for {len(dynamic_rules)} placeholders")
        return dynamic_rules

    except Exception as e:
        print(f"❌ Dynamic rules creation failed: {e}")
        return {}
|
| |
|
| |
|
def apply_dynamic_font_sizing(docx_path, dynamic_rules, sample_data=None):
    """
    Apply dynamic font sizing based on actual or sample data.

    For every placeholder that has a sizing rule, an optimal font size is
    computed from representative sample text so that the replacement text
    fits its container, and the run holding the placeholder is bound to the
    Arial font family.

    Args:
        docx_path: Path to the source DOCX (never modified in place).
        dynamic_rules: Mapping of placeholder name -> rule dict with
            'max_chars', 'base_font_size', 'min_font_size' and 'context'
            keys, as produced by create_dynamic_font_sizing_rules().
        sample_data: Optional mapping of placeholder name -> representative
            replacement text; when omitted, built-in Arabic worst-case
            samples are used.

    Returns:
        Path to a new temporary DOCX with the sizing applied, or the
        original docx_path when there are no rules or processing fails.
    """
    if not dynamic_rules:
        return docx_path

    try:
        print("🎯 Applying dynamic font sizing based on content analysis...")

        # Built-in worst-case samples (long Arabic names/addresses) used
        # when no real replacement data is supplied.
        if not sample_data:
            sample_data = {
                'name_1': 'محمد عبدالله أحمد الخالدي',
                'name_2': 'فاطمة سعد محمد العتيبي',
                'name_3': 'عبدالرحمن خالد سليمان',
                'id_1': '1234567890',
                'id_2': '0987654321',
                'location_1': 'الرياض - حي الملك فهد - شارع الأمير محمد بن عبدالعزيز',
                'location_2': 'جدة - حي الصفا - طريق الملك عبدالعزيز',
                'phone_1': '+966501234567',
                'phone_2': '+966509876543'
            }

        # Work on a private copy so the caller's file stays untouched.
        # NamedTemporaryFile replaces the race-prone tempfile.mktemp().
        tmp = tempfile.NamedTemporaryFile(suffix='.docx', delete=False)
        temp_docx = tmp.name
        tmp.close()
        shutil.copy2(docx_path, temp_docx)

        # Read all members first, then rewrite the archive from scratch.
        # (Re-adding 'word/document.xml' to a ZipFile opened in 'a' mode
        # creates a duplicate entry, which corrupts the DOCX package for
        # some consumers.)
        with zipfile.ZipFile(temp_docx, 'r') as docx_zip:
            entries = {name: docx_zip.read(name) for name in docx_zip.namelist()}

        if 'word/document.xml' in entries:
            doc_content = entries['word/document.xml'].decode('utf-8')

            for placeholder, rules in dynamic_rules.items():
                if placeholder not in sample_data:
                    continue
                sample_text = sample_data[placeholder]

                # Best size for the sample text, clamped at the rule's floor.
                optimal_size = calculate_optimal_font_size(
                    sample_text,
                    max_width_chars=rules['max_chars'],
                    base_font_size=rules['base_font_size']
                )
                optimal_size = max(optimal_size, rules['min_font_size'])

                # OOXML stores font sizes in half-points.
                optimal_size_half_points = int(optimal_size * 2)

                pattern = f'{{{{{placeholder}}}}}'
                if pattern not in doc_content:
                    continue

                # Rewrite the <w:sz> value of the run holding the placeholder.
                placeholder_pattern = r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")'
                doc_content = re.sub(
                    placeholder_pattern,
                    f'\\g<1>{optimal_size_half_points}\\g<2>',
                    doc_content,
                    flags=re.DOTALL
                )

                # Force the ASCII font of that run to Arial.
                placeholder_font_pattern = r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:rFonts[^>]*w:ascii=")[^"]*(")'
                doc_content = re.sub(
                    placeholder_font_pattern,
                    r'\g<1>Arial\g<2>',
                    doc_content,
                    flags=re.DOTALL
                )

                # Ensure the run carries an explicit Arial <w:rFonts> binding,
                # inserting run properties when they are missing entirely.
                placeholder_run_pattern = r'(<w:r[^>]*>)(.*?' + re.escape(pattern) + r'.*?)(</w:r>)'

                def add_font_binding(match):
                    run_start = match.group(1)
                    run_content = match.group(2)
                    run_end = match.group(3)

                    if '<w:rPr>' in run_content:
                        if '<w:rFonts' not in run_content:
                            run_content = run_content.replace(
                                '<w:rPr>',
                                '<w:rPr><w:rFonts w:ascii="Arial" w:hAnsi="Arial" w:cs="Arial"/>'
                            )
                    else:
                        # <w:rPr> must be the first child of <w:r>.
                        run_content = '<w:rPr><w:rFonts w:ascii="Arial" w:hAnsi="Arial" w:cs="Arial"/></w:rPr>' + run_content

                    return run_start + run_content + run_end

                doc_content = re.sub(placeholder_run_pattern, add_font_binding, doc_content, flags=re.DOTALL)

                print(f"🎯 {placeholder}: {optimal_size}pt Arial (max chars: {rules['max_chars']}, context: {rules['context']})")

            # Rebuild the archive with the (possibly) updated document part.
            entries['word/document.xml'] = doc_content.encode('utf-8')
            with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as docx_zip:
                for name, data in entries.items():
                    docx_zip.writestr(name, data)
            print("✅ Dynamic font sizing applied successfully")

        return temp_docx

    except Exception as e:
        print(f"❌ Dynamic font sizing failed: {e}")
        return docx_path
|
| |
|
| |
|
def preprocess_docx_for_perfect_conversion(docx_path, validation_info):
    """
    Advanced DOCX preprocessing to ensure maximum formatting preservation.

    Applies template font settings and dynamic font sizing, then removes or
    simplifies elements LibreOffice handles poorly (TextBoxes, SmartArt
    tags, complex shapes) and normalizes table structure.

    Args:
        docx_path: Path to the DOCX to prepare.
        validation_info: Analysis dict with optional flags such as
            'has_textboxes', 'has_smartart', 'has_complex_shapes' and
            'table_structure_issues'.

    Returns:
        Path to a preprocessed temporary DOCX, or the (possibly font-tuned)
        input path when no structural preprocessing is needed or on error.
    """
    # Template documents get dedicated font settings first.
    if 'template.docx' in docx_path:
        docx_path = apply_template_font_settings(docx_path, validation_info)

    # Content-driven font sizing happens before any structural clean-up.
    dynamic_rules = create_dynamic_font_sizing_rules(docx_path)
    if dynamic_rules:
        docx_path = apply_dynamic_font_sizing(docx_path, dynamic_rules)

    if not validation_info.get('has_textboxes') and not validation_info.get('has_smartart') and not validation_info.get('has_complex_shapes'):
        print("✅ DOCX structure is optimal - no additional preprocessing needed")
        return docx_path

    try:
        print("🔧 Preprocessing DOCX for perfect conversion...")

        # Work on a copy; NamedTemporaryFile avoids tempfile.mktemp()'s
        # filename race condition.
        tmp = tempfile.NamedTemporaryFile(suffix='.docx', delete=False)
        temp_docx = tmp.name
        tmp.close()
        shutil.copy2(docx_path, temp_docx)

        # Read every member up front; the archive is rewritten below instead
        # of being appended to, because re-adding 'word/document.xml' in 'a'
        # mode would leave a duplicate entry in the package.
        with zipfile.ZipFile(temp_docx, 'r') as docx_zip:
            entries = {name: docx_zip.read(name) for name in docx_zip.namelist()}

        if 'word/document.xml' in entries:
            doc_content = entries['word/document.xml'].decode('utf-8')
            modifications_made = False

            # TextBoxes often shift during conversion: keep their text as
            # plain paragraphs instead.
            if validation_info.get('has_textboxes'):
                print(" • Converting TextBoxes to regular paragraphs...")
                textbox_pattern = r'<w:textbox[^>]*>.*?</w:textbox>'
                textboxes = re.findall(textbox_pattern, doc_content, re.DOTALL)
                for textbox in textboxes:
                    text_content = re.sub(r'<[^>]+>', '', textbox)
                    if text_content.strip():
                        paragraph = f'<w:p><w:r><w:t>{text_content.strip()}</w:t></w:r></w:p>'
                        doc_content = doc_content.replace(textbox, paragraph)
                        modifications_made = True

            # NOTE(review): this strips <w:smartTag> elements; real SmartArt
            # lives in DrawingML parts — confirm this matches the documents
            # seen in practice.
            if validation_info.get('has_smartart'):
                print(" • Removing SmartArt elements...")
                smartart_pattern = r'<w:smartTag[^>]*>.*?</w:smartTag>'
                doc_content = re.sub(smartart_pattern, '', doc_content, flags=re.DOTALL)
                modifications_made = True

            if validation_info.get('has_complex_shapes'):
                print(" • Simplifying complex shapes...")
                # Drop grouped shapes entirely, then flatten single shapes
                # down to their plain-text content.
                shape_group_pattern = r'<w:group[^>]*>.*?</w:group>'
                doc_content = re.sub(shape_group_pattern, '', doc_content, flags=re.DOTALL)

                shape_pattern = r'<w:shape[^>]*>.*?</w:shape>'
                shapes = re.findall(shape_pattern, doc_content, re.DOTALL)
                for shape in shapes:
                    text_content = re.sub(r'<[^>]+>', '', shape)
                    if text_content.strip():
                        paragraph = f'<w:p><w:r><w:t>{text_content.strip()}</w:t></w:r></w:p>'
                        doc_content = doc_content.replace(shape, paragraph)
                    else:
                        doc_content = doc_content.replace(shape, '')
                modifications_made = True

            if validation_info.get('table_structure_issues'):
                print(" • Optimizing table structure...")
                # Zero-width tables render unpredictably: force full width.
                doc_content = re.sub(
                    r'<w:tblW w:w="0"[^>]*/>',
                    '<w:tblW w:w="5000" w:type="pct"/>',
                    doc_content
                )
                # Truly empty cells can collapse; give them a space run.
                empty_cell_pattern = r'<w:tc>\s*</w:tc>'
                doc_content = re.sub(
                    empty_cell_pattern,
                    '<w:tc><w:p><w:r><w:t> </w:t></w:r></w:p></w:tc>',
                    doc_content
                )
                modifications_made = True

            if modifications_made:
                # Rewrite the whole archive so the document part is replaced
                # (not duplicated).
                entries['word/document.xml'] = doc_content.encode('utf-8')
                with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as docx_zip:
                    for name, data in entries.items():
                        docx_zip.writestr(name, data)
                print("✅ DOCX preprocessing completed successfully")
            else:
                print("ℹ️ No modifications were needed")

        return temp_docx

    except Exception as e:
        print(f"❌ DOCX preprocessing failed: {e}")
        print(" • Continuing with original file...")
        return docx_path
|
| |
|
| |
|
def validate_pdf_output(pdf_path, expected_info):
    """Validate PDF output against expected metrics.

    Args:
        pdf_path: Path of the generated PDF file.
        expected_info: Analysis dict of the source DOCX; the optional keys
            'has_tables', 'has_images' and 'font_families' feed the report.

    Returns:
        Dict with 'file_size_mb', 'file_exists', 'size_reasonable',
        'warnings' and 'success_metrics'. On failure (e.g. missing file)
        the dict reports file_exists=False and carries the error message
        in 'warnings'.
    """
    try:
        pdf_size = os.path.getsize(pdf_path)

        validation_results = {
            'file_size_mb': round(pdf_size / (1024 * 1024), 2),
            'file_exists': True,
            'size_reasonable': 0.1 <= pdf_size / (1024 * 1024) <= 100,
            'warnings': [],
            'success_metrics': []
        }

        # Flag implausible sizes (truncated output or runaway embedding).
        if pdf_size < 1024:
            validation_results['warnings'].append("PDF file size is suspiciously small")
        elif pdf_size > 100 * 1024 * 1024:
            validation_results['warnings'].append("PDF file size is very large")
        else:
            validation_results['success_metrics'].append("PDF file size is reasonable")

        # Use .get() throughout: expected_info may omit analysis keys when
        # DOCX inspection was partial. (Direct indexing used to raise
        # KeyError here, which the broad except below then mislabelled as a
        # failed validation of an existing file.)
        if expected_info.get('has_tables'):
            validation_results['success_metrics'].append("Document contains tables - formatting preservation critical")

        if expected_info.get('has_images'):
            validation_results['success_metrics'].append("Document contains images - quality preservation applied")

        font_families = expected_info.get('font_families')
        if font_families:
            validation_results['success_metrics'].append(f"Font substitution applied for {len(font_families)} font families")

        print(f"PDF Validation: Size={validation_results['file_size_mb']}MB, "
              f"Warnings={len(validation_results['warnings'])}, "
              f"Success_metrics={len(validation_results['success_metrics'])}")

        return validation_results

    except Exception as e:
        print(f"PDF validation error: {e}")
        return {'file_size_mb': 0, 'file_exists': False, 'size_reasonable': False,
                'warnings': [f"Validation error: {e}"], 'success_metrics': []}
|
| |
|
| |
|
def post_process_pdf_for_perfect_formatting(pdf_path, docx_info):
    """
    Advanced PDF post-processing to ensure perfect formatting preservation.

    Uses PyMuPDF (when available) to open the generated PDF and verify that
    placeholders, Arabic RTL text and table structures survived conversion.

    Args:
        pdf_path: Path to the PDF produced by the conversion step.
        docx_info: Analysis dict of the source DOCX; reads the optional keys
            'placeholder_count', 'rtl_content_detected' and 'has_tables'.

    Returns:
        Dict with counters ('pages_processed', 'placeholders_verified',
        'tables_verified', 'arabic_text_verified', 'layout_issues_fixed')
        plus 'warnings' and 'success_metrics' message lists. Never raises;
        failures are reported inside the returned dict.
    """
    try:
        # PyMuPDF has been importable under several names across versions;
        # try each spelling before degrading gracefully.
        fitz = None
        try:
            import fitz
        except ImportError:
            try:
                from pymupdf import fitz
            except ImportError:
                try:
                    import pymupdf as fitz
                except ImportError:
                    fitz = None

        # Without PyMuPDF we cannot inspect the PDF: return a neutral result
        # so callers still receive the expected structure.
        if fitz is None:
            print("⚠️ PyMuPDF not available - skipping advanced post-processing")
            return {
                'pages_processed': 0,
                'placeholders_verified': 0,
                'tables_verified': 0,
                'arabic_text_verified': 0,
                'layout_issues_fixed': 0,
                'warnings': ['PyMuPDF not available for advanced verification'],
                'success_metrics': ['Basic PDF validation completed']
            }

        print("🔍 Post-processing PDF for perfect formatting...")

        doc = fitz.open(pdf_path)

        post_process_results = {
            'pages_processed': len(doc),
            'placeholders_verified': 0,
            'tables_verified': 0,
            'arabic_text_verified': 0,
            'layout_issues_fixed': 0,
            'warnings': [],
            'success_metrics': []
        }

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Structured text (blocks/lines/spans) used for font-size checks.
            text_dict = page.get_text("dict")

            # Count surviving {{placeholder}} tokens on this page.
            if docx_info.get('placeholder_count', 0) > 0:
                placeholder_pattern = r'\{\{[^}]+\}\}'
                page_text = page.get_text()
                found_placeholders = re.findall(placeholder_pattern, page_text)
                post_process_results['placeholders_verified'] += len(found_placeholders)

                # NOTE(review): this compares a single page's count against
                # the whole document's expected total, so any multi-page
                # document will warn on every page — confirm intent.
                if len(found_placeholders) != docx_info.get('placeholder_count', 0):
                    post_process_results['warnings'].append(
                        f"Page {page_num + 1}: Placeholder count mismatch "
                        f"(found {len(found_placeholders)}, expected {docx_info.get('placeholder_count', 0)})"
                    )

            # Verify Arabic glyphs actually rendered; the pattern covers the
            # main Arabic Unicode blocks plus presentation forms.
            if docx_info.get('rtl_content_detected', False):
                arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
                page_text = page.get_text()
                arabic_chars = len(re.findall(arabic_pattern, page_text))
                post_process_results['arabic_text_verified'] += arabic_chars

                if arabic_chars > 0:
                    post_process_results['success_metrics'].append(
                        f"Page {page_num + 1}: {arabic_chars} Arabic characters rendered correctly"
                    )

            # Table verification: prefer PyMuPDF's detector; fall back to a
            # text heuristic when the API is missing or fails.
            if docx_info.get('has_tables', False):
                try:
                    tables = page.find_tables()
                    if tables and hasattr(tables, '__len__'):
                        table_count = len(tables)
                        post_process_results['tables_verified'] += table_count
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: {table_count} tables preserved"
                        )
                    elif tables:
                        # Truthy but unsized result (older API shape):
                        # count it as a single detected table.
                        post_process_results['tables_verified'] += 1
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: Table structure detected"
                        )
                except Exception:
                    # Heuristic fallback: lines containing tabs or runs of
                    # spaces suggest columnar (table-like) layout.
                    page_text = page.get_text()
                    lines = page_text.split('\n')
                    table_like_lines = [line for line in lines if '\t' in line or '  ' in line]
                    if len(table_like_lines) > 2:
                        post_process_results['tables_verified'] += 1
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: Table-like structure detected (fallback method)"
                        )
                    post_process_results['warnings'].append(
                        f"Page {page_num + 1}: Table detection method failed, used fallback"
                    )

            # Flag spans with a sub-1pt font size: usually a rendering defect.
            blocks = text_dict.get("blocks", [])
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line.get("spans", []):
                            font_size = span.get("size", 0)
                            if font_size < 1:
                                post_process_results['warnings'].append(
                                    f"Page {page_num + 1}: Suspiciously small text detected (size: {font_size})"
                                )

        doc.close()

        # Document-level summary metrics derived from the per-page counters.
        if post_process_results['placeholders_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"All {post_process_results['placeholders_verified']} placeholders preserved"
            )

        if post_process_results['arabic_text_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"Arabic RTL text verified: {post_process_results['arabic_text_verified']} characters"
            )

        if post_process_results['tables_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"Table structure preserved: {post_process_results['tables_verified']} tables"
            )

        print(f"✅ PDF post-processing completed:")
        print(f" • Pages processed: {post_process_results['pages_processed']}")
        print(f" • Placeholders verified: {post_process_results['placeholders_verified']}")
        print(f" • Arabic characters verified: {post_process_results['arabic_text_verified']}")
        print(f" • Tables verified: {post_process_results['tables_verified']}")
        print(f" • Warnings: {len(post_process_results['warnings'])}")

        return post_process_results

    except Exception as e:
        # Any unexpected failure is folded into a result dict so the caller
        # never has to guard this call with its own try/except.
        print(f"❌ PDF post-processing error: {e}")
        return {
            'pages_processed': 0,
            'placeholders_verified': 0,
            'tables_verified': 0,
            'arabic_text_verified': 0,
            'layout_issues_fixed': 0,
            'warnings': [f'Post-processing error: {e}'],
            'success_metrics': []
        }
|
| |
|
| |
|
def analyze_conversion_error(stderr, stdout, docx_info):
    """Analyze conversion errors and provide helpful diagnostics.

    Matches the converter's captured output against known failure
    signatures and enriches the result with document-specific hints.

    Args:
        stderr: Captured standard-error text of the conversion run.
        stdout: Captured standard-output text of the conversion run.
        docx_info: Document analysis dict; every key is treated as optional.

    Returns:
        Multi-line diagnostic string (never raises).
    """
    error_analysis = []

    # Known failure signatures grouped by category.
    error_patterns = {
        'font': ['font', 'typeface', 'glyph'],
        'memory': ['memory', 'heap', 'out of memory'],
        'file_access': ['permission', 'access', 'file not found', 'cannot open'],
        'format': ['format', 'corrupt', 'invalid', 'malformed'],
        'timeout': ['timeout', 'time out', 'expired'],
        'display': ['display', 'x11', 'xvfb', 'screen']
    }

    stderr_lower = stderr.lower()
    stdout_lower = stdout.lower()
    combined_output = stderr_lower + " " + stdout_lower

    for error_type, keywords in error_patterns.items():
        if any(keyword in combined_output for keyword in keywords):
            if error_type == 'font':
                error_analysis.append("🔤 Font-related issue detected:")
                error_analysis.append(" • Possible missing font substitution")
                error_analysis.append(" • Enhanced font packages should resolve this")
                # .get() instead of direct indexing: the analysis dict may
                # omit 'font_families' when DOCX inspection was partial.
                font_families = docx_info.get('font_families')
                if font_families:
                    error_analysis.append(f" • Document uses fonts: {list(font_families)[:3]}")

            elif error_type == 'memory':
                error_analysis.append("💾 Memory issue detected:")
                error_analysis.append(" • Document may be too large or complex")
                error_analysis.append(" • Try with a smaller document first")

            elif error_type == 'file_access':
                error_analysis.append("📁 File access issue detected:")
                error_analysis.append(" • Temporary file permissions problem")
                error_analysis.append(" • This should resolve on retry")

            elif error_type == 'format':
                error_analysis.append("📄 Document format issue detected:")
                error_analysis.append(" • DOCX file may be corrupted or invalid")
                error_analysis.append(" • Try opening in Word and re-saving")

            elif error_type == 'timeout':
                error_analysis.append("⏱️ Timeout issue detected:")
                error_analysis.append(" • Document conversion took too long")
                error_analysis.append(" • Complex documents may need more time")

            elif error_type == 'display':
                error_analysis.append("🖥️ Display/Graphics issue detected:")
                error_analysis.append(" • Headless display configuration problem")
                error_analysis.append(" • This is a system configuration issue")

    # Document-specific complexity hints.
    if docx_info.get('has_tables'):
        error_analysis.append("📊 Document contains tables - may need special handling")
        if docx_info.get('table_structure_issues'):
            error_analysis.append(f" • Table issues detected: {', '.join(docx_info['table_structure_issues'])}")

    if docx_info.get('has_images'):
        error_analysis.append("🖼️ Document contains images - may affect processing")

    if docx_info.get('has_textboxes'):
        error_analysis.append("📦 Document contains TextBoxes - these may cause layout issues")

    if docx_info.get('has_smartart'):
        error_analysis.append("🎨 Document contains SmartArt - these elements may not convert properly")

    if docx_info.get('has_complex_shapes'):
        error_analysis.append("🔷 Document contains complex shapes - these may affect layout")

    if docx_info.get('text_content_length', 0) > 50000:
        error_analysis.append("📝 Large document detected - may need more processing time")

    if docx_info.get('rtl_content_detected'):
        error_analysis.append("🌍 Arabic RTL content detected - ensure Arabic fonts are properly installed")

    if docx_info.get('placeholder_count', 0) > 0:
        error_analysis.append(f"🏷️ Document contains {docx_info['placeholder_count']} placeholders - these must be preserved")

    # Fonts known to need explicit substitution for Arabic documents.
    if docx_info.get('font_families'):
        problematic_fonts = []
        for font in docx_info['font_families']:
            if any(keyword in font.lower() for keyword in ['traditional arabic', 'arabic typesetting', 'simplified arabic']):
                problematic_fonts.append(font)

        if problematic_fonts:
            error_analysis.append(f"🔤 Arabic fonts detected: {', '.join(problematic_fonts[:3])}")
            error_analysis.append(" • Ensure Arabic font substitution is working correctly")

    # Catch-all guidance when nothing above matched.
    if not error_analysis:
        error_analysis.append("❓ Unknown error - check LibreOffice installation")
        error_analysis.append(" • Verify all system dependencies are installed")
        error_analysis.append(" • Try with a simpler test document")

    error_analysis.append("\n💡 Advanced troubleshooting suggestions:")
    error_analysis.append(" • Ensure DOCX file is valid and not corrupted")
    error_analysis.append(" • Try with a smaller or simpler document")
    error_analysis.append(" • Check that all required fonts are available")
    error_analysis.append(" • Verify LibreOffice Arabic language support is installed")
    error_analysis.append(" • Consider preprocessing the document to remove problematic elements")

    return "\n".join(error_analysis)
|
| |
|
| |
|
def generate_comprehensive_quality_report(docx_info, pdf_validation, post_process_results):
    """Assemble the full conversion quality report as one printable string.

    Combines source-document analysis, PDF metrics, verification results,
    aggregated success metrics and warnings, the overall quality score and,
    when applicable, improvement suggestions.
    """
    def _flag(value):
        return 'Yes' if value else 'No'

    lines = [
        "📋 COMPREHENSIVE CONVERSION QUALITY REPORT",
        "=" * 50,
    ]

    # Source document characteristics.
    lines.append("\n📄 DOCUMENT ANALYSIS:")
    lines.append(f" • Text Content: {docx_info.get('text_content_length', 0):,} characters")
    lines.append(f" • Font Families: {len(docx_info.get('font_families', set()))} detected")
    lines.append(f" • Tables: {_flag(docx_info.get('has_tables'))}")
    lines.append(f" • Images: {_flag(docx_info.get('has_images'))}")
    lines.append(f" • Arabic RTL Content: {_flag(docx_info.get('rtl_content_detected'))}")
    lines.append(f" • Placeholders: {docx_info.get('placeholder_count', 0)}")

    # Structural risk flags, collected in one pass.
    issues = [
        label
        for key, label in (
            ('has_textboxes', "TextBoxes detected"),
            ('has_smartart', "SmartArt elements detected"),
            ('has_complex_shapes', "Complex shapes detected"),
        )
        if docx_info.get(key)
    ]
    issues.extend(docx_info.get('table_structure_issues') or [])

    if issues:
        lines.append(f" • Potential Issues: {', '.join(issues)}")
    else:
        lines.append(" • Potential Issues: None detected")

    # PDF-level metrics.
    lines.append("\n📊 PDF QUALITY METRICS:")
    lines.append(f" • File Size: {pdf_validation.get('file_size_mb', 0)} MB")
    lines.append(f" • Pages Processed: {post_process_results.get('pages_processed', 0)}")

    # Verification counters from post-processing.
    lines.append("\n✅ VERIFICATION RESULTS:")
    verified = post_process_results.get('placeholders_verified', 0)
    if verified > 0:
        accuracy = (verified / max(docx_info.get('placeholder_count', 1), 1)) * 100
        lines.append(f" • Placeholder Preservation: {accuracy:.1f}% "
                     f"({verified}/{docx_info.get('placeholder_count', 0)})")

    arabic_count = post_process_results.get('arabic_text_verified', 0)
    if arabic_count > 0:
        lines.append(f" • Arabic Text Verified: {arabic_count:,} characters")

    tables_count = post_process_results.get('tables_verified', 0)
    if tables_count > 0:
        lines.append(f" • Tables Preserved: {tables_count}")

    # Aggregate the two sources of success metrics and warnings.
    successes = (pdf_validation.get('success_metrics', []) +
                 post_process_results.get('success_metrics', []))
    if successes:
        lines.append("\n🎯 SUCCESS METRICS:")
        lines.extend(f" ✓ {metric}" for metric in successes)

    warnings = (pdf_validation.get('warnings', []) +
                post_process_results.get('warnings', []))
    if warnings:
        lines.append("\n⚠️ WARNINGS:")
        lines.extend(f" • {warning}" for warning in warnings)

    # Overall verdict: first matching tier wins.
    quality_score = calculate_quality_score(docx_info, pdf_validation, post_process_results)
    lines.append(f"\n🏆 OVERALL QUALITY SCORE: {quality_score:.1f}%")

    tiers = (
        (99, "🌟 EXCELLENT: Pixel-perfect conversion achieved!"),
        (95, "✅ VERY GOOD: High-quality conversion with minor variations"),
        (90, "👍 GOOD: Acceptable conversion quality"),
        (80, "⚠️ FAIR: Some quality issues detected"),
        (70, "❌ POOR: Significant quality issues"),
    )
    for threshold, verdict in tiers:
        if quality_score >= threshold:
            lines.append(verdict)
            break
    else:
        lines.append("🚨 CRITICAL: Major conversion problems")

    # Optional improvement tips block.
    suggestions = suggest_quality_improvements(docx_info, pdf_validation, post_process_results, quality_score)
    if suggestions:
        lines.append("\n" + "\n".join(suggestions))

    return "\n".join(lines)
|
| |
|
| |
|
def calculate_quality_score(docx_info, pdf_validation, post_process_results):
    """Compute an overall conversion quality score, clamped to [0, 100].

    Starts from 100 and applies bonuses/penalties for warnings,
    placeholder accuracy, Arabic/table verification, structural complexity
    flags, PDF size plausibility, success metrics and processed pages.
    """
    score = 100.0

    # Warnings: severe wording costs 5 points each, everything else 2.
    all_warnings = (pdf_validation.get('warnings', []) +
                    post_process_results.get('warnings', []))
    severe_markers = ('error', 'failed', 'missing', 'corrupted')
    critical_warnings = sum(
        1 for warning in all_warnings
        if any(marker in warning.lower() for marker in severe_markers)
    )
    minor_warnings = len(all_warnings) - critical_warnings
    score -= critical_warnings * 5
    score -= minor_warnings * 2

    # Placeholder preservation accuracy (up to 15-point penalty); a small
    # bonus when no placeholders were expected and none appeared.
    expected_placeholders = docx_info.get('placeholder_count', 0)
    verified_placeholders = post_process_results.get('placeholders_verified', 0)
    if expected_placeholders > 0:
        score -= (1 - verified_placeholders / expected_placeholders) * 15
    elif verified_placeholders == 0:
        score += 2

    # Arabic RTL verification: reward rendered glyphs, punish their absence.
    if docx_info.get('rtl_content_detected', False):
        score += 5 if post_process_results.get('arabic_text_verified', 0) > 0 else -10

    # Table verification: same reward/penalty pattern.
    if docx_info.get('has_tables', False):
        score += 3 if post_process_results.get('tables_verified', 0) > 0 else -8

    # Images present: small bonus.
    if docx_info.get('has_images', False):
        score += 2

    # Structural complexity flags each shave a few points.
    for key, penalty in (('has_textboxes', 3), ('has_smartart', 3), ('has_complex_shapes', 2)):
        if docx_info.get(key):
            score -= penalty

    # Each known table-structure issue costs 3 points.
    score -= 3 * len(docx_info.get('table_structure_issues', []))

    # PDF size plausibility.
    pdf_size = pdf_validation.get('file_size_mb', 0)
    if pdf_size > 0:
        if 0.01 <= pdf_size <= 50:
            score += 2
        elif pdf_size > 50:
            score -= 3
        elif pdf_size < 0.01:
            score -= 5

    # Accumulated success metrics: half a point each, capped at 5.
    success_count = (len(pdf_validation.get('success_metrics', [])) +
                     len(post_process_results.get('success_metrics', [])))
    score += min(success_count * 0.5, 5)

    # Having processed at least one page is itself a signal.
    score += 3 if post_process_results.get('pages_processed', 0) > 0 else -5

    return max(0, min(100, score))
|
| |
|
| |
|
def suggest_quality_improvements(docx_info, pdf_validation, post_process_results, quality_score):
    """Return a list of targeted improvement tips for sub-par conversions.

    A score of 90 or above yields a single all-clear message; lower scores
    yield a header followed by tips matching the specific problems observed.
    """
    # Guard clause: nothing to suggest for high-quality conversions.
    if quality_score >= 90:
        return ["✅ EXCELLENT QUALITY - No improvements needed!"]

    tips = ["🔧 IMPROVEMENT SUGGESTIONS:"]

    # Fewer placeholders survived than the source document contained.
    if post_process_results.get('placeholders_verified', 0) < docx_info.get('placeholder_count', 0):
        tips.append(" • Placeholder positioning issues detected - consider document restructuring")

    # Any hard-to-convert construct warrants a manual check.
    if any(docx_info.get(key) for key in ('has_textboxes', 'has_smartart', 'has_complex_shapes')):
        tips.append(" • Complex elements detected - preprocessing applied but manual review recommended")

    if docx_info.get('table_structure_issues'):
        tips.append(" • Table structure issues found - consider simplifying table layouts")

    # Arabic content was expected but none was verified in the PDF.
    if docx_info.get('rtl_content_detected') and post_process_results.get('arabic_text_verified', 0) == 0:
        tips.append(" • Arabic text verification failed - check font installation")

    warning_count = (len(pdf_validation.get('warnings', [])) +
                     len(post_process_results.get('warnings', [])))
    if warning_count > 2:
        tips.append(f" • Multiple warnings detected ({warning_count}) - review document complexity")

    # Escalating advice for progressively lower scores.
    if quality_score < 80:
        tips.append(" • Consider breaking complex document into smaller sections")
        tips.append(" • Verify document is not corrupted in original Word application")

    if quality_score < 70:
        tips.append(" • Document may require manual optimization before conversion")
        tips.append(" • Contact support for complex document handling")

    return tips
|
| |
|
| |
|
| | def create_libreoffice_config(temp_path):
|
| | """Create comprehensive LibreOffice configuration for PERFECT Arabic RTL formatting preservation"""
|
| | config_dir = temp_path / ".config" / "libreoffice" / "4" / "user"
|
| | config_dir.mkdir(parents=True, exist_ok=True)
|
| |
|
| | try:
|
| | config_dir.chmod(0o777)
|
| | except PermissionError:
|
| |
|
| | pass
|
| |
|
| |
|
| | registry_config = config_dir / "registrymodifications.xcu"
|
| |
|
| |
|
| | try:
|
| | registry_config.touch()
|
| | registry_config.chmod(0o666)
|
| | except PermissionError:
|
| |
|
| | pass
|
| |
|
| | config_content = '''<?xml version="1.0" encoding="UTF-8"?>
|
| | <oor:items xmlns:oor="http://openoffice.org/2001/registry" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
| | <!-- Disable first start wizard and user installation -->
|
| | <item oor:path="/org.openoffice.Setup/Office/Factories/org.openoffice.Setup:Factory['com.sun.star.comp.framework.ProtocolHandler']">
|
| | <prop oor:name="FirstStartWizardCompleted" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- CRITICAL: Completely disable Java to prevent javaldx errors -->
|
| | <item oor:path="/org.openoffice.Setup/Office">
|
| | <prop oor:name="JavaSupport" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Disable Java security to prevent javaldx errors -->
|
| | <item oor:path="/org.openoffice.Office.Java">
|
| | <prop oor:name="Enabled" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- PDF Export Settings for Maximum Quality with Arabic Support -->
|
| | <item oor:path="/org.openoffice.Office.Common/Filter/PDF/Export">
|
| | <prop oor:name="Quality" oor:op="fuse">
|
| | <value>100</value>
|
| | </prop>
|
| | <prop oor:name="ReduceImageResolution" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="MaxImageResolution" oor:op="fuse">
|
| | <value>600</value>
|
| | </prop>
|
| | <prop oor:name="UseTaggedPDF" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="ExportFormFields" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="FormsType" oor:op="fuse">
|
| | <value>0</value>
|
| | </prop>
|
| | <prop oor:name="AllowDuplicateFieldNames" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="EmbedStandardFonts" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="FontEmbedding" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="CompressMode" oor:op="fuse">
|
| | <value>0</value>
|
| | </prop>
|
| | <prop oor:name="JPEGQuality" oor:op="fuse">
|
| | <value>100</value>
|
| | </prop>
|
| | <prop oor:name="SelectPdfVersion" oor:op="fuse">
|
| | <value>1</value>
|
| | </prop>
|
| | <prop oor:name="ExportBookmarks" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="OpenBookmarkLevels" oor:op="fuse">
|
| | <value>-1</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Arabic and RTL Language Support -->
|
| | <item oor:path="/org.openoffice.Office.Linguistic/General">
|
| | <prop oor:name="DefaultLocale" oor:op="fuse">
|
| | <value>ar-SA</value>
|
| | </prop>
|
| | <prop oor:name="DefaultLocale_CJK" oor:op="fuse">
|
| | <value>ar-SA</value>
|
| | </prop>
|
| | <prop oor:name="DefaultLocale_CTL" oor:op="fuse">
|
| | <value>ar-SA</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- CTL (Complex Text Layout) Settings for Arabic -->
|
| | <item oor:path="/org.openoffice.Office.Common/I18N/CTL">
|
| | <prop oor:name="CTLFont" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="CTLSequenceChecking" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="CTLCursorMovement" oor:op="fuse">
|
| | <value>1</value>
|
| | </prop>
|
| | <prop oor:name="CTLTextNumerals" oor:op="fuse">
|
| | <value>1</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Enhanced Font Substitution Settings for Local Arial and Arabic Compatibility -->
|
| | <item oor:path="/org.openoffice.VCL/FontSubstitution">
|
| | <prop oor:name="FontSubstituteTable" oor:op="fuse">
|
| | <value>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Arial</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Arial</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Liberation Sans</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Calibri</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Liberation Serif</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Cambria</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Liberation Serif</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Times New Roman</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Liberation Mono</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Courier New</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Amiri</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Traditional Arabic</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Amiri</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Arabic Typesetting</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>Noto Naskh Arabic</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Simplified Arabic</value>
|
| | </prop>
|
| | </it>
|
| | <it>
|
| | <prop oor:name="SubstituteFont">
|
| | <value>DejaVu Sans</value>
|
| | </prop>
|
| | <prop oor:name="OriginalFont">
|
| | <value>Tahoma</value>
|
| | </prop>
|
| | </it>
|
| | </value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Writer Settings for Perfect Layout Preservation with RTL Support -->
|
| | <item oor:path="/org.openoffice.Office.Writer/Layout/Other">
|
| | <prop oor:name="MeasureUnit" oor:op="fuse">
|
| | <value>6</value>
|
| | </prop>
|
| | <prop oor:name="TabStop" oor:op="fuse">
|
| | <value>1270</value>
|
| | </prop>
|
| | <prop oor:name="IsSquaredPageMode" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="ApplyCharUnit" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="IsAlignTabStopPosition" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Enhanced Table Settings for Exact Formatting -->
|
| | <item oor:path="/org.openoffice.Office.Writer/Layout/Table">
|
| | <prop oor:name="Header" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="RepeatHeader" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="DontSplit" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="Border" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="InsertLabel" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Page Layout Settings for A4 and RTL -->
|
| | <item oor:path="/org.openoffice.Office.Writer/Layout/Page">
|
| | <prop oor:name="IsLandscape" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="Width" oor:op="fuse">
|
| | <value>21000</value>
|
| | </prop>
|
| | <prop oor:name="Height" oor:op="fuse">
|
| | <value>29700</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Default Font Settings with Local Arial Priority -->
|
| | <item oor:path="/org.openoffice.Office.Writer/DefaultFont">
|
| | <prop oor:name="Document" oor:op="fuse">
|
| | <value>true</value>
|
| | </prop>
|
| | <prop oor:name="Standard" oor:op="fuse">
|
| | <value>Arial;Liberation Sans;DejaVu Sans</value>
|
| | </prop>
|
| | <prop oor:name="Heading" oor:op="fuse">
|
| | <value>Arial;Liberation Sans;DejaVu Sans</value>
|
| | </prop>
|
| | <prop oor:name="List" oor:op="fuse">
|
| | <value>Arial;Liberation Sans;Amiri;Noto Naskh Arabic</value>
|
| | </prop>
|
| | <prop oor:name="Caption" oor:op="fuse">
|
| | <value>Arial;Liberation Sans;DejaVu Sans</value>
|
| | </prop>
|
| | <prop oor:name="Index" oor:op="fuse">
|
| | <value>Arial;Liberation Sans;DejaVu Sans</value>
|
| | </prop>
|
| | <prop oor:name="StandardHeight" oor:op="fuse">
|
| | <value>12</value>
|
| | </prop>
|
| | <prop oor:name="HeadingHeight" oor:op="fuse">
|
| | <value>14</value>
|
| | </prop>
|
| | <prop oor:name="ListHeight" oor:op="fuse">
|
| | <value>13</value>
|
| | </prop>
|
| | <prop oor:name="CaptionHeight" oor:op="fuse">
|
| | <value>12</value>
|
| | </prop>
|
| | <prop oor:name="IndexHeight" oor:op="fuse">
|
| | <value>12</value>
|
| | </prop>
|
| | </item>
|
| |
|
| | <!-- Disable Auto-formatting Features -->
|
| | <item oor:path="/org.openoffice.Office.Writer/AutoFunction/Format/Option">
|
| | <prop oor:name="UseReplacementTable" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="TwoCapitalsAtStart" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="CapitalAtStartSentence" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="ChgWeightUnderl" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="SetInetAttr" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="ChgToEnEmDash" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="AddNonBrkSpace" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="ChgOrdinalNumber" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="ChgQuotes" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | <prop oor:name="DelEmptyNode" oor:op="fuse">
|
| | <value>false</value>
|
| | </prop>
|
| | </item>
|
| | </oor:items>'''
|
| |
|
| | try:
|
| | with open(registry_config, 'w', encoding='utf-8') as f:
|
| | f.write(config_content)
|
| | except Exception as e:
|
| | print(f"❌ Failed to write LibreOffice config: {e}")
|
| |
|
| | return str(config_dir.parent.parent.parent)
|
| |
|
| |
|
def _cleanup_temp_pdf(path):
    """Best-effort removal of a temporary output PDF; ignores missing files."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


def convert_docx_to_pdf(docx_file):
    """
    Convert DOCX to PDF using LibreOffice headless mode.

    Preserves all formatting including Arabic RTL text.

    Args:
        docx_file: Uploaded file wrapper exposing a ``.name`` filesystem
            path (e.g. a Gradio upload object), or ``None``.

    Returns:
        tuple: ``(pdf_path, message)``. ``pdf_path`` is the path to the
        converted PDF, or ``None`` on failure; ``message`` is a quality
        report on success or a detailed error analysis on failure.
    """
    if docx_file is None:
        return None, "Please upload a DOCX file"

    # Fallback document profile in case structure analysis raises before
    # populating the real one (the error reporters below read from it).
    docx_info = {
        'has_tables': False,
        'has_images': False,
        'text_content_length': 0,
        'font_families': []
    }

    final_output_path = None
    try:
        print("🔍 Analyzing DOCX structure...")
        docx_info = validate_docx_structure(docx_file.name)

        # Reserve the final output file OUTSIDE the temporary directory so
        # it survives the TemporaryDirectory cleanup and can be returned.
        output_fd, final_output_path = tempfile.mkstemp(suffix=".pdf", prefix="converted_")
        os.close(output_fd)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Throwaway LibreOffice profile + fontconfig so the conversion
            # is reproducible and never touches a real user profile.
            config_home = create_libreoffice_config(temp_path)
            fontconfig_home = create_fontconfig(temp_path)

            input_file = temp_path / "input.docx"
            shutil.copy2(docx_file.name, input_file)

            processed_docx = preprocess_docx_for_perfect_conversion(str(input_file), docx_info)
            if processed_docx != str(input_file):
                print("🔧 Using preprocessed DOCX for conversion")
                input_file = Path(processed_docx)

            # Complex documents (text boxes, SmartArt, problematic tables,
            # very long text) get a larger timeout budget.
            needs_aggressive_optimization = (
                docx_info.get('has_textboxes', False) or
                docx_info.get('has_smartart', False) or
                docx_info.get('has_complex_shapes', False) or
                len(docx_info.get('table_structure_issues', [])) > 2 or
                docx_info.get('text_content_length', 0) > 100000
            )
            if needs_aggressive_optimization:
                print("⚠️ Complex document detected - applying aggressive optimization settings")
                conversion_timeout = 180
            else:
                conversion_timeout = 120

            # writer_pdf_Export filter options. NOTE: the original literal
            # carried duplicate keys (AllowDuplicateFieldNames,
            # ExportLinksRelativeFsys, PDFViewSelection,
            # ConvertOOoTargetToPDFTarget, Magnification); Python keeps only
            # the last value per key and every duplicate pair was equal, so
            # deduplicating (first-occurrence order) preserves the exact
            # serialized payload.
            pdf_export_settings = {
                # Maximum image fidelity
                "Quality": 100,
                "ReduceImageResolution": False,
                "MaxImageResolution": 600,
                "BitmapResolution": 600,
                "ImageResolution": 600,
                "JPEGQuality": 100,
                "CompressMode": 0,

                # Font embedding / accessibility
                "EmbedStandardFonts": True,
                "FontEmbedding": True,
                "UseTaggedPDF": True,
                "EnableTextAccessForAccessibilityTools": True,

                # Strip interactive artifacts
                "ExportFormFields": False,
                "FormsType": 0,
                "ExportBookmarks": False,
                "ExportNotes": False,
                "ExportNotesPages": False,
                "ExportOnlyNotesPages": False,
                "ExportPlaceholders": False,
                "ExportHiddenSlides": False,
                "SinglePageSheets": False,
                "UseTransitionEffects": False,
                "IsSkipEmptyPages": False,
                "IsAddStream": False,
                "AllowDuplicateFieldNames": False,

                # Output / security defaults
                "ColorMode": 0,
                "Watermark": "",
                "EncryptFile": False,
                "DocumentOpenPassword": "",
                "PermissionPassword": "",
                "RestrictPermissions": False,
                "Printing": 2,
                "Changes": 4,
                "EnableCopyingOfContent": True,
                "SelectPdfVersion": 1,
                "ExportLinksRelativeFsys": False,
                "PDFViewSelection": 0,
                "ConvertOOoTargetToPDFTarget": False,
                "ExportBookmarksToPDFDestination": False,

                # Editing / form-widget handling
                "PreserveEditingInPDF": False,
                "ExportFormFieldsAsWidgets": False,
                "FormsFormat": 0,
                "SubmitFormat": 0,
                "ExportEmptyPages": True,
                "ViewPDFAfterExport": False,

                # Viewer presentation
                "UseReferenceXObject": False,
                "HideViewerMenubar": False,
                "HideViewerToolbar": False,
                "HideViewerWindowControls": False,
                "ResizeWindowToInitialPage": False,
                "CenterWindow": False,
                "OpenInFullScreenMode": False,
                "DisplayPDFDocumentTitle": False,

                # Misc layout
                "ExportNotesInMargin": False,
                "Magnification": 0,
                "PageLayout": 0,
                "FirstPageOnLeft": False,
                "InitialView": 0
            }

            pdf_filter = f'pdf:writer_pdf_Export:{json.dumps(pdf_export_settings, separators=(",", ":"))}'

            cmd = [
                "libreoffice",
                "--headless",
                "--invisible",
                "--nodefault",
                "--nolockcheck",
                "--nologo",
                "--norestore",
                "--nofirststartwizard",
                "--safe-mode",
                "--disable-extension-update",
                "--disable-webupdate",
                "--disable-remote-control",
                "--disable-notification",
                "--disable-oop4all",
                "--convert-to", pdf_filter,
                "--outdir", str(temp_path),
                str(input_file)
            ]

            # Point LibreOffice at the throwaway profile and font setup.
            env = os.environ.copy()
            env['HOME'] = config_home
            env['XDG_CONFIG_HOME'] = config_home + "/.config"

            fontconfig_dir = fontconfig_home + "/.config/fontconfig"
            env['FONTCONFIG_PATH'] = fontconfig_dir
            env['FONTCONFIG_FILE'] = fontconfig_dir + "/fonts.conf"

            # Prepend the script directory so locally bundled fonts win.
            script_dir = Path(__file__).parent.absolute()
            if 'FONTPATH' in env:
                env['FONTPATH'] = f"{script_dir}:{env['FONTPATH']}"
            else:
                env['FONTPATH'] = str(script_dir)

            # Arabic locale across every LC_* category for correct RTL
            # shaping, collation and numerals.
            for locale_var in (
                'LANG', 'LC_ALL', 'LC_CTYPE', 'LC_NUMERIC', 'LC_TIME',
                'LC_COLLATE', 'LC_MONETARY', 'LC_MESSAGES', 'LC_PAPER',
                'LC_NAME', 'LC_ADDRESS', 'LC_TELEPHONE', 'LC_MEASUREMENT',
                'LC_IDENTIFICATION'
            ):
                env[locale_var] = 'ar_SA.UTF-8'

            # Headless rendering backend and stability switches.
            env['SAL_USE_VCLPLUGIN'] = 'svp'
            env['DISPLAY'] = ':99'
            env['OOO_FORCE_DESKTOP'] = 'gnome'
            env['SAL_NO_MOUSEGRABS'] = '1'
            env['SAL_RTL_ENABLED'] = '1'
            env['OOO_DISABLE_RECOVERY'] = '1'

            # Disable Java entirely (not needed for Writer -> PDF).
            env['SAL_DISABLE_JAVA_SECURITY'] = '1'
            env['SAL_DISABLE_JAVA'] = '1'
            env['SAL_JAVA_DISABLE_SECURITY'] = '1'
            env['UNO_PATH'] = '/usr/lib/libreoffice/program'
            env['LIBO_JAVA_PARALLEL'] = '0'
            env['LIBO_DISABLE_JAVA'] = '1'

            env['SAL_DISABLE_OPENCL'] = '1'
            env['SAL_DISABLE_VCLPLUGIN'] = '1'

            print("🚀 Executing LibreOffice conversion with MAXIMUM quality settings...")
            print(f"Command: {' '.join(cmd[:11])}... [truncated for readability]")
            print(f"Environment: HOME={env.get('HOME', 'default')}, LANG={env.get('LANG', 'default')}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=conversion_timeout,
                cwd=temp_path,
                env=env
            )

            print("📊 LibreOffice execution completed:")
            print(f"  • Return code: {result.returncode}")
            print(f"  • Output length: {len(result.stdout)} chars")
            print(f"  • Error length: {len(result.stderr)} chars")

            if result.stdout:
                print(f"  • LibreOffice stdout: {result.stdout[:200]}...")
            if result.stderr:
                print(f"  • LibreOffice stderr: {result.stderr[:200]}...")

            # Fallback: retry with unoconv when direct conversion failed.
            if result.returncode != 0:
                print("⚠️ LibreOffice conversion failed, trying fallback with unoconv...")
                try:
                    unoconv_cmd = [
                        "unoconv",
                        "-f", "pdf",
                        "-o", str(temp_path),
                        str(input_file)
                    ]

                    print("🚀 Executing unoconv conversion...")
                    print(f"Command: {' '.join(unoconv_cmd)}")

                    unoconv_result = subprocess.run(
                        unoconv_cmd,
                        capture_output=True,
                        text=True,
                        timeout=conversion_timeout,
                        cwd=temp_path,
                        env=env
                    )

                    print("📊 unoconv execution completed:")
                    print(f"  • Return code: {unoconv_result.returncode}")
                    print(f"  • Output length: {len(unoconv_result.stdout)} chars")
                    print(f"  • Error length: {len(unoconv_result.stderr)} chars")

                    if unoconv_result.stdout:
                        print(f"  • unoconv stdout: {unoconv_result.stdout[:200]}...")
                    if unoconv_result.stderr:
                        print(f"  • unoconv stderr: {unoconv_result.stderr[:200]}...")

                    if unoconv_result.returncode == 0:
                        result = unoconv_result
                        print("✅ unoconv conversion successful")
                    else:
                        print("❌ unoconv conversion also failed")
                except Exception as unoconv_error:
                    # Best-effort fallback: a missing/broken unoconv must not
                    # mask the original LibreOffice error report below.
                    print(f"❌ unoconv conversion error: {unoconv_error}")

            if result.returncode != 0:
                error_analysis = analyze_conversion_error(result.stderr, result.stdout, docx_info)
                error_msg = "❌ Conversion failed with detailed analysis:\n\n"
                error_msg += f"🔍 Error Analysis:\n{error_analysis}\n\n"
                error_msg += "📋 Technical Details:\n"
                error_msg += f"• Return Code: {result.returncode}\n"
                error_msg += f"• LibreOffice Error: {result.stderr[:300]}...\n"
                error_msg += f"• Document Info: Tables={docx_info['has_tables']}, Images={docx_info['has_images']}\n"

                print(f"❌ CONVERSION FAILED: {error_msg}")
                _cleanup_temp_pdf(final_output_path)
                return None, error_msg

            print(f"Looking for PDF files in: {temp_path}")
            all_files = list(temp_path.iterdir())
            print(f"Files in temp directory: {all_files}")

            pdf_files = [f for f in all_files if f.suffix.lower() == '.pdf']
            if not pdf_files:
                _cleanup_temp_pdf(final_output_path)
                return None, f"No PDF file was generated by LibreOffice. Files found: {[f.name for f in all_files]}"

            temp_pdf = pdf_files[0]
            print(f"✅ Found PDF file: {temp_pdf}")

            if not temp_pdf.exists():
                _cleanup_temp_pdf(final_output_path)
                return None, "PDF file was not generated by LibreOffice"

            # Copy the PDF out of the temp dir before it is deleted.
            shutil.copy2(temp_pdf, final_output_path)

            print("🔍 Validating PDF output...")
            pdf_validation = validate_pdf_output(final_output_path, docx_info)

            print("🔧 Post-processing PDF for perfect formatting...")
            post_process_results = post_process_pdf_for_perfect_formatting(final_output_path, docx_info)

            quality_report = generate_comprehensive_quality_report(docx_info, pdf_validation, post_process_results)
            quality_score = calculate_quality_score(docx_info, pdf_validation, post_process_results)

            if quality_score >= 95:
                success_msg = f"🌟 EXCELLENT conversion with {quality_score:.1f}% formatting accuracy!\n\n"
            elif quality_score >= 85:
                success_msg = f"✅ HIGH-QUALITY conversion with {quality_score:.1f}% formatting accuracy!\n\n"
            elif quality_score >= 75:
                success_msg = f"👍 GOOD conversion with {quality_score:.1f}% formatting accuracy!\n\n"
            else:
                success_msg = f"⚠️ Conversion completed with {quality_score:.1f}% accuracy - improvements suggested!\n\n"

            success_msg += quality_report

            if quality_score < 80:
                success_msg += "\n\n💡 TIP: For better results, try simplifying the document structure or removing complex elements before conversion."

            return final_output_path, success_msg

    except subprocess.TimeoutExpired:
        timeout_msg = "⏱️ Conversion timed out - Document is too complex for current processing limits\n\n"
        timeout_msg += "🔍 Timeout Analysis:\n"
        timeout_msg += f"• Document has tables: {docx_info.get('has_tables', 'Unknown')}\n"
        timeout_msg += f"• Document has images: {docx_info.get('has_images', 'Unknown')}\n"
        timeout_msg += f"• Text content length: {docx_info.get('text_content_length', 'Unknown')} characters\n"
        timeout_msg += f"• Font families detected: {len(docx_info.get('font_families', []))}\n\n"
        timeout_msg += "💡 Suggestions:\n"
        timeout_msg += "• Try with a simpler document first\n"
        timeout_msg += "• Remove complex tables or images temporarily\n"
        timeout_msg += "• Split large documents into smaller sections\n"
        timeout_msg += "• Ensure document is not corrupted\n"

        print(f"❌ TIMEOUT ERROR: {timeout_msg}")
        _cleanup_temp_pdf(final_output_path)
        return None, timeout_msg
    except Exception as e:
        exception_msg = "❌ Unexpected error during conversion\n\n"
        exception_msg += "🔍 Error Details:\n"
        exception_msg += f"• Error Type: {type(e).__name__}\n"
        exception_msg += f"• Error Message: {str(e)}\n"

        # docx_info is always bound here (initialized before the try), but
        # the defensive check from the original code path is kept.
        if 'docx_info' in locals():
            exception_msg += "• Document Analysis:\n"
            exception_msg += f"  - Has tables: {docx_info.get('has_tables', 'Unknown')}\n"
            exception_msg += f"  - Has images: {docx_info.get('has_images', 'Unknown')}\n"
            exception_msg += f"  - Content length: {docx_info.get('text_content_length', 'Unknown')}\n"

        exception_msg += "\n💡 Recovery Suggestions:\n"
        exception_msg += "• Verify the DOCX file is not corrupted\n"
        exception_msg += "• Try opening the file in Microsoft Word first\n"
        exception_msg += "• Ensure the file is a valid .docx format\n"
        exception_msg += "• Check file size is reasonable (< 50MB)\n"
        exception_msg += "• Try with a simpler test document\n"

        print(f"❌ EXCEPTION ERROR: {exception_msg}")
        print(f"Full exception details: {repr(e)}")
        _cleanup_temp_pdf(final_output_path)
        return None, exception_msg
|
| |
|
| |
|
| |
|
| |
|
| |
|