Spaces:
Running
Running
import io
import os
import shutil
import subprocess
import tempfile
import time

import fitz  # PyMuPDF
import gradio as gr
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, RGBColor, Inches
from PIL import Image
# Install required system dependencies | |
def install_dependencies(): | |
subprocess.run(["apt-get", "update"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | |
subprocess.run(["apt-get", "install", "-y", "poppler-utils", "libreoffice"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | |
# Executed at import time so the system packages are requested before the
# UI below is built.
install_dependencies()
# PyMuPDF span "flags" bits.  Bit 2 means "serifed", not underline, so it
# must not be mapped onto a Word underline.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4

# Image modes Pillow can write as PNG without conversion.
_PNG_SAFE_MODES = ("1", "L", "LA", "I", "P", "RGB", "RGBA")


def _docx_output_path(filename):
    """Return ``<stem>_converted.docx`` for *filename*, ignoring extension case."""
    stem, _extension = os.path.splitext(filename)
    return f"{stem}_converted.docx"


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert with pdf2docx, always closing the converter."""
    from pdf2docx import Converter

    converter = Converter(pdf_path)
    try:
        converter.convert(docx_path, start=0, end=None, keep_layout=True)
    finally:
        converter.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert with headless LibreOffice via a private scratch directory."""
    # A per-call directory avoids name clashes between concurrent requests.
    with tempfile.TemporaryDirectory() as out_dir:
        # NOTE(review): PDF import into Writer may additionally need
        # --infilter=writer_pdf_import; confirm against the deployed
        # LibreOffice version.
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "docx",
             "--outdir", out_dir, pdf_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        produced = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(produced):
            raise RuntimeError("LibreOffice conversion failed")
        # shutil.move also works when the target is on another filesystem.
        shutil.move(produced, docx_path)


def _style_run(run, span):
    """Copy font name, size, colour, bold and italic from a PDF span to a run."""
    run.font.name = span["font"]
    run.font.size = Pt(span["size"])
    if "color" in span:
        color = span["color"]
        run.font.color.rgb = RGBColor(
            (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF
        )
    flags = span["flags"]
    run.font.bold = bool(flags & _SPAN_FLAG_BOLD)
    run.font.italic = bool(flags & _SPAN_FLAG_ITALIC)


def _apply_span_styles(doc, pdf_doc):
    """Restyle every Word run whose text contains a PDF span's stripped text."""
    # Styling never changes text, so paragraph and run text are read once
    # instead of being recomputed for every span.
    paragraphs = [
        (paragraph.text, [(run, run.text) for run in paragraph.runs])
        for paragraph in doc.paragraphs
    ]
    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" key carry no text spans.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    needle = span["text"].strip()
                    if not needle:
                        continue
                    for paragraph_text, runs in paragraphs:
                        if needle not in paragraph_text:
                            continue
                        for run, run_text in runs:
                            if needle in run_text:
                                _style_run(run, span)


def _append_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of *doc*."""
    for page in pdf_doc:
        for image_info in page.get_images(full=True):
            extracted = pdf_doc.extract_image(image_info[0])
            if not extracted:
                continue
            image = Image.open(io.BytesIO(extracted["image"]))
            # PNG cannot store e.g. CMYK data; convert instead of failing.
            if image.mode not in _PNG_SAFE_MODES:
                image = image.convert("RGB")
            # Encode in memory: no fixed-name temp files to collide or leak.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            buffer.seek(0)
            # One pixel is treated as one point (1/72 inch), as before.
            doc.add_picture(
                buffer,
                width=Inches(image.width / 72),
                height=Inches(image.height / 72),
            )


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a .docx file and return the path of the result.

    The conversion is attempted with pdf2docx first; if that raises, headless
    LibreOffice is used instead.  Afterwards a best-effort pass copies font
    name, size, colour, bold and italic from the PDF text spans onto matching
    Word runs and appends the PDF's images to the document.  If that pass
    fails, the unenhanced conversion result is kept.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.
        filename: Name used to derive the output path; its extension is
            replaced by ``_converted.docx``.

    Returns:
        Path of the written .docx file.

    Raises:
        Exception: If both conversion methods fail; the message starts with
            ``"Conversion failed: "`` and the original error is chained.
    """
    try:
        pdf_path = pdf_file.name
        # splitext (not str.replace) so "X.PDF" no longer maps onto the
        # input file itself and ".pdf" inside directory names is untouched.
        docx_path = _docx_output_path(filename)

        try:
            _convert_with_pdf2docx(pdf_path, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhancement is optional: on any failure keep the basic conversion.
        try:
            doc = Document(docx_path)
            pdf_doc = fitz.open(pdf_path)
            try:
                _apply_span_styles(doc, pdf_doc)
                _append_images(doc, pdf_doc)
            finally:
                pdf_doc.close()
            doc.save(docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path
    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e
def process_pdf(file):
    """Validate the uploaded file, convert it, and report the elapsed time.

    Args:
        file: Uploaded file object exposing a ``name`` attribute; falsy when
            nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple.

    Raises:
        gr.Error: If no file was uploaded, the name does not end in ``.pdf``
            (case-insensitive), or the conversion raises.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")
    if not file.name.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file")

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    try:
        output_path = convert_pdf_to_word(file, file.name)
    except Exception as e:
        message = str(e)
        # The converter already prefixes its errors; avoid showing
        # "Conversion failed: Conversion failed: ..." to the user.
        if not message.startswith("Conversion failed"):
            message = f"Conversion failed: {message}"
        raise gr.Error(message) from e
    conversion_time = time.perf_counter() - started
    return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"
# Build the Gradio UI: upload and convert button on the left, status and
# download on the right.
# NOTE(review): the Row/Column nesting is reconstructed; the extracted source
# carried no indentation, so confirm it against the deployed layout.
with gr.Blocks(title="PDF to Word Converter") as demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")

    # process_pdf returns (path, message), matching the order of the outputs.
    convert_btn.click(process_pdf, file_input, [file_output, status])

if __name__ == "__main__":
    demo.launch()