# Source: app.py from the Hugging Face Space by arad1367
# Last commit: "Update app.py" (d5041e6, verified) - file size 9.26 kB
import spaces
import gradio as gr
import json
import os
from pathlib import Path
import logging
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import PdfFormatOption
import requests
from urllib.parse import urlparse
from datetime import datetime
import tempfile
from docx import Document
from docx.shared import Inches
import markdown
# Set up logging: configure the root logger at DEBUG level so the
# logger.debug(...) calls made during conversion are emitted, then create the
# module-level logger used throughout this file.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL, normally a string typed by the user.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise (including when the value cannot be parsed at all).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed input such as an unbalanced IPv6 bracket.
        # TypeError / AttributeError: non-string input (e.g. an int).
        # Catching only these keeps KeyboardInterrupt/SystemExit propagating,
        # which the previous bare ``except:`` silently swallowed.
        return False
    return bool(parsed.scheme and parsed.netloc)
def markdown_to_docx(markdown_content):
    """Convert markdown content to DOCX format.

    This is a simple line-by-line translation, not a full markdown parser.
    Each input line becomes exactly one element of the document:

    * ``#`` .. ``######`` followed by a space -> heading of that level
    * ``* `` or ``- `` prefix                  -> bulleted list item
    * ``<digits>. `` prefix (any number)       -> numbered list item
    * any other non-blank line                 -> normal paragraph
    * blank line                               -> empty paragraph

    Args:
        markdown_content: Markdown text as a single string.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    doc = Document()

    for line in markdown_content.split('\n'):
        # Heading: count the leading '#' run and require a space after it.
        hashes = len(line) - len(line.lstrip('#'))
        if 1 <= hashes <= 6 and line[hashes:hashes + 1] == ' ':
            doc.add_heading(line[hashes + 1:], level=hashes)
            continue

        # Bulleted list item.
        if line.startswith(('* ', '- ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            continue

        # Numbered list item: accept any ASCII integer marker, not only "1. ",
        # so the second and later items of a list keep their list styling.
        marker, dot, item_text = line.partition('. ')
        if dot and marker.isascii() and marker.isdigit():
            doc.add_paragraph(item_text, style='List Number')
            continue

        if line.strip():
            # Normal text.
            doc.add_paragraph(line)
        else:
            # Blank line: keep the vertical spacing as an empty paragraph.
            doc.add_paragraph()

    return doc
def create_output_files(content, original_name):
    """Create temporary files for different formats and return their paths.

    Three files are written: the markdown text itself, a JSON wrapper holding
    the title, content and conversion timestamp, and a DOCX rendering produced
    by ``markdown_to_docx``. The files are created with ``delete=False``, so
    they remain on disk after this function returns.

    Args:
        content: Markdown text of the converted document.
        original_name: Name of the source document; stored as the JSON title.

    Returns:
        dict: Paths keyed by ``'markdown'``, ``'json'`` and ``'docx'``.
    """
    files = {}

    # Markdown: write through the temp-file handle itself so it is closed
    # deterministically instead of being abandoned and reopened by name.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False, suffix='.md'
    ) as md_file:
        md_file.write(content)
    files['markdown'] = md_file.name

    # JSON: the markdown plus minimal metadata.
    json_content = {
        "title": original_name,
        "content": content,
        "metadata": {
            "conversion_date": datetime.now().isoformat()
        }
    }
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False, suffix='.json'
    ) as json_file:
        json.dump(json_content, json_file, ensure_ascii=False, indent=2)
    files['json'] = json_file.name

    # DOCX: reserve a path, close the handle, then let the document object
    # write to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as docx_file:
        docx_path = docx_file.name
    markdown_to_docx(content).save(docx_path)
    files['docx'] = docx_path

    return files
@spaces.GPU()
def process_document(input_type, file_input, url_input, use_gpu, table_mode):
    """Convert an uploaded file or a URL to markdown and build download files.

    Args:
        input_type: ``"file"`` or ``"url"``; selects which input is used.
        file_input: Path of the uploaded file (used when input_type is "file").
        url_input: Document URL (used when input_type is "url").
        use_gpu: Accepted from the UI but not read inside this function.
        table_mode: Truthy selects the ACCURATE table-structure mode,
            otherwise FAST is used.

    Returns:
        tuple: ``(markdown_path, json_path, docx_path, markdown_text, status)``.
        On a validation failure or any exception the first four items are
        ``None`` and ``status`` carries the message.
    """
    def _failure(message):
        # All failure paths share the same "four empty outputs + message" shape.
        return None, None, None, None, message

    try:
        logger.debug(f"Processing with input type: {input_type}")
        logger.debug(f"File input: {file_input}")

        # Configure pipeline
        pdf_options = PdfPipelineOptions(do_table_structure=True)
        pdf_options.table_structure_options.mode = (
            TableFormerMode.ACCURATE if table_mode else TableFormerMode.FAST
        )
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
            }
        )

        # Resolve the conversion source and a display name for it
        if input_type == "file":
            if file_input is None:
                return _failure("Please upload a file")
            source, original_name = file_input, Path(file_input).name
        elif input_type == "url":
            if not (url_input and is_valid_url(url_input)):
                return _failure("Please enter a valid URL")
            source = url_input
            # Fall back to a fixed name when the URL path has no final segment
            original_name = Path(urlparse(url_input).path).name or "url_document"
        else:
            return _failure("Invalid input type")

        # Convert document and export it as markdown
        logger.debug(f"Converting document: {source}")
        md_text = converter.convert(source).document.export_to_markdown()

        # Write the downloadable artifacts
        artifacts = create_output_files(md_text, original_name)
        success_note = "Conversion completed successfully! Use the download buttons below to get your files."
        return (
            artifacts['markdown'],
            artifacts['json'],
            artifacts['docx'],
            md_text,
            success_note,
        )
    except Exception as e:
        logger.exception("Error occurred during conversion")
        return _failure(
            f"Error during conversion: {str(e)}\nCheck the console for detailed error logs."
        )
# Create title HTML with custom style and duplicate button CSS.
# The string holds the page header (title, subtitle, "like this Space" note)
# followed by a <style> element that styles elements with the
# "duplicate-button" class. The emoji is written as a real UTF-8 character;
# it previously appeared as mis-decoded bytes ("πŸ™").
title_html = """
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
<h1 style="color: #FFD700; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
<p style="color: #FFA500; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
<p style="color: #87CEEB; font-size: 0.9rem;">Please like this Space if you find it useful! Your support is appreciated 🙏</p>
</div>
<style>
.duplicate-button {
margin: 0.5em auto 1em;
display: block;
background-color: #FFD700 !important;
color: black !important;
border: none !important;
font-weight: bold !important;
}
.duplicate-button:hover {
background-color: #FFA500 !important;
transform: translateY(-2px);
transition: all 0.2s ease;
}
</style>
"""
# Create Gradio interface with custom CSS (the rule hides the page's <footer>
# element; a custom footer is rendered at the bottom of the layout instead).
with gr.Blocks(css="footer {display: none}") as demo:
    gr.HTML(title_html)

    # Add duplicate button at the top
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )

    with gr.Row():
        # Left column: inputs and processing options
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["file", "url"],
                value="file",
                label="Input Type"
            )
            # File input with proper file type handling
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".PDF"],
                type="filepath"
            )
            # URL input
            url_input = gr.Textbox(
                label="Or Enter URL",
                placeholder="https://arxiv.org/pdf/2408.09869"
            )
            # Processing options
            # NOTE(review): process_document receives use_gpu but never reads it.
            use_gpu = gr.Checkbox(label="Use GPU", value=True)
            table_mode = gr.Checkbox(label="Use Accurate Table Mode (Slower but better)", value=False)
            convert_btn = gr.Button("Convert Document", variant="primary")

        # Right column: status, preview and downloads
        with gr.Column(scale=2):
            # Status message
            status_message = gr.Markdown("")
            # Preview area
            preview = gr.Markdown("", label="Preview")
            # Download files
            with gr.Group() as download_group:
                gr.Markdown("### Download Files")
                with gr.Row():
                    markdown_output = gr.File(label="Download Markdown")
                    json_output = gr.File(label="Download JSON")
                    docx_output = gr.File(label="Download DOCX")

    # Define the main conversion event. The order of `outputs` matches the
    # 5-tuple returned by process_document.
    convert_btn.click(
        fn=process_document,
        inputs=[input_type, file_input, url_input, use_gpu, table_mode],
        outputs=[markdown_output, json_output, docx_output, preview, status_message]
    )

    # Updated footer with better visibility. The heart emoji is written as a
    # real UTF-8 character; it previously appeared as mis-decoded bytes.
    footer = """
<div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #FFD700; max-width: 800px;">
<div style="margin-bottom: 1rem;">
<a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">LinkedIn</a> |
<a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">GitHub</a> |
<a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">PhD Defense Demo</a> |
<a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">Docling Project</a>
</div>
<p style="color: #FFA500; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
</div>
"""
    gr.HTML(footer)
# Launch the app
if __name__ == "__main__":
    # Bound the request queue to 5 pending jobs. The former `timeout=120`
    # keyword was dropped: Blocks.queue() defines no `timeout` parameter, so
    # passing it raises TypeError before the server can start.
    demo.queue(max_size=5)
    demo.launch(
        show_error=True,
        share=False,
        debug=True,
        show_api=False,
        server_name="0.0.0.0",
    )