docling_free / app.py
hellorahulk's picture
Add URL input support for document processing
ec3f76a
import os
import gradio as gr
import pandas as pd
from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError, UnsupportedFormatError
import tempfile
import mimetypes
import traceback
import requests
from urllib.parse import urlparse
# User-facing copy for the Gradio page. The emoji below were stored as
# mis-encoded byte sequences ("πŸ“„", "πŸš€", "❀️"); they are restored to the
# intended code points here.

# Page title; also rendered as the top-level Markdown heading.
TITLE = "📄 Smart Document Parser"

# Introductory Markdown shown directly below the title.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
"""

# Closing Markdown rendered at the bottom of the page. The blank line before
# "Made with" keeps that sentence from being absorbed into the last bullet as
# a Markdown lazy-continuation line.
ARTICLE = """
## 🚀 Features
- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Support for File Upload and URLs
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""
# Placeholder strings for the sections, entities and confidence slots; every
# canned error ends with these same three values.
_EMPTY_RESULT_TAIL = (
    "No sections available",
    "No entities available",
    "Confidence Score: 0.0",
)

# Canned 5-tuples keyed by failure kind. Each value has the shape
# (headline, explanation, sections text, entities text, confidence text);
# process_input() returns entries of this table directly.
ERROR_MESSAGES = {
    kind: (headline, explanation) + _EMPTY_RESULT_TAIL
    for kind, headline, explanation in (
        (
            "no_input",
            "⚠️ No input provided",
            "Please upload a document or provide a URL.",
        ),
        (
            "invalid_url",
            "⚠️ Invalid URL",
            "Please provide a valid URL to a document.",
        ),
        (
            "download_error",
            "⚠️ Failed to download document",
            "Could not download the document from the provided URL.",
        ),
        (
            "unsupported_format",
            "⚠️ Unsupported file format",
            "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
        ),
        (
            "processing_error",
            "⚠️ Error processing document",
            "An error occurred while processing the document. Please try again with a different file.",
        ),
    )
}
# Initialize the document parser.
# A single module-level instance is created at import time; process_input()
# calls its parse() method for both uploaded files and downloaded URLs.
parser = DocumentParser()
def _suffix_from_content_type(content_type):
    """Map a Content-Type header value to a file suffix, or "" if unknown."""
    if not content_type:
        return ""
    # Drop parameters such as "; charset=utf-8" before the lookup.
    mime_type = content_type.split(";", 1)[0].strip().lower()
    # The generic binary type says nothing about the real document format.
    if mime_type in ("", "application/octet-stream"):
        return ""
    return mimetypes.guess_extension(mime_type) or ""


def download_file(url: str, timeout: float = 30.0) -> str:
    """Download the document at ``url`` into a temporary file.

    The temporary file keeps the extension found in the URL path. When the
    path has no extension, the suffix is derived from the response's
    Content-Type header; if that is not informative either, the original
    fallback applies (".pdf" when the URL has no filename at all, otherwise
    no suffix).

    Args:
        url: Absolute http(s) URL of the document.
        timeout: Seconds to wait for the server (connect and each read)
            before giving up. Without it a stalled server would block the
            request indefinitely.

    Returns:
        Path of the temporary file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: For any failure (malformed URL, network or HTTP error,
            write error). The message starts with "Failed to download file: "
            and the underlying error is chained as ``__cause__``.
    """
    tmp_path = None
    try:
        parsed_url = urlparse(url)
        # The URL is user-supplied: only fetch plain web URLs.
        # NOTE(review): hosts on private networks are not filtered here.
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            raise ValueError(f"unsupported or malformed URL: {url!r}")

        # Extract the extension from the URL path, if it has one.
        basename = os.path.basename(parsed_url.path)
        suffix = os.path.splitext(basename)[1]

        # Stream the body so large documents are not held in memory at once.
        with requests.get(
            url, allow_redirects=True, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()

            if not suffix:
                suffix = _suffix_from_content_type(
                    response.headers.get("Content-Type")
                ) or ("" if basename else ".pdf")

            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_path = tmp_file.name
                for chunk in response.iter_content(chunk_size=65536):
                    tmp_file.write(chunk)
        return tmp_path
    except Exception as e:
        # Do not leave a partially written file behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise Exception(f"Failed to download file: {str(e)}") from e
def _error_table(message):
    """Build the one-row Property/Value table shown in the metadata output."""
    return pd.DataFrame([{"Property": "Error", "Value": message}])


def _error_outputs(message):
    """Build the 5-tuple of outputs for an error described by ``message``."""
    return (
        message,
        _error_table(message),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def _canned_error(key):
    """Return ERROR_MESSAGES[key] with its metadata slot as a DataFrame."""
    headline, details, *rest = ERROR_MESSAGES[key]
    # The second output feeds a gr.Dataframe. A bare string is not shown as a
    # message there (Gradio may treat it as a CSV path), so wrap it the same
    # way the exception handlers do.
    if isinstance(details, str):
        details = _error_table(details)
    return (headline, details, *rest)


def _is_http_url(url):
    """Tell whether ``url`` is an absolute http(s) URL with a host."""
    parts = urlparse(url)
    return parts.scheme in ("http", "https") and bool(parts.netloc)


def _format_result(result):
    """Turn a parser result into the 5-tuple of UI outputs."""
    metadata_df = pd.DataFrame([
        {"Property": k, "Value": str(v)}
        for k, v in result.metadata.dict().items()
    ])

    # Extract structured content
    sections = result.structured_content.get('sections', [])
    sections_text = "\n\n".join(
        f"Section {i+1}:\n{section}" for i, section in enumerate(sections)
    )

    # Format entities if available; str() guards against non-string items.
    entities = result.structured_content.get('entities', {})
    if entities:
        entities_text = "\n".join(
            f"{entity_type}: {', '.join(map(str, entities_list))}"
            for entity_type, entities_list in entities.items()
        )
    else:
        entities_text = "No entities detected"

    return (
        result.content,  # Main content
        metadata_df,  # Metadata as table
        sections_text,  # Structured sections
        entities_text,  # Named entities
        f"Confidence Score: {result.confidence_score:.2f}",  # Confidence score
    )


def process_input(file_input, url_input):
    """Process either an uploaded file or a document URL.

    When both inputs are given, the URL takes precedence.

    Args:
        file_input: Path of the uploaded file, or None when nothing was
            uploaded.
        url_input: URL typed by the user; empty, whitespace-only or None
            means "no URL".

    Returns:
        A 5-tuple (content, metadata table, sections text, entities text,
        confidence text), in the order of the outputs wired to the submit
        button. Failures are reported through the same tuple shape and are
        never raised.
    """
    url = (url_input or "").strip()

    # Check if we have any input
    if file_input is None and not url:
        return _canned_error("no_input")
    if url and not _is_http_url(url):
        return _canned_error("invalid_url")

    temp_file = None
    try:
        if url:
            # Only the download is reported as a download error; parsing
            # failures fall through to the parser-specific handlers below.
            try:
                temp_file = download_file(url)
            except Exception:
                return _canned_error("download_error")
            source_path = temp_file
        else:
            source_path = file_input

        return _format_result(parser.parse(source_path))
    except (UnsupportedFormatError, ParserError) as e:
        return _error_outputs(f"⚠️ {str(e)}")
    except Exception as e:
        return _error_outputs(
            f"⚠️ Unexpected error: {str(e)}\n{traceback.format_exc()}"
        )
    finally:
        # Cleanup temporary file if it was created (best effort).
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError:
                pass
# Create Gradio interface
# NOTE(review): the nesting of the layout containers below was reconstructed,
# because the indentation of this file was lost - confirm against the intended
# page layout. The emoji in the labels were stored as mis-encoded bytes and
# are restored here; the "Content" tab glyph had lost a byte, so its emoji is
# a best guess.
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Input side: a file upload or a URL, plus the submit button.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath"
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf"
            )
            submit_btn = gr.Button("Process Document", variant="primary")

        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    # One tab per remaining process_input() output.
    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )
        with gr.TabItem("📊 Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )
        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )
        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )

    # Handle file submission. The order of `outputs` must match the order of
    # the tuple returned by process_input().
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, url_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )

    gr.Markdown("""
### 📌 Supported Formats
- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)
### 🔗 Example URLs
- ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
- Research Papers
- Documentation
""")
    gr.Markdown(ARTICLE)

# Launch the app
if __name__ == "__main__":
    iface.launch()