Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / app.py

hellorahulk

Add URL input support for document processing

ec3f76a 5 months ago

raw

history blame contribute delete

8 kB

	import os
	import gradio as gr
	import pandas as pd
	from dockling_parser import DocumentParser
	from dockling_parser.exceptions import ParserError, UnsupportedFormatError
	import tempfile
	import mimetypes
	import traceback
	import requests
	from urllib.parse import urlparse

	# Heading text: passed as the Blocks page title and rendered as the top-level Markdown heading.
	TITLE = "📄 Smart Document Parser"
	# Short introduction rendered with gr.Markdown directly below the heading.
	DESCRIPTION = """
	A powerful document parsing application that automatically extracts structured information from various document formats.
	Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
	"""

	# Feature list rendered with gr.Markdown as the last element of the interface.
	ARTICLE = """
	## 🚀 Features

	- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
	- Support for File Upload and URLs
	- Rich Information Extraction
	- Smart Processing with Confidence Scoring
	- Automatic Format Detection

	Made with ❤️ using Docling and Gradio
	"""

	def _canned_error(title, detail):
	    """Build the five-value output tuple for a failure detected before parsing.

	    The tuple order matches the outputs wired to the submit button: content,
	    metadata, sections, entities, confidence.

	    The metadata slot feeds a ``gr.Dataframe``.  Gradio documents a plain
	    string passed to that component as a path to a CSV file, so the detail
	    text is wrapped in a one-row DataFrame (the same ``Property``/``Value``
	    shape used for the error tables in ``process_input``) rather than being
	    passed as a raw string.

	    Args:
	        title: Short headline shown in the content box.
	        detail: Longer hint shown in the metadata table.

	    Returns:
	        A 5-tuple ``(content, metadata_df, sections, entities, confidence)``.
	    """
	    return (
	        title,
	        pd.DataFrame([{"Property": "Error", "Value": detail}]),
	        "No sections available",
	        "No entities available",
	        "Confidence Score: 0.0",
	    )


	# Canned UI responses keyed by failure kind.  Each value is a complete
	# 5-tuple that can be returned directly from the submit callback.
	ERROR_MESSAGES = {
	    "no_input": _canned_error(
	        "⚠️ No input provided",
	        "Please upload a document or provide a URL.",
	    ),
	    "invalid_url": _canned_error(
	        "⚠️ Invalid URL",
	        "Please provide a valid URL to a document.",
	    ),
	    "download_error": _canned_error(
	        "⚠️ Failed to download document",
	        "Could not download the document from the provided URL.",
	    ),
	    "unsupported_format": _canned_error(
	        "⚠️ Unsupported file format",
	        "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
	    ),
	    "processing_error": _canned_error(
	        "⚠️ Error processing document",
	        "An error occurred while processing the document. Please try again with a different file.",
	    ),
	}

	# Initialize the document parser
	# A single module-level instance; process_input calls parser.parse() on it for each request.
	parser = DocumentParser()

	def download_file(url: str, timeout: float = 30.0) -> str:
	    """Download a document from ``url`` into a temporary file.

	    The temporary file keeps the extension found in the URL path (``.pdf``
	    when the path has no file name).  It is created with ``delete=False``,
	    so the caller owns the returned path and must remove it when done.

	    Args:
	        url: Absolute ``http``/``https`` URL of the document.
	        timeout: Value passed to ``requests.get`` as its ``timeout`` so a
	            stalled server cannot block the request indefinitely.

	    Returns:
	        Path of the temporary file holding the downloaded bytes.

	    Raises:
	        Exception: On any failure (malformed URL, network error, HTTP error
	            status, write error).  The message starts with
	            ``"Failed to download file: "`` and the original error is
	            attached as ``__cause__``.
	    """
	    tmp_path = None
	    try:
	        # Reject anything that is not a plain web URL before touching the network.
	        parsed_url = urlparse(url)
	        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
	            raise ValueError(f"Unsupported or malformed URL: {url!r}")

	        # Extract filename from URL
	        filename = os.path.basename(parsed_url.path)
	        if not filename:
	            filename = "document.pdf"  # Default filename
	        suffix = os.path.splitext(filename)[1]

	        # Stream the body to disk in chunks instead of holding it all in memory.
	        with requests.get(url, allow_redirects=True, timeout=timeout, stream=True) as response:
	            response.raise_for_status()
	            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
	                tmp_path = tmp_file.name
	                for chunk in response.iter_content(chunk_size=64 * 1024):
	                    tmp_file.write(chunk)
	        return tmp_path

	    except Exception as e:
	        # Do not leave a partially written file behind when the download fails.
	        if tmp_path and os.path.exists(tmp_path):
	            try:
	                os.unlink(tmp_path)
	            except OSError:
	                pass
	        raise Exception(f"Failed to download file: {str(e)}") from e

	def _failure_outputs(error_msg):
	    """Return the five UI outputs for a failure, repeating the message in the metadata table."""
	    return (
	        error_msg,
	        pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
	        "No sections available",
	        "No entities available",
	        "Confidence Score: 0.0",
	    )


	def _success_outputs(result):
	    """Turn a parse result into the five UI outputs (content, metadata, sections, entities, confidence)."""
	    metadata_df = pd.DataFrame(
	        [{"Property": k, "Value": str(v)} for k, v in result.metadata.dict().items()]
	    )

	    # Extract structured content
	    sections = result.structured_content.get('sections', [])
	    sections_text = "\n\n".join(
	        f"Section {i+1}:\n{section}" for i, section in enumerate(sections)
	    )

	    # Format entities if available; values are stringified so a non-string
	    # entity cannot break the join.
	    entities = result.structured_content.get('entities', {})
	    if entities:
	        entities_text = "\n".join(
	            f"{entity_type}: {', '.join(map(str, entities_list))}"
	            for entity_type, entities_list in entities.items()
	        )
	    else:
	        entities_text = "No entities detected"

	    return (
	        result.content,  # Main content
	        metadata_df,  # Metadata as table
	        sections_text,  # Structured sections
	        entities_text,  # Named entities
	        f"Confidence Score: {result.confidence_score:.2f}",  # Confidence score
	    )


	def process_input(file_input, url_input):
	    """Process either an uploaded file or a URL and return the five UI outputs.

	    A non-blank URL takes precedence over an uploaded file.  The URL is
	    downloaded to a temporary file, parsed, and the temporary file is removed
	    afterwards regardless of the outcome.

	    Args:
	        file_input: Value of the upload component, or ``None`` when nothing
	            was uploaded; it is passed to ``parser.parse`` unchanged.
	        url_input: Text of the URL box; blank or whitespace-only counts as
	            "no URL".

	    Returns:
	        A 5-tuple ``(content, metadata, sections, entities, confidence)``.
	        Failures are reported through the same tuple shape; this function
	        does not raise.
	    """
	    url = url_input.strip() if isinstance(url_input, str) else url_input

	    # Check if we have any input
	    if file_input is None and not url:
	        return ERROR_MESSAGES["no_input"]

	    temp_file = None
	    try:
	        if url:
	            # Handle URL input if provided
	            parsed = urlparse(url)
	            if parsed.scheme not in ("http", "https") or not parsed.netloc:
	                return ERROR_MESSAGES["invalid_url"]
	            # Only the download is reported as a download error; parse
	            # failures fall through to the parser-specific handlers below.
	            try:
	                temp_file = download_file(url)
	            except Exception:
	                return ERROR_MESSAGES["download_error"]
	            source = temp_file
	        else:
	            # Handle file upload
	            source = file_input

	        return _success_outputs(parser.parse(source))

	    except (UnsupportedFormatError, ParserError) as e:
	        return _failure_outputs(f"⚠️ {str(e)}")
	    except Exception as e:
	        return _failure_outputs(f"⚠️ Unexpected error: {str(e)}\n{traceback.format_exc()}")
	    finally:
	        # Cleanup temporary file if it was created; best effort, a failed
	        # delete must not mask the result being returned.
	        if temp_file and os.path.exists(temp_file):
	            try:
	                os.unlink(temp_file)
	            except OSError:
	                pass

	# Create Gradio interface
	# Declares, in order: heading and description, the inputs (file upload, URL box,
	# submit button), a confidence readout, four result tabs, the click wiring, and
	# static help text.
	# NOTE(review): the nesting of the `with` blocks is not visible in this listing
	# (indentation was lost) — confirm the layout hierarchy against the original file.
	with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
	gr.Markdown(f"# {TITLE}")
	gr.Markdown(DESCRIPTION)

	with gr.Row():
	with gr.Column():
	# Upload widget limited to the listed extensions.
	# NOTE(review): type="filepath" presumably hands process_input a path string — confirm in Gradio docs.
	file_input = gr.File(
	label="Upload Document",
	file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
	type="filepath"
	)
	# Alternative input: URL of a document to download.
	url_input = gr.Textbox(
	label="Or Enter Document URL",
	placeholder="https://example.com/document.pdf"
	)
	submit_btn = gr.Button("Process Document", variant="primary")

	with gr.Column():
	# Receives the "Confidence Score: ..." string (fifth value returned by process_input).
	confidence = gr.Textbox(label="Processing Confidence")

	with gr.Tabs():
	with gr.TabItem("📝 Content"):
	# First returned value: the extracted text (or an error headline).
	content_output = gr.Textbox(
	label="Extracted Content",
	lines=10,
	max_lines=30
	)

	with gr.TabItem("📊 Metadata"):
	# Second returned value: a two-column Property/Value table.
	metadata_output = gr.Dataframe(
	label="Document Metadata",
	headers=["Property", "Value"]
	)

	with gr.TabItem("📑 Sections"):
	# Third returned value: the joined "Section N:" text.
	sections_output = gr.Textbox(
	label="Document Sections",
	lines=10,
	max_lines=30
	)

	with gr.TabItem("🏷️ Entities"):
	# Fourth returned value: one "type: a, b, c" line per entity type.
	entities_output = gr.Textbox(
	label="Named Entities",
	lines=5,
	max_lines=15
	)

	# Handle file submission
	# The order of `outputs` must match the order of the 5-tuple returned by
	# process_input: content, metadata, sections, entities, confidence.
	submit_btn.click(
	fn=process_input,
	inputs=[file_input, url_input],
	outputs=[
	content_output,
	metadata_output,
	sections_output,
	entities_output,
	confidence
	]
	)

	# Static help text: accepted formats and example URLs.
	gr.Markdown("""
	### 📌 Supported Formats
	- PDF Documents (*.pdf)
	- Word Documents (*.docx)
	- Text Files (*.txt)
	- HTML Files (*.html)
	- Markdown Files (*.md)

	### 🔗 Example URLs
	- ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
	- Research Papers
	- Documentation
	""")

	# Feature list defined in ARTICLE.
	gr.Markdown(ARTICLE)

	# Launch the app
	# Guarded so the server starts only when this file is run as a script, not when it is imported.
	if __name__ == "__main__":
	iface.launch()