Spaces:

abhinavsarkar
/

StructuredPDFParser

Running

App Files Files Community

StructuredPDFParser / app.py

abhinavsarkar

Create app.py

77f71a3 verified 26 days ago

raw

history blame contribute delete

4.27 kB

	import streamlit as st
	import pymupdf4llm
	import tempfile
	import pathlib
	import markdown2
	from docx import Document
	from bs4 import BeautifulSoup

	def pdf_to_markdown(pdf_file):
	    """Convert an uploaded PDF into Markdown text and save it as ``Output.md``.

	    Args:
	        pdf_file: A binary file-like object exposing ``read()`` (for example
	            the object returned by ``st.file_uploader``) holding the PDF bytes.

	    Returns:
	        A ``(pdf_text, md_file_path)`` tuple: the Markdown string returned by
	        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the
	        ``Output.md`` file (in the current working directory) it was written to.

	    Raises:
	        Whatever ``pymupdf4llm.to_markdown`` or the file writes raise; the
	        temporary PDF is removed in every case.
	    """
	    # pymupdf4llm is handed a filesystem path, so the upload is spooled to a
	    # named temporary file. delete=False lets the file be closed (and then
	    # reopened by the converter) without vanishing; we remove it ourselves.
	    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	    temp_file_path = temp_file.name
	    try:
	        with temp_file:
	            temp_file.write(pdf_file.read())

	        # Convert PDF to Markdown
	        pdf_text = pymupdf4llm.to_markdown(temp_file_path)
	    finally:
	        # Previously the temporary PDF was never deleted, leaking one file per
	        # upload. Cleanup is best-effort so it cannot mask a conversion error.
	        try:
	            pathlib.Path(temp_file_path).unlink()
	        except OSError:
	            pass

	    # Save the Markdown content to a file (UTF-8, the default for str.encode)
	    md_file_path = pathlib.Path("Output.md")
	    md_file_path.write_bytes(pdf_text.encode())

	    return pdf_text, md_file_path

	def create_docx_from_markdown(md_content):
	    """Render Markdown text into a Word document saved as ``Output.docx``.

	    The Markdown is converted to HTML with ``markdown2`` and the top-level HTML
	    elements are mapped onto python-docx paragraphs:

	    * ``h1``-``h6``  -> headings of the matching level
	    * ``p``          -> plain paragraph (inline formatting is flattened to text)
	    * ``ul`` / ``ol`` -> one "List Bullet" / "List Number" paragraph per item
	    * ``strong``     -> paragraph in the "Intense Quote" style
	    * ``em``         -> paragraph with a single italic run

	    Other top-level elements (tables, code blocks, bare text nodes, ...) are
	    skipped.

	    Args:
	        md_content: Markdown source text.

	    Returns:
	        ``pathlib.Path`` of the written ``Output.docx`` file (in the current
	        working directory).
	    """
	    # Convert Markdown to HTML
	    html_content = markdown2.markdown(md_content)

	    # Create a new Document with a fixed title heading
	    doc = Document()
	    doc.add_heading('Converted PDF Content', level=1)

	    # Use BeautifulSoup to parse the HTML and walk its top-level nodes
	    soup = BeautifulSoup(html_content, "html.parser")

	    # h1..h6 -> heading level; the original handled only h1-h3 and silently
	    # dropped deeper headings.
	    heading_levels = {f"h{level}": level for level in range(1, 7)}
	    # Built-in paragraph styles of the default python-docx template.
	    list_styles = {"ul": "List Bullet", "ol": "List Number"}

	    for element in soup:
	        tag = element.name  # None for bare text nodes between elements
	        if tag in heading_levels:
	            doc.add_heading(element.get_text(), level=heading_levels[tag])
	        elif tag == 'p':
	            doc.add_paragraph(element.get_text())
	        elif tag in list_styles:
	            # Only direct <li> children, so nested lists are not emitted twice.
	            for item in element.find_all("li", recursive=False):
	                doc.add_paragraph(item.get_text().strip(), style=list_styles[tag])
	        elif tag == 'strong':
	            # Styles are looked up by name ("Intense Quote"); the id form
	            # "IntenseQuote" only works through a deprecated fallback.
	            doc.add_paragraph(element.get_text(), style='Intense Quote')
	        elif tag == 'em':
	            doc.add_paragraph().add_run(element.get_text()).italic = True

	    # Save the document
	    docx_file_path = pathlib.Path("Output.docx")
	    doc.save(docx_file_path)

	    return docx_file_path

	# Streamlit application: upload a PDF, preview the extracted Markdown, and
	# offer the result as Markdown and Word downloads.
	st.title("📄 Structured PDF Data Extractor")
	st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

	# File uploader for PDF
	pdf_input = st.file_uploader("Upload PDF", type="pdf")

	if pdf_input is not None:
	    # Convert to Markdown when the PDF is uploaded
	    with st.spinner("Converting PDF to Markdown..."):
	        # Top-level UI boundary: any conversion failure is reported to the
	        # user instead of crashing the script run.
	        try:
	            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

	            # Display the Markdown content. The text comes from an untrusted
	            # uploaded PDF, so raw HTML is NOT enabled here: with
	            # unsafe_allow_html=True any markup embedded in the document
	            # would be injected into the page.
	            st.markdown("### Markdown Content Preview:")
	            st.markdown(pdf_text)

	            # Create a download button for the Markdown file
	            st.markdown("### Download Markdown File:")
	            st.download_button(
	                label="Download Markdown",
	                data=md_file_path.read_bytes(),
	                file_name=md_file_path.name,
	                mime="text/markdown",
	            )

	            # Create the .docx file from rendered Markdown content
	            docx_file_path = create_docx_from_markdown(pdf_text)

	            # Create a download button for the .docx file
	            st.markdown("### Download Word Document:")
	            st.download_button(
	                label="Download Word Document",
	                data=docx_file_path.read_bytes(),
	                file_name=docx_file_path.name,
	                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            )

	        except Exception as e:
	            st.error(f"An error occurred during conversion: {e}")

	# Add some styling to make it visually appealing.
	# The app only renders st.download_button widgets, whose container class is
	# "stDownloadButton" rather than "stButton", so both are targeted; the
	# original rules matched no button on the page.
	st.markdown(
	    """
	    <style>
	    body {
	        font-family: 'Arial', sans-serif;
	        line-height: 1.6;
	        font-size: 16px;
	        color: #333;
	    }
	    .stButton>button,
	    .stDownloadButton>button {
	        background-color: #4CAF50; /* Green */
	        border: none;
	        color: white;
	        padding: 10px 20px;
	        text-align: center;
	        text-decoration: none;
	        display: inline-block;
	        font-size: 16px;
	        margin: 4px 2px;
	        cursor: pointer;
	        border-radius: 5px;
	        transition: background-color 0.3s;
	    }
	    .stButton>button:hover,
	    .stDownloadButton>button:hover {
	        background-color: #45a049; /* Darker green */
	    }
	    </style>
	    """,
	    unsafe_allow_html=True
	)