# Commit 77f71a3 (verified) by abhinavsarkar: "Create app.py"
import streamlit as st
import pymupdf4llm
import tempfile
import pathlib
import markdown2
from docx import Document
from bs4 import BeautifulSoup
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save the result as ``Output.md``.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the PDF bytes
            (for example the object returned by ``st.file_uploader``).

    Returns:
        A ``(markdown_text, markdown_path)`` tuple: the Markdown string returned
        by ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the file it
        was written to (``Output.md`` in the current working directory).
    """
    temp_file_path = None
    try:
        # pymupdf4llm is handed a filesystem path, so spool the upload to disk.
        # delete=False keeps the file on disk after the handle is closed so it
        # can be reopened by name; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(pdf_file.read())

        # Convert PDF to Markdown (the temp file is closed and flushed by now)
        pdf_text = pymupdf4llm.to_markdown(temp_file_path)
    finally:
        # Remove the temporary copy whether or not the conversion succeeded,
        # otherwise every upload leaves a stray PDF in the temp directory.
        if temp_file_path is not None:
            try:
                pathlib.Path(temp_file_path).unlink()
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # conversion result or the original exception.
                pass

    # Save the Markdown content to a file (bytes are written as-is, so line
    # endings are not translated by the platform).
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode("utf-8"))
    return pdf_text, md_file_path
def create_docx_from_markdown(md_content):
    """Render Markdown text into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2``; each top-level HTML
    element is then mapped onto the document:

    * ``h1``-``h6``  -> heading of the matching level
    * ``p``          -> plain paragraph
    * ``ul`` / ``ol`` -> one "List Bullet" / "List Number" paragraph per item
    * ``strong``     -> paragraph in the "Intense Quote" style
    * ``em``         -> paragraph containing a single italic run
    * anything else with non-blank text (``pre``, ``blockquote``, ``table``,
      ...) -> plain paragraph, so no textual content is silently dropped

    Inline formatting inside an element is flattened to plain text.

    Args:
        md_content: Markdown source as a string.

    Returns:
        ``pathlib.Path`` of the saved document (``Output.docx`` in the current
        working directory).
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document with a fixed title heading
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Use BeautifulSoup to parse the HTML and extract text
    soup = BeautifulSoup(html_content, "html.parser")

    heading_levels = {f"h{level}": level for level in range(1, 7)}
    list_styles = {"ul": "List Bullet", "ol": "List Number"}

    # Iterating the soup yields only its top-level nodes.
    for element in soup:
        tag = getattr(element, "name", None)
        if tag is None:
            # Bare text nodes between block elements (typically newlines).
            continue

        if tag in heading_levels:
            doc.add_heading(element.get_text(), level=heading_levels[tag])
        elif tag == 'p':
            doc.add_paragraph(element.get_text())
        elif tag in list_styles:
            # Only direct <li> children; nested lists are flattened into the
            # text of their parent item.
            for item in element.find_all('li', recursive=False):
                doc.add_paragraph(item.get_text().strip(), style=list_styles[tag])
        elif tag == 'strong':
            # Use the style's UI name; looking styles up by id ('IntenseQuote')
            # is deprecated in python-docx.
            doc.add_paragraph(element.get_text(), style='Intense Quote')
        elif tag == 'em':
            p = doc.add_paragraph()
            p.add_run(element.get_text()).italic = True
        else:
            # Fallback so unrecognised block elements still contribute text.
            text = element.get_text().strip()
            if text:
                doc.add_paragraph(text)

    # Save the document
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)
    return docx_file_path
# Streamlit application: page header, PDF upload, preview and downloads.
# Streamlit renders widgets in call order, so statement order here is the
# on-page order.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

MARKDOWN_MIME = "text/markdown"
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# File uploader for PDF (None until the user has picked a file)
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # The spinner stays visible for the whole convert / preview / export pass.
    with st.spinner("Converting PDF to Markdown..."):
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Preview of the converted Markdown
            st.markdown("### Markdown Content Preview:", unsafe_allow_html=True)
            st.markdown(pdf_text, unsafe_allow_html=True)

            # Download button for the Markdown file
            st.markdown("### Download Markdown File:")
            with open(md_file_path, "rb") as md_handle:
                st.download_button(
                    label="Download Markdown",
                    data=md_handle,
                    file_name=md_file_path.name,
                    mime=MARKDOWN_MIME,
                )

            # Build the .docx from the same Markdown text
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Download button for the .docx file
            st.markdown("### Download Word Document:")
            with open(docx_file_path, "rb") as docx_handle:
                st.download_button(
                    label="Download Word Document",
                    data=docx_handle,
                    file_name=docx_file_path.name,
                    mime=DOCX_MIME,
                )
        except Exception as err:
            # Top-level UI boundary: report any failure on the page instead of
            # letting the script crash.
            st.error(f"An error occurred during conversion: {err}")
# Add some styling to make it visually appealing.
# The only buttons this script creates are st.download_button widgets.
# NOTE(review): Streamlit is expected to wrap those in a ".stDownloadButton"
# container rather than ".stButton" - confirm against the deployed Streamlit
# version. Both selectors are listed so regular st.button widgets keep the
# same look.
st.markdown(
    """
<style>
body {
    font-family: 'Arial', sans-serif;
    line-height: 1.6;
    font-size: 16px;
    color: #333;
}
.stButton>button,
.stDownloadButton>button {
    background-color: #4CAF50; /* Green */
    border: none;
    color: white;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.stButton>button:hover,
.stDownloadButton>button:hover {
    background-color: #45a049; /* Darker green */
}
</style>
""",
    unsafe_allow_html=True
)