Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pymupdf4llm | |
import tempfile | |
import pathlib | |
import markdown2 | |
from docx import Document | |
from bs4 import BeautifulSoup | |
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save it as ``Output.md``.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` (for example
            the object returned by ``st.file_uploader``).

    Returns:
        A ``(markdown_text, markdown_path)`` tuple: the Markdown string
        returned by ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of
        the ``Output.md`` file the text was written to (relative to the
        current working directory).
    """
    # The converter is handed a filesystem path, so the upload is spooled to a
    # named temporary file first. delete=False keeps the file on disk after
    # the handle is closed so it can be reopened by path.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = pathlib.Path(temp_file.name)
    try:
        with temp_file:
            temp_file.write(pdf_file.read())

        # Convert PDF to Markdown
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        # The temporary PDF used to be left behind on every upload. Removal is
        # best-effort: a failed cleanup must not mask the conversion result
        # (or the conversion error) that the caller actually cares about.
        try:
            temp_file_path.unlink()
        except OSError:
            pass

    # Save the Markdown content to a file
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode("utf-8"))

    return pdf_text, md_file_path
def _add_inline_runs(paragraph, node, bold=False, italic=False):
    """Append the text under *node* to *paragraph*, keeping bold/italic markup.

    Walks the children of a parsed HTML node recursively. Text nodes become
    runs; ``<strong>``/``<b>`` and ``<em>``/``<i>`` switch on bold/italic for
    everything nested inside them. Other tags contribute only their text.
    """
    for child in node.children:
        child_name = getattr(child, "name", None)
        if child_name is None:
            # Plain text node: emit it with the formatting inherited so far.
            run = paragraph.add_run(str(child))
            if bold:
                run.bold = True
            if italic:
                run.italic = True
        else:
            _add_inline_runs(
                paragraph,
                child,
                bold=bold or child_name in ("strong", "b"),
                italic=italic or child_name in ("em", "i"),
            )


def create_docx_from_markdown(md_content):
    """Render Markdown text into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2`` and the top-level
    HTML elements are mapped onto Word constructs:

    * ``h1``-``h6``  -> headings of the matching level
    * ``p``          -> paragraph, with bold/italic spans preserved as runs
    * ``ul`` / ``ol`` -> one "List Bullet" / "List Number" paragraph per item
    * ``pre`` / ``blockquote`` -> plain paragraph containing the block's text
    * bare ``strong`` / ``em`` -> kept from the original implementation

    Any other top-level node (including whitespace between blocks) is skipped.

    Args:
        md_content: Markdown source as a string.

    Returns:
        ``pathlib.Path`` of the written ``Output.docx`` file (relative to the
        current working directory).
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Parse the HTML so block-level elements can be walked in order
    soup = BeautifulSoup(html_content, "html.parser")

    heading_levels = {f"h{level}": level for level in range(1, 7)}

    for element in soup:
        tag = element.name  # None for bare text nodes between blocks

        if tag in heading_levels:
            doc.add_heading(element.get_text(), level=heading_levels[tag])
        elif tag == 'p':
            # Inline <strong>/<em> live inside <p>, so they are handled here
            # rather than as top-level elements.
            _add_inline_runs(doc.add_paragraph(), element)
        elif tag in ('ul', 'ol'):
            list_style = 'List Bullet' if tag == 'ul' else 'List Number'
            # recursive=False: only this list's own items; text of nested
            # lists is carried along inside the parent item's text.
            for item in element.find_all('li', recursive=False):
                doc.add_paragraph(item.get_text().strip(), style=list_style)
        elif tag in ('pre', 'blockquote'):
            doc.add_paragraph(element.get_text().strip())
        elif tag == 'strong':
            # 'Intense Quote' is the style's name; 'IntenseQuote' is its
            # style id.
            doc.add_paragraph(element.get_text(), style='Intense Quote')
        elif tag == 'em':
            p = doc.add_paragraph()
            p.add_run(element.get_text()).italic = True

    # Save the document
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)

    return docx_file_path
# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the result as Markdown and Word downloads.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        # Broad catch is deliberate: this is the UI boundary, and any failure
        # in conversion or export should surface as an on-page error message.
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content
            st.markdown("### Markdown Content Preview:")
            # The text is extracted from an untrusted uploaded PDF, so it is
            # rendered WITHOUT unsafe_allow_html: any HTML embedded in the
            # document is shown as text instead of being injected into the page.
            st.markdown(pdf_text)

            # Create a download button for the Markdown file
            st.markdown("### Download Markdown File:")
            with open(md_file_path, "rb") as file:
                st.download_button(
                    label="Download Markdown",
                    data=file,
                    file_name=md_file_path.name,
                    mime="text/markdown"
                )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            with open(docx_file_path, "rb") as file:
                st.download_button(
                    label="Download Word Document",
                    data=file,
                    file_name=docx_file_path.name,
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )
        except Exception as e:
            st.error(f"An error occurred during conversion: {e}")
# Add some styling to make it visually appealing.
# This injects a static, developer-authored <style> block, which is why
# unsafe_allow_html=True is needed on this call.
# NOTE(review): the download-button selectors assume st.download_button
# renders inside a ".stDownloadButton" container; confirm against the
# Streamlit version in use. The ".stButton" rules are kept unchanged.
st.markdown(
    """
    <style>
    body {
        font-family: 'Arial', sans-serif;
        line-height: 1.6;
        font-size: 16px;
        color: #333;
    }
    .stButton>button,
    .stDownloadButton>button {
        background-color: #4CAF50; /* Green */
        border: none;
        color: white;
        padding: 10px 20px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        font-size: 16px;
        margin: 4px 2px;
        cursor: pointer;
        border-radius: 5px;
        transition: background-color 0.3s;
    }
    .stButton>button:hover,
    .stDownloadButton>button:hover {
        background-color: #45a049; /* Darker green */
    }
    </style>
    """,
    unsafe_allow_html=True
)