|
import streamlit as st |
|
import pymupdf4llm |
|
import tempfile |
|
import pathlib |
|
import markdown2 |
|
from docx import Document |
|
from bs4 import BeautifulSoup |
|
|
|
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and write it to ``Output.md``.

    The upload is first copied to a temporary ``.pdf`` file because the
    converter is called with a filesystem path.  That temporary file is
    removed again once conversion has finished, whether it succeeded or not.

    Args:
        pdf_file: Binary file-like object exposing ``read()``.

    Returns:
        A ``(pdf_text, md_file_path)`` tuple: the Markdown text produced by
        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the
        ``Output.md`` file (relative to the current working directory) that
        the text was written to.
    """
    # delete=False keeps the file on disk after the handle is closed so the
    # converter can open it by path; cleanup is therefore done by hand below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = pathlib.Path(temp_file.name)
    try:
        with temp_file:
            temp_file.write(pdf_file.read())
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        try:
            temp_file_path.unlink()
        except OSError:
            # Best-effort cleanup: a failure to delete the scratch file must
            # not mask the conversion result or the original exception.
            pass

    md_file_path = pathlib.Path("Output.md")
    # Bytes are written (not text) so newlines are stored exactly as produced.
    md_file_path.write_bytes(pdf_text.encode())

    return pdf_text, md_file_path
|
|
|
def create_docx_from_markdown(md_content):
    """Render Markdown text into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2``, parsed with
    BeautifulSoup, and each top-level HTML element is mapped onto a
    python-docx construct:

    * ``h1``-``h6``  -> heading of the matching level
    * ``p``          -> plain paragraph (inline markup is flattened to text)
    * ``ul`` / ``ol`` -> one "List Bullet" / "List Number" paragraph per
      direct ``li`` child (nested lists are flattened into the item text)
    * ``strong``     -> "Intense Quote" paragraph
    * ``em``         -> paragraph with a single italic run
    * anything else  -> plain paragraph of its text, if it has any, so that
      content is not silently lost

    Args:
        md_content: Markdown source as a string.

    Returns:
        ``pathlib.Path`` of the written ``Output.docx`` file (relative to
        the current working directory).
    """
    html_content = markdown2.markdown(md_content)

    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    soup = BeautifulSoup(html_content, "html.parser")

    heading_levels = {f"h{level}": level for level in range(1, 7)}
    list_styles = {"ul": "List Bullet", "ol": "List Number"}

    for element in soup:
        tag_name = getattr(element, "name", None)
        if tag_name is None:
            # Bare text node between block elements; nothing to emit.
            continue

        if tag_name in heading_levels:
            doc.add_heading(element.get_text(), level=heading_levels[tag_name])
        elif tag_name == 'p':
            doc.add_paragraph(element.get_text())
        elif tag_name in list_styles:
            for item in element.find_all('li', recursive=False):
                doc.add_paragraph(item.get_text().strip(), style=list_styles[tag_name])
        elif tag_name == 'strong':
            # Styles are addressed by name; the name contains a space.
            doc.add_paragraph(element.get_text(), style='Intense Quote')
        elif tag_name == 'em':
            paragraph = doc.add_paragraph()
            paragraph.add_run(element.get_text()).italic = True
        else:
            # Fallback for block tags without a dedicated mapping
            # (e.g. blockquote, pre, table): keep their text.
            text = element.get_text().strip()
            if text:
                doc.add_paragraph(text)

    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)

    return docx_file_path
|
|
|
|
|
# --- Page header and upload widget -----------------------------------------
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

pdf_input = st.file_uploader("Upload PDF", type="pdf")

# --- Conversion, preview and downloads --------------------------------------
if pdf_input is not None:
    with st.spinner("Converting PDF to Markdown..."):
        # Top-level boundary: any failure in conversion or export is shown to
        # the user instead of crashing the app.
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            st.markdown("### Markdown Content Preview:")
            # The text is derived from an untrusted upload, so it is rendered
            # as plain Markdown; raw HTML embedded in it is not interpreted.
            st.markdown(pdf_text)

            st.markdown("### Download Markdown File:")
            # Read the bytes up front so no file handle stays open while the
            # widget is being built.
            st.download_button(
                label="Download Markdown",
                data=md_file_path.read_bytes(),
                file_name=md_file_path.name,
                mime="text/markdown",
            )

            docx_file_path = create_docx_from_markdown(pdf_text)

            st.markdown("### Download Word Document:")
            st.download_button(
                label="Download Word Document",
                data=docx_file_path.read_bytes(),
                file_name=docx_file_path.name,
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        except Exception as e:
            st.error(f"An error occurred during conversion: {e}")
|
|
|
|
|
# Inject page-wide CSS.  unsafe_allow_html is required for the <style> tag and
# is safe here because the markup is a fixed literal, not user input.
# The button rules list both .stButton and .stDownloadButton: this page only
# creates download buttons, which Streamlit wraps in the latter class.
st.markdown(
    """
<style>
body {
    font-family: 'Arial', sans-serif;
    line-height: 1.6;
    font-size: 16px;
    color: #333;
}
.stButton>button,
.stDownloadButton>button {
    background-color: #4CAF50; /* Green */
    border: none;
    color: white;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.stButton>button:hover,
.stDownloadButton>button:hover {
    background-color: #45a049; /* Darker green */
}
</style>
""",
    unsafe_allow_html=True
)
|
|