Spaces:
Sleeping
Sleeping
File size: 4,269 Bytes
77f71a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
import pymupdf4llm
import tempfile
import pathlib
import markdown2
from docx import Document
from bs4 import BeautifulSoup
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save it as ``Output.md``.

    Args:
        pdf_file: Binary file-like object; everything ``read()`` returns
            from its current position is treated as the PDF data.

    Returns:
        A ``(markdown_text, markdown_path)`` tuple: the string returned by
        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the
        ``Output.md`` file written to the current working directory.
    """
    # The converter is handed a filesystem path, so spool the upload to disk.
    # The handle is closed before conversion so the file can be reopened on
    # platforms that refuse to open a file that is still held open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(pdf_file.read())
        temp_file_path = pathlib.Path(temp_file.name)

    try:
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        # delete=False means nothing else removes the spooled copy; without
        # this every upload would leave a PDF behind in the temp directory.
        try:
            temp_file_path.unlink()
        except OSError:
            # Best-effort cleanup: never mask the conversion result or error.
            pass

    # NOTE(review): the fixed file name is shared by every session running
    # from the same working directory — confirm that is acceptable.
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode())
    return pdf_text, md_file_path
def create_docx_from_markdown(md_content):
    """Render Markdown into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2`` and each top-level
    HTML element is mapped onto the document:

    * ``h1``-``h6`` become headings of the matching level;
    * ``p`` becomes a paragraph whose nested ``strong``/``b`` and
      ``em``/``i`` text is kept as bold/italic runs;
    * each direct ``li`` of a ``ul``/``ol`` becomes a bulleted/numbered
      paragraph (nested lists are flattened into their parent item's text);
    * any other element that contains text (code blocks, blockquotes, ...)
      is added as a plain paragraph so its content is not dropped.

    Args:
        md_content: Markdown source text.

    Returns:
        ``pathlib.Path`` of the ``Output.docx`` file written to the current
        working directory.
    """
    # Local import: the file itself only imports BeautifulSoup from bs4.
    from bs4 import CData, NavigableString

    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")

    def add_inline_runs(paragraph, node, bold=False, italic=False):
        """Append node's text to paragraph, carrying bold/italic from nested tags."""
        for child in node.children:
            if isinstance(child, NavigableString):
                # Same filter get_text() applies by default: plain text and
                # CDATA only, so comments and similar nodes are left out.
                if type(child) in (NavigableString, CData):
                    run = paragraph.add_run(str(child))
                    # Only ever switch formatting on; leaving it unset lets
                    # the run inherit from the paragraph style.
                    if bold:
                        run.bold = True
                    if italic:
                        run.italic = True
                continue
            add_inline_runs(
                paragraph,
                child,
                bold=bold or child.name in ("strong", "b"),
                italic=italic or child.name in ("em", "i"),
            )

    html_content = markdown2.markdown(md_content)

    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    soup = BeautifulSoup(html_content, "html.parser")

    for element in soup.children:
        tag = getattr(element, "name", None)
        if tag is None:
            # Bare strings at the top level (the whitespace between blocks).
            continue

        if tag in heading_tags:
            doc.add_heading(element.get_text(), level=int(tag[1]))
        elif tag == "p":
            # Inline emphasis lives inside <p>, so it has to be walked here
            # rather than matched as a top-level element.
            add_inline_runs(doc.add_paragraph(), element)
        elif tag in ("ul", "ol"):
            list_style = "List Bullet" if tag == "ul" else "List Number"
            for item in element.find_all("li", recursive=False):
                doc.add_paragraph(item.get_text().strip(), style=list_style)
        elif tag == "strong":
            # Style is looked up by its name; the id form 'IntenseQuote' is
            # deprecated in python-docx.
            doc.add_paragraph(element.get_text(), style="Intense Quote")
        elif tag == "em":
            doc.add_paragraph().add_run(element.get_text()).italic = True
        else:
            # pre, blockquote, table, ...: keep the text instead of losing it.
            text = element.get_text().strip()
            if text:
                doc.add_paragraph(text)

    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)
    return docx_file_path
# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the Markdown and a Word rendering of it as downloads.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content. The text comes from an uploaded
            # (untrusted) PDF, so it is rendered WITHOUT unsafe_allow_html:
            # any HTML embedded in the document is shown as text instead of
            # being injected into the page.
            st.markdown("### Markdown Content Preview:")
            st.markdown(pdf_text)

            # Create a download button for the Markdown file
            st.markdown("### Download Markdown File:")
            st.download_button(
                label="Download Markdown",
                data=md_file_path.read_bytes(),
                file_name=md_file_path.name,
                mime="text/markdown",
            )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            st.download_button(
                label="Download Word Document",
                data=docx_file_path.read_bytes(),
                file_name=docx_file_path.name,
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        except Exception as e:
            # Top-level UI boundary: report the failure to the user instead
            # of letting the script crash with a traceback.
            st.error(f"An error occurred during conversion: {e}")

# Add some styling to make it visually appealing. The only buttons on this
# page are download buttons, which Streamlit wraps in .stDownloadButton, so
# both button wrappers are targeted. This string is static, trusted markup,
# which is why unsafe_allow_html is acceptable here.
st.markdown(
    """
<style>
body {
    font-family: 'Arial', sans-serif;
    line-height: 1.6;
    font-size: 16px;
    color: #333;
}
.stButton>button,
.stDownloadButton>button {
    background-color: #4CAF50; /* Green */
    border: none;
    color: white;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.stButton>button:hover,
.stDownloadButton>button:hover {
    background-color: #45a049; /* Darker green */
}
</style>
""",
    unsafe_allow_html=True
)
|