Spaces:

abhinavsarkar
/

StructuredPDFParser

Sleeping

App Files Files Community

abhinavsarkar committed on Oct 31, 2024

Commit

77f71a3

verified ·

1 Parent(s): 1a4e7ec

Create app.py

Browse files

Files changed (1) hide show

app.py +130 -0

app.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import streamlit as st
+import pymupdf4llm
+import tempfile
+import pathlib
+import markdown2
+from docx import Document
+from bs4 import BeautifulSoup
+def pdf_to_markdown(pdf_file):
+    # Create a temporary file to save the uploaded PDF
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+        temp_file.write(pdf_file.read())
+        temp_file_path = temp_file.name
+    # Convert PDF to Markdown
+    pdf_text = pymupdf4llm.to_markdown(temp_file_path)
+    # Save the Markdown content to a file
+    md_file_path = pathlib.Path("Output.md")
+    md_file_path.write_bytes(pdf_text.encode())
+    return pdf_text, md_file_path
+def create_docx_from_markdown(md_content):
+    # Convert Markdown to HTML
+    html_content = markdown2.markdown(md_content)
+    # Create a new Document
+    doc = Document()
+    doc.add_heading('Converted PDF Content', level=1)
+    # Use BeautifulSoup to parse the HTML and extract text
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Add the rendered HTML content to the document
+    for element in soup:
+        if element.name == 'h1':
+            doc.add_heading(element.get_text(), level=1)
+        elif element.name == 'h2':
+            doc.add_heading(element.get_text(), level=2)
+        elif element.name == 'h3':
+            doc.add_heading(element.get_text(), level=3)
+        elif element.name == 'p':
+            doc.add_paragraph(element.get_text())
+        elif element.name == 'strong':
+            doc.add_paragraph(element.get_text(), style='IntenseQuote')
+        elif element.name == 'em':
+            p = doc.add_paragraph()
+            p.add_run(element.get_text()).italic = True
+    # Save the document
+    docx_file_path = pathlib.Path("Output.docx")
+    doc.save(docx_file_path)
+    return docx_file_path
+# Streamlit application
+st.title("📄 Structured PDF Data Extractor")
+st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")
+# File uploader for PDF
+pdf_input = st.file_uploader("Upload PDF", type="pdf")
+if pdf_input is not None:
+    # Convert to Markdown when the PDF is uploaded
+    with st.spinner("Converting PDF to Markdown..."):
+        try:
+            pdf_text, md_file_path = pdf_to_markdown(pdf_input)
+            # Display the Markdown content
+            st.markdown("### Markdown Content Preview:", unsafe_allow_html=True)
+            st.markdown(pdf_text, unsafe_allow_html=True)
+            # Create a download button for the Markdown file
+            st.markdown("### Download Markdown File:")
+            with open(md_file_path, "rb") as file:
+                st.download_button(
+                    label="Download Markdown",
+                    data=file,
+                    file_name=md_file_path.name,
+                    mime="text/markdown"
+                )
+            # Create the .docx file from rendered Markdown content
+            docx_file_path = create_docx_from_markdown(pdf_text)
+            # Create a download button for the .docx file
+            st.markdown("### Download Word Document:")
+            with open(docx_file_path, "rb") as file:
+                st.download_button(
+                    label="Download Word Document",
+                    data=file,
+                    file_name=docx_file_path.name,
+                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                )
+        except Exception as e:
+            st.error(f"An error occurred during conversion: {e}")
+# Add some styling to make it visually appealing
+st.markdown(
+    """
+    <style>
+    body {
+        font-family: 'Arial', sans-serif;
+        line-height: 1.6;
+        font-size: 16px;
+        color: #333;
+    }
+    .stButton>button {
+        background-color: #4CAF50; /* Green */
+        border: none;
+        color: white;
+        padding: 10px 20px;
+        text-align: center;
+        text-decoration: none;
+        display: inline-block;
+        font-size: 16px;
+        margin: 4px 2px;
+        cursor: pointer;
+        border-radius: 5px;
+        transition: background-color 0.3s;
+    }
+    .stButton>button:hover {
+        background-color: #45a049; /* Darker green */
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)