# Source: Hugging Face Space abhinavsarkar/StructuredPDFParser (status: Sleeping)
# Revision 77f71a3; original file size: 4,269 bytes.

import streamlit as st
import pymupdf4llm
import tempfile
import pathlib
import markdown2
from docx import Document
from bs4 import BeautifulSoup

def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save it as ``Output.md``.

    Args:
        pdf_file: A binary file-like object exposing ``read()`` (for example
            the object returned by ``st.file_uploader``).

    Returns:
        A ``(pdf_text, md_file_path)`` tuple: the Markdown string produced by
        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the file it
        was written to (``Output.md`` in the current working directory).
    """
    # pymupdf4llm is given a filesystem path, so the upload is spooled to disk
    # first. delete=False lets the file be closed before it is reopened by
    # path; removal is therefore done explicitly in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = pathlib.Path(temp_file.name)
    try:
        with temp_file:
            temp_file.write(pdf_file.read())

        # Convert PDF to Markdown
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        # Always remove the temporary copy, even when conversion fails, so
        # repeated uploads do not accumulate PDFs in the temp directory.
        temp_file_path.unlink(missing_ok=True)

    # Save the Markdown content to a file. Bytes are written (rather than
    # text) so newlines are not translated on any platform.
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode("utf-8"))

    return pdf_text, md_file_path

# Maps HTML heading tag names to python-docx heading levels.
_HEADING_LEVELS = {f"h{level}": level for level in range(1, 7)}


def _add_inline_runs(paragraph, node, bold=False, italic=False):
    """Append the text under ``node`` to ``paragraph``, keeping bold/italic."""
    for child in node.children:
        tag_name = getattr(child, "name", None)
        if tag_name is None:
            # Text node: emit it with whatever emphasis is currently active.
            text = str(child)
            if text:
                run = paragraph.add_run(text)
                if bold:
                    run.bold = True
                if italic:
                    run.italic = True
        elif tag_name in ("ul", "ol"):
            # Nested lists are emitted as their own paragraphs by _add_list.
            continue
        else:
            _add_inline_runs(
                paragraph,
                child,
                bold=bold or tag_name in ("strong", "b"),
                italic=italic or tag_name in ("em", "i"),
            )


def _add_list(doc, list_tag, depth=1):
    """Add each ``<li>`` of ``list_tag`` as a list paragraph, recursing into nested lists."""
    base_style = "List Number" if list_tag.name == "ol" else "List Bullet"
    # The list styles used here are numbered up to 3 levels deep.
    style = base_style if depth == 1 else f"{base_style} {min(depth, 3)}"
    for item in list_tag.find_all("li", recursive=False):
        _add_inline_runs(doc.add_paragraph(style=style), item)
        for nested in item.find_all(["ul", "ol"], recursive=False):
            _add_list(doc, nested, depth + 1)


def _add_block(doc, element):
    """Translate one top-level HTML node into the matching docx content."""
    tag_name = getattr(element, "name", None)
    if tag_name is None:
        # Bare strings between block elements were ignored by the original
        # loop as well; they are skipped here too.
        return

    if tag_name in _HEADING_LEVELS:
        doc.add_heading(element.get_text(), level=_HEADING_LEVELS[tag_name])
    elif tag_name == "p":
        _add_inline_runs(doc.add_paragraph(), element)
    elif tag_name in ("ul", "ol"):
        _add_list(doc, element)
    elif tag_name == "blockquote":
        # One quote paragraph per inner <p>; fall back to the blockquote
        # itself when it holds bare text.
        for part in element.find_all("p", recursive=False) or [element]:
            _add_inline_runs(doc.add_paragraph(style="Quote"), part)
    elif tag_name == "pre":
        # Code blocks: keep the text verbatim, minus the trailing newline.
        doc.add_paragraph(element.get_text().rstrip("\n"))
    elif element.get_text().strip():
        # Any other tag that carries text (including a stray top-level
        # <strong>/<em>) becomes a paragraph instead of being dropped.
        _add_inline_runs(
            doc.add_paragraph(),
            element,
            bold=tag_name in ("strong", "b"),
            italic=tag_name in ("em", "i"),
        )


def create_docx_from_markdown(md_content):
    """Render Markdown into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2`` and each top-level
    HTML element is mapped onto docx content: ``h1``-``h6`` become headings,
    paragraphs keep their inline bold/italic runs, ``ul``/``ol`` become list
    paragraphs, ``blockquote`` becomes quote paragraphs and ``pre`` becomes a
    plain paragraph. Inline ``<strong>``/``<em>`` are handled inside their
    parent block, because they are nested there rather than being top-level
    nodes.

    Args:
        md_content: Markdown source text.

    Returns:
        ``pathlib.Path`` of the saved document (``Output.docx`` in the
        current working directory).
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Parse the HTML and translate each top-level block in document order
    soup = BeautifulSoup(html_content, "html.parser")
    for element in soup.children:
        _add_block(doc, element)

    # Save the document
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)

    return docx_file_path

# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the Markdown and a Word rendering of it as downloads.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        # Top-level boundary of the app: any conversion failure is reported
        # to the user through st.error.
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content. The text is derived from an
            # uploaded (untrusted) PDF, so it is rendered WITHOUT
            # unsafe_allow_html to keep embedded HTML from being injected
            # into the page.
            st.markdown("### Markdown Content Preview:")
            st.markdown(pdf_text)

            # Create a download button for the Markdown file. The bytes are
            # read up front so no file handle stays open across the call.
            st.markdown("### Download Markdown File:")
            st.download_button(
                label="Download Markdown",
                data=md_file_path.read_bytes(),
                file_name=md_file_path.name,
                mime="text/markdown"
            )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            st.download_button(
                label="Download Word Document",
                data=docx_file_path.read_bytes(),
                file_name=docx_file_path.name,
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )

        except Exception as e:
            st.error(f"An error occurred during conversion: {e}")

# Add some styling to make it visually appealing.
# The only buttons this app renders are download buttons, so the rules target
# the download-button container as well as the regular button container.
# NOTE(review): `.stDownloadButton` is the container class Streamlit is
# expected to use for st.download_button — confirm against the deployed
# Streamlit version.
st.markdown(
    """
    <style>
    body {
        font-family: 'Arial', sans-serif;
        line-height: 1.6;
        font-size: 16px;
        color: #333;
    }
    .stButton>button,
    .stDownloadButton>button {
        background-color: #4CAF50; /* Green */
        border: none;
        color: white;
        padding: 10px 20px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        font-size: 16px;
        margin: 4px 2px;
        cursor: pointer;
        border-radius: 5px;
        transition: background-color 0.3s;
    }
    .stButton>button:hover,
    .stDownloadButton>button:hover {
        background-color: #45a049; /* Darker green */
    }
    </style>
    """,
    unsafe_allow_html=True
)