# File size: 4,269 Bytes
# 77f71a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
import pymupdf4llm
import tempfile
import pathlib
import markdown2
from docx import Document
from bs4 import BeautifulSoup

def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and write it to ``Output.md``.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; the app passes
            the object returned by ``st.file_uploader``.

    Returns:
        tuple: ``(pdf_text, md_file_path)`` where ``pdf_text`` is the Markdown
        string returned by ``pymupdf4llm.to_markdown`` and ``md_file_path`` is
        the ``pathlib.Path`` of the ``Output.md`` file written in the current
        working directory.
    """
    # to_markdown() is handed a filesystem path, so the upload is spooled to
    # disk first. delete=False keeps the file around after the handle is
    # closed so it can be reopened by name; it is removed explicitly below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = pathlib.Path(temp_file.name)
    try:
        with temp_file:
            temp_file.write(pdf_file.read())

        # Convert PDF to Markdown
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        # Without this every upload left a stray PDF in the temp directory.
        # Cleanup is best-effort: a failure to delete must not mask the
        # conversion result or the original conversion error.
        try:
            temp_file_path.unlink()
        except OSError:
            pass

    # Save the Markdown content to a file (bytes, so newlines are untouched)
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode("utf-8"))

    return pdf_text, md_file_path

def _list_item_text(item):
    """Return the text of one ``<li>`` tag, leaving out any nested sub-list."""
    parts = []
    for child in item.children:
        child_name = getattr(child, "name", None)
        if child_name in ("ul", "ol"):
            # Nested items are emitted separately by the caller.
            continue
        parts.append(str(child) if child_name is None else child.get_text())
    return "".join(parts).strip()


def create_docx_from_markdown(md_content):
    """Render Markdown text into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2``, parsed with
    BeautifulSoup, and each top-level HTML element is mapped to a python-docx
    heading or paragraph. The document always starts with a fixed
    'Converted PDF Content' heading.

    Args:
        md_content: Markdown source text.

    Returns:
        pathlib.Path: Path of the saved ``Output.docx`` (relative to the
        current working directory).
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Use BeautifulSoup to parse the HTML and extract text
    soup = BeautifulSoup(html_content, "html.parser")

    heading_levels = {f"h{level}": level for level in range(1, 7)}

    # Walk the top-level nodes only; inline markup inside a block is flattened
    # to plain text by get_text().
    for element in soup:
        tag = getattr(element, "name", None)
        if tag is None:
            # Bare text between blocks (typically just newlines).
            continue

        if tag in heading_levels:
            doc.add_heading(element.get_text(), level=heading_levels[tag])
        elif tag == 'p':
            doc.add_paragraph(element.get_text())
        elif tag == 'strong':
            # 'Intense Quote' is the style *name*; the former 'IntenseQuote'
            # is a style id, which python-docx only resolves through a
            # deprecated fallback lookup.
            doc.add_paragraph(element.get_text(), style='Intense Quote')
        elif tag == 'em':
            p = doc.add_paragraph()
            p.add_run(element.get_text()).italic = True
        elif tag in ('ul', 'ol'):
            # Lists used to be dropped entirely. Nested items are flattened
            # into the same list style as the outermost list.
            list_style = 'List Bullet' if tag == 'ul' else 'List Number'
            for item in element.find_all('li'):
                doc.add_paragraph(_list_item_text(item), style=list_style)
        else:
            # Any other block (pre, blockquote, table, ...) is kept as a plain
            # paragraph so no extracted content is silently lost.
            text = element.get_text().strip()
            if text:
                doc.add_paragraph(text)

    # Save the document
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)

    return docx_file_path

# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the Markdown and a Word rendering of it as downloads.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content.
            # The text comes from an arbitrary uploaded PDF, so it is rendered
            # WITHOUT unsafe_allow_html: any HTML embedded in the document is
            # escaped instead of being injected into the page.
            st.markdown("### Markdown Content Preview:")
            st.markdown(pdf_text)

            # Create a download button for the Markdown file
            st.markdown("### Download Markdown File:")
            with open(md_file_path, "rb") as file:
                st.download_button(
                    label="Download Markdown",
                    data=file,
                    file_name=md_file_path.name,
                    mime="text/markdown"
                )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            with open(docx_file_path, "rb") as file:
                st.download_button(
                    label="Download Word Document",
                    data=file,
                    file_name=docx_file_path.name,
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

        except Exception as e:
            # Top-level UI boundary: report any conversion failure to the user
            # instead of letting the script run abort.
            st.error(f"An error occurred during conversion: {e}")

# Cosmetic CSS for the page: base typography plus a green style (with a darker
# hover shade) for elements matching `.stButton>button`.
# NOTE(review): this script only renders download buttons; confirm that the
# `.stButton` selectors actually match them in the Streamlit version in use.
_PAGE_STYLE = """
    <style>
    body {
        font-family: 'Arial', sans-serif;
        line-height: 1.6;
        font-size: 16px;
        color: #333;
    }
    .stButton>button {
        background-color: #4CAF50; /* Green */
        border: none;
        color: white;
        padding: 10px 20px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        font-size: 16px;
        margin: 4px 2px;
        cursor: pointer;
        border-radius: 5px;
        transition: background-color 0.3s;
    }
    .stButton>button:hover {
        background-color: #45a049; /* Darker green */
    }
    </style>
    """

# The <style> tag must be passed through as raw HTML, hence unsafe_allow_html.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)