abhinavsarkar committed on
Commit
77f71a3
1 Parent(s): 1a4e7ec

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pymupdf4llm
3
+ import tempfile
4
+ import pathlib
5
+ import markdown2
6
+ from docx import Document
7
+ from bs4 import BeautifulSoup
8
+
9
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save it as ``Output.md``.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; the app passes
            the object returned by ``st.file_uploader``.

    Returns:
        A ``(pdf_text, md_file_path)`` tuple: the Markdown string returned by
        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` it was written to.
    """
    # to_markdown is handed a filesystem path here, so the upload is spooled
    # to a named temporary file first.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = pathlib.Path(temp_file.name)
    try:
        # Close the handle before conversion so the file is fully flushed
        # (and re-openable on platforms that lock open files).
        with temp_file:
            temp_file.write(pdf_file.read())

        # Convert PDF to Markdown
        pdf_text = pymupdf4llm.to_markdown(str(temp_file_path))
    finally:
        # delete=False means nothing else cleans this up; without the unlink
        # every upload would leave a stray PDF copy in the temp directory.
        temp_file_path.unlink(missing_ok=True)

    # Save the Markdown content to a file.
    # NOTE(review): the fixed name in the working directory is shared by all
    # sessions of the app; kept because callers use the returned path/name.
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode("utf-8"))

    return pdf_text, md_file_path
23
+
24
def _append_inline_runs(paragraph, node, bold=False, italic=False):
    """Append the text under ``node`` to ``paragraph``, keeping bold/italic emphasis."""
    tag = getattr(node, "name", None)
    if tag is None:
        # Text node: emit it as a run carrying the emphasis collected so far.
        text = str(node)
        if text:
            run = paragraph.add_run(text)
            # Only set the flags when needed so plain runs keep inheriting
            # their formatting from the paragraph style.
            if bold:
                run.bold = True
            if italic:
                run.italic = True
        return

    bold = bold or tag in ("strong", "b")
    italic = italic or tag in ("em", "i")
    for child in node:
        _append_inline_runs(paragraph, child, bold, italic)


def create_docx_from_markdown(md_content):
    """Render Markdown text into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2`` and the top-level
    HTML elements are mapped onto document content:

    * ``h1``-``h6`` become headings of the matching level;
    * ``ul`` / ``ol`` items become "List Bullet" / "List Number" paragraphs
      (nested lists are flattened into their parent item);
    * ``p`` and any other non-empty block become a paragraph in which
      ``strong``/``b`` and ``em``/``i`` text is kept as bold/italic runs.

    Args:
        md_content: Markdown source text.

    Returns:
        ``pathlib.Path`` of the saved ``.docx`` file.
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Parse the HTML so it can be walked element by element
    soup = BeautifulSoup(html_content, "html.parser")

    heading_levels = {f"h{level}": level for level in range(1, 7)}
    list_styles = {"ul": "List Bullet", "ol": "List Number"}

    for element in soup:
        tag = getattr(element, "name", None)
        if tag is None:
            # Bare text between blocks (the newlines separating elements).
            continue

        if tag in heading_levels:
            doc.add_heading(element.get_text(), level=heading_levels[tag])
        elif tag in list_styles:
            for item in element:
                if getattr(item, "name", None) == "li":
                    _append_inline_runs(
                        doc.add_paragraph(style=list_styles[tag]), item
                    )
        elif tag == "p" or element.get_text().strip():
            # Inline emphasis lives inside block elements, never at the top
            # level, so it has to be handled while walking the block.
            _append_inline_runs(doc.add_paragraph(), element)

    # Save the document.
    # NOTE(review): fixed name in the working directory, shared by all
    # sessions; kept because callers use the returned path/name.
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)

    return docx_file_path
56
+
57
# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the result as Markdown and Word downloads.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content.
            # SECURITY: pdf_text comes from an arbitrary uploaded PDF, so it
            # is rendered with Streamlit's default HTML escaping. Raw tags in
            # the extracted text show up literally instead of being injected
            # into the page.
            st.markdown("### Markdown Content Preview:")
            st.markdown(pdf_text)

            # Create a download button for the Markdown file
            st.markdown("### Download Markdown File:")
            with open(md_file_path, "rb") as file:
                st.download_button(
                    label="Download Markdown",
                    data=file,
                    file_name=md_file_path.name,
                    mime="text/markdown"
                )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            with open(docx_file_path, "rb") as file:
                st.download_button(
                    label="Download Word Document",
                    data=file,
                    file_name=docx_file_path.name,
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

        except Exception as e:
            # Top-level UI boundary: surface any conversion failure to the
            # user instead of crashing the app with a traceback.
            st.error(f"An error occurred during conversion: {e}")
99
+
100
# Add some styling to make it visually appealing.
# The only buttons this app renders are download buttons, so the button rules
# also target `.stDownloadButton`.
# NOTE(review): these class names are Streamlit implementation details, not a
# documented API — confirm against the deployed Streamlit version.
st.markdown(
    """
    <style>
    body {
        font-family: 'Arial', sans-serif;
        line-height: 1.6;
        font-size: 16px;
        color: #333;
    }
    .stButton>button,
    .stDownloadButton>button {
        background-color: #4CAF50; /* Green */
        border: none;
        color: white;
        padding: 10px 20px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        font-size: 16px;
        margin: 4px 2px;
        cursor: pointer;
        border-radius: 5px;
        transition: background-color 0.3s;
    }
    .stButton>button:hover,
    .stDownloadButton>button:hover {
        background-color: #45a049; /* Darker green */
    }
    </style>
    """,
    unsafe_allow_html=True
)