ChinmayBH committed on
Commit
a6cd894
·
verified ·
1 Parent(s): 96fadd5

updated app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -250
app.py CHANGED
@@ -1,250 +0,0 @@
1
- import streamlit as st
2
- import os
3
- import json
4
- import fitz
5
- from io import BytesIO
6
- from PIL import Image
7
- import pandas as pd
8
- import tempfile
9
-
10
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
        Created if it does not exist.
    minimum_font_size: int
        Minimum font size below which the text will be ignored.
    extraction_type: str
        Type of extraction, either 'text', 'images', or 'both'. Any other
        value yields pages with empty content.

    Returns
    -------
    list
        One ``{'page': int, 'content': list}`` entry per page, in page
        order. Each content item is either ``{'type': 'text', 'content': str}``
        or ``{'type': 'image', 'path': str}``, ordered top-to-bottom and then
        left-to-right; consecutive text items are merged into one.
    """
    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_number, minimum_font_size)
                )

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, page_number, output_folder
                    )
                )

            # Reading order: top-to-bottom, then left-to-right.
            elements.sort(key=lambda e: (e['top'], e['x0']))

            extraction_data.append({
                'page': page_number + 1,
                'content': _merge_page_content(elements)
            })
    finally:
        # Release the document even when a page fails to parse.
        pdf_document.close()

    return extraction_data


def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
    """Return one text element per group of spans sharing the same top coordinate."""
    spans_by_top = {}

    for block in page.get_text("dict")["blocks"]:
        # Only blocks tagged type 0 carry "lines"/"spans" (text blocks in
        # PyMuPDF's dict output); other block types are skipped.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                # bbox[1] is the span's top edge; spans sharing it are
                # treated as one visual line.
                spans_by_top.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(spans_by_top):
        spans = spans_by_top[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_number + 1,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_number: int, output_folder: str) -> list:
    """Save every image referenced by the page to disk and return its element records."""
    elements = []

    for img_index, img in enumerate(page.get_images(full=True)):
        base_image = pdf_document.extract_image(img[0])
        # The bytes are written as-is, so name the file after the format
        # reported by PyMuPDF instead of always claiming PNG.
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{page_number + 1}_img_{img_index + 1}.{extension}"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_number + 1,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Collapse sorted elements into page content, joining adjacent text runs with a space."""
    page_content = []

    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content']
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path']
            })
    return page_content
126
-
127
def convert_to_xlsx(data: list) -> BytesIO:
    """
    Flattens page-wise extraction data into a single-sheet XLSX workbook.

    Params
    -------
    data: list
        Page records shaped like ``{'page': int, 'content': list}``, where
        each content item is ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}``. Items of any other type are
        skipped.

    Returns
    -------
    BytesIO
        In-memory workbook with one 'Extraction' sheet holding 'Page' and
        'Content' columns, rewound to position 0 so it can be read directly.
    """
    rows = []

    for item in data:
        page_number = item['page']

        for content in item['content']:
            if content['type'] == 'text':
                cell = content['content']
            elif content['type'] == 'image':
                cell = f"[Image: {content['path']}]"
            else:
                continue
            rows.append({'Page': page_number, 'Content': cell})

    # Pin the columns so the header row is written even when nothing was
    # extracted (an empty list alone would produce a column-less sheet).
    df = pd.DataFrame(rows, columns=['Page', 'Content'])

    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')

    output.seek(0)
    return output
154
-
155
def main():
    """
    Renders the Streamlit page: upload, sidebar preview, options, extraction, downloads.

    Flow
    -------
    1. Show the title, subtitle and sidebar header.
    2. When a PDF is uploaded, render the first pages (1 to 5, chosen with a
       slider) as images in the sidebar.
    3. Let the user pick the extraction type and the minimum font size.
    4. On "Start Extraction", write the upload into a temporary directory,
       run ``extract_text_images`` on it, show the result as JSON and offer
       JSON and XLSX downloads.
    5. Always render the fixed footer.

    NOTE(review): extracted image files live in the temporary directory, which
    is removed once extraction finishes, so the image paths in the JSON/XLSX
    output no longer exist afterwards.
    """
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # getvalue() returns the whole upload regardless of the current read
        # position, matching how the extraction step below reads the file.
        pdf_document = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
        try:
            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
        finally:
            # The preview document is only needed while rendering thumbnails.
            pdf_document.close()

    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:",
        ("text", "images", "both")
    )

    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:",
        min_value=1, value=2
    )

    if st.button("Start Extraction"):
        if pdf_file is None:
            st.error("Please upload a PDF file.")
        else:
            with tempfile.TemporaryDirectory() as output_folder:
                # Keep only the final path component of the client-supplied
                # name so the copy cannot land outside the temp directory.
                safe_name = os.path.basename(pdf_file.name) or "upload.pdf"
                temp_pdf_path = os.path.join(output_folder, safe_name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path,
                    output_folder,
                    minimum_font_size,
                    extraction_type
                )

                st.json(extraction_data)

                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)

                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json')

                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

    # Footer markup is kept flush-left inside the string so the Markdown
    # renderer does not treat it as an indented code block.
    st.markdown(
        """
<style>
.footer {
position: fixed;
bottom: 0;
left: 0;
width: 100%;
background-color: #F0F0F0;
font-family:cursive;
text-align: right;
padding: 5px 0;
font-size:20px;
font-weight: bold;
color: #FF0000;
}
</style>
<div class="footer">
CREATED BY: CHINMAY BHALERAO
</div>
""",
        unsafe_allow_html=True
    )


# Build the page only when run as a script (e.g. via `streamlit run`), not on import.
if __name__ == "__main__":
    main()