|
import json
import os
import tempfile
import zipfile
from collections import defaultdict
from io import BytesIO

import fitz
import pandas as pd
import streamlit as st
from PIL import Image
|
|
|
def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
    """Build one positioned text element per distinct span top-coordinate on a page.

    Spans whose font size is below ``minimum_font_size`` are dropped. The
    remaining spans are grouped by the exact ``bbox[1]`` value, and each group
    becomes one element whose content is the span texts joined with spaces.
    """
    spans_by_top = defaultdict(list)
    for block in page.get_text("dict")["blocks"]:
        # Only blocks with type 0 carry the "lines"/"spans" structure read here.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                spans_by_top[span["bbox"][1]].append(span)

    elements = []
    for top in sorted(spans_by_top):
        spans = spans_by_top[top]
        elements.append({
            'type': 'text',
            # Font size and left edge are taken from the first span collected
            # for this line.
            'font_size': spans[0]['size'],
            'page': page_number + 1,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_number: int, output_folder: str) -> list:
    """Write every image referenced by a page to disk and return positioned elements."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True)):
        base_image = pdf_document.extract_image(img[0])
        # The extracted bytes keep the embedded format (JPEG, PNG, ...), so the
        # file suffix must come from the reported extension rather than being
        # fixed to ".png".
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{page_number + 1}_img_{img_index + 1}.{extension}"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_number + 1,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0,
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Collapse consecutive text elements into one entry; keep images separate."""
    page_content = []
    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content'],
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path'],
            })
    return page_content


def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory that receives the extracted image files; it
            is created when missing.
        minimum_font_size: Spans with a smaller font size are ignored.
        extraction_type: 'text', 'images' or 'both'. Any other value selects
            neither branch, so every page comes back with empty content.

    Returns:
        A list with one ``{'page': <1-based number>, 'content': [...]}`` entry
        per page. Content entries are ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}``, ordered by top coordinate and then
        by left edge, with adjacent text entries merged.
    """
    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_number, minimum_font_size)
                )

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, page_number, output_folder
                    )
                )

            # Top-to-bottom, then left-to-right.
            elements.sort(key=lambda e: (e['top'], e['x0']))

            extraction_data.append({
                'page': page_number + 1,
                'content': _merge_page_content(elements),
            })
    finally:
        # Release the document even when a page fails to extract.
        pdf_document.close()

    return extraction_data
|
|
|
def convert_to_xlsx(data: list) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Args:
        data: Iterable of page records shaped like
            ``{'page': int, 'content': [entry, ...]}`` where each entry is
            ``{'type': 'text', 'content': str}`` or
            ``{'type': 'image', 'path': str}``. Entries of any other type are
            skipped.

    Returns:
        An in-memory workbook, rewound to position 0, holding a single sheet
        named 'Extraction' with the columns 'Page' and 'Content'. Image
        entries appear as ``[Image: <path>]``.
    """
    rows = []

    for item in data:
        page_number = item['page']

        for content in item['content']:
            if content['type'] == 'text':
                cell = content['content']
            elif content['type'] == 'image':
                cell = f"[Image: {content['path']}]"
            else:
                continue
            rows.append({'Page': page_number, 'Content': cell})

    # Pinning the columns keeps the header row even when nothing was extracted.
    df = pd.DataFrame(rows, columns=['Page', 'Content'])

    output = BytesIO()
    try:
        writer = pd.ExcelWriter(output, engine='xlsxwriter')
    except ImportError:
        # xlsxwriter is an optional dependency; let pandas pick whichever
        # xlsx engine is installed instead of failing outright.
        writer = pd.ExcelWriter(output)

    with writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')

    output.seek(0)
    return output
|
|
|
def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Bundle the extraction results into an in-memory ZIP archive.

    The archive holds ``extraction_data.json`` (the data serialized as UTF-8
    JSON with a 4-space indent) followed by every image file referenced by an
    ``'image'`` entry, stored under its base file name. ``output_folder`` is
    accepted but not read by this function; image locations come from the
    ``'path'`` values inside ``extraction_data``.

    Returns a ``BytesIO`` positioned at the start of the archive.
    """
    serialized = json.dumps(extraction_data, ensure_ascii=False, indent=4)
    image_paths = [
        entry['path']
        for page_record in extraction_data
        for entry in page_record['content']
        if entry['type'] == 'image'
    ]

    archive_buffer = BytesIO()
    with zipfile.ZipFile(archive_buffer, "w") as archive:
        archive.writestr("extraction_data.json", serialized.encode('utf-8'))
        for path in image_paths:
            archive.write(path, os.path.basename(path))

    archive_buffer.seek(0)
    return archive_buffer
|
|
|
def main():
    """Render the Streamlit page: upload, sidebar preview, extraction and downloads.

    Flow:
      1. Show the title banners and a PDF uploader.
      2. When a file is uploaded, render up to five pages into the sidebar.
      3. Collect the extraction type, minimum font size and an output folder.
      4. On "Start Extraction", run the extraction inside a temporary directory
         and offer the result as JSON, XLSX and (when images were requested)
         a ZIP download.
      5. Always render the fixed footer.
    """
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # getvalue() returns the whole upload regardless of the current stream
        # position, matching how the extraction path below reads the file.
        pdf_document = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
        try:
            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
        finally:
            # The preview document is only needed while rendering thumbnails.
            pdf_document.close()

    # NOTE(review): the pasted source lost its indentation. The controls are
    # placed outside the upload check because the validation below reports a
    # missing upload, which is only reachable that way -- confirm against the
    # original layout.
    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:",
        ("text", "images", "both")
    )

    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:",
        min_value=1, value=2
    )

    # Only checked for being non-empty below; extraction itself writes into a
    # temporary directory.
    output_folder = st.text_input("Output folder path:")

    if st.button("Start Extraction"):
        if pdf_file is not None and output_folder:
            with tempfile.TemporaryDirectory() as temp_dir:
                # The upload name is client-supplied; keep only its final path
                # component so the file is always written inside temp_dir.
                safe_name = os.path.basename(pdf_file.name) or "uploaded.pdf"
                temp_pdf_path = os.path.join(temp_dir, safe_name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path,
                    temp_dir,
                    minimum_font_size,
                    extraction_type
                )

                st.json(extraction_data)

                if extraction_type in ('images', 'both'):
                    # Must run while temp_dir still exists: the ZIP is built
                    # from the image files written there.
                    zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
                    st.download_button(
                        label="Download ZIP",
                        data=zip_data,
                        file_name='extraction_data.zip',
                        mime='application/zip'
                    )

                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)
                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json'
                    )
                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                    )
        else:
            st.error("Please upload a PDF file and provide an output folder path.")

    st.markdown(
        """
        <style>
        .footer {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            background-color: #F0F0F0;
            font-family:cursive;
            text-align: right;
            padding: 5px 0;
            font-size:20px;
            font-weight: bold;
            color: #FF0000;
        }
        </style>
        <div class="footer">
            CREATED BY: CHINMAY BHALERAO
        </div>
        """,
        unsafe_allow_html=True
    )
|
|
|
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|