Spaces:

ChinmayBH
/

PDF_DATA_EXTRACTOR_PAGEWISE

Running

App Files Files Community

ChinmayBH committed on Aug 14, 2024

Commit

a6cd894

verified ·

1 Parent(s): 96fadd5

updated app.py

Browse files

Files changed (1) hide show

app.py +0 -250

app.py CHANGED Viewed

@@ -1,250 +0,0 @@
-import streamlit as st
-import os
-import json
-import fitz
-from io import BytesIO
-from PIL import Image
-import pandas as pd
-import tempfile
-def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
-    """Return one text element per distinct span top-coordinate found on *page*."""
-    lines = {}
-    for block in page.get_text("dict")["blocks"]:
-        # Only blocks with type 0 are read for "lines"; other block types are skipped.
-        if block["type"] != 0:
-            continue
-        for line in block["lines"]:
-            for span in line["spans"]:
-                if span["size"] < minimum_font_size:
-                    continue
-                # Spans are grouped by the exact top (y0) value of their bbox.
-                lines.setdefault(span["bbox"][1], []).append(span)
-    elements = []
-    for top in sorted(lines):
-        # Order spans left to right so the joined text and the reported x0
-        # do not depend on the order in which blocks were visited.
-        spans = sorted(lines[top], key=lambda span: span["bbox"][0])
-        elements.append({
-            'type': 'text',
-            'font_size': spans[0]['size'],
-            'page': page_number + 1,
-            'content': " ".join(span['text'] for span in spans),
-            'x0': spans[0]['bbox'][0],
-            'top': top,
-        })
-    return elements
-def _collect_image_elements(pdf_document, page, page_number: int, output_folder: str) -> list:
-    """Write every image referenced by *page* into *output_folder* and return image elements."""
-    elements = []
-    for img_index, img in enumerate(page.get_images(full=True)):
-        base_image = pdf_document.extract_image(img[0])
-        # NOTE(review): an empty/None result is assumed to mean "not extractable";
-        # skip it instead of failing on the ["image"] lookup - confirm against PyMuPDF docs.
-        if not base_image:
-            continue
-        # The bytes are stored in the image's native format, so name the file
-        # after the reported extension rather than always using ".png".
-        extension = base_image.get("ext") or "png"
-        image_filename = os.path.join(
-            output_folder,
-            f"page_{page_number + 1}_img_{img_index + 1}.{extension}"
-        )
-        with open(image_filename, "wb") as img_file:
-            img_file.write(base_image["image"])
-        img_rect = page.get_image_bbox(img)
-        elements.append({
-            'type': 'image',
-            'page': page_number + 1,
-            'path': image_filename,
-            'x0': img_rect.x0,
-            'top': img_rect.y0
-        })
-    return elements
-def _merge_page_content(elements: list) -> list:
-    """Collapse consecutive text elements into one entry and keep images as separate entries."""
-    page_content = []
-    for element in elements:
-        if element['type'] == 'text':
-            if page_content and page_content[-1]['type'] == 'text':
-                page_content[-1]['content'] += " " + element['content']
-            else:
-                page_content.append({
-                    'type': 'text',
-                    'content': element['content']
-                })
-        elif element['type'] == 'image':
-            page_content.append({
-                'type': 'image',
-                'path': element['path']
-            })
-    return page_content
-def extract_text_images(
-        pdf_path: str, output_folder: str,
-        minimum_font_size: int,
-        extraction_type: str = 'both'
-        ) -> list:
-    """
-    Extracts text and/or images from a PDF and organizes them by pages.
-    Params
-    -------
-    pdf_path: str
-        Path to the input PDF file.
-    output_folder: str
-        Path to the output folder where extracted images are written.
-        It is created when it does not exist yet.
-    minimum_font_size: int
-        Minimum font size below which the text will be ignored.
-    extraction_type: str
-        Type of extraction, either 'text', 'images', or 'both'.
-    Returns
-    -------
-    list
-        One dict per page, {'page': <1-based number>, 'content': [...]}, where
-        each content entry is {'type': 'text', 'content': str} or
-        {'type': 'image', 'path': str}, ordered by (top, x0) on the page.
-    Raises
-    ------
-    ValueError
-        If extraction_type is not one of 'text', 'images' or 'both'.
-    """
-    if extraction_type not in ('text', 'images', 'both'):
-        raise ValueError(
-            f"extraction_type must be 'text', 'images' or 'both', got {extraction_type!r}"
-        )
-    os.makedirs(output_folder, exist_ok=True)
-    extraction_data = []
-    # The context manager closes the document even when a page fails to process.
-    with fitz.open(pdf_path) as pdf_document:
-        for page_number in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_number)
-            elements = []
-            if extraction_type in ('text', 'both'):
-                elements.extend(
-                    _collect_text_elements(page, page_number, minimum_font_size)
-                )
-            if extraction_type in ('images', 'both'):
-                elements.extend(
-                    _collect_image_elements(pdf_document, page, page_number, output_folder)
-                )
-            # Sort top-to-bottom, then left-to-right, before merging adjacent text.
-            elements.sort(key=lambda e: (e['top'], e['x0']))
-            extraction_data.append({
-                'page': page_number + 1,
-                'content': _merge_page_content(elements)
-            })
-    return extraction_data
-def convert_to_xlsx(data: list) -> BytesIO:
-    """
-    Flattens page-wise extraction data into a single-sheet XLSX workbook.
-    Params
-    -------
-    data: list
-        Page dicts, each with a 'page' value and a 'content' list whose
-        entries have a 'type' of 'text' (with 'content') or 'image' (with 'path').
-        Entries of any other type are skipped.
-    Returns
-    -------
-    BytesIO
-        In-memory workbook with one 'Extraction' sheet holding the columns
-        'Page' and 'Content', positioned at offset 0 so it can be read directly.
-    """
-    rows = []
-    for item in data:
-        page_number = item['page']
-        for content in item['content']:
-            if content['type'] == 'text':
-                rows.append({
-                    'Page': page_number,
-                    'Content': content['content']
-                })
-            elif content['type'] == 'image':
-                rows.append({
-                    'Page': page_number,
-                    'Content': f"[Image: {content['path']}]"
-                })
-    # Pin the columns so the header row is still written when there are no rows.
-    df = pd.DataFrame(rows, columns=['Page', 'Content'])
-    output = BytesIO()
-    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-        df.to_excel(writer, index=False, sheet_name='Extraction')
-    # Rewind so callers reading from the buffer start at the first byte.
-    output.seek(0)
-    return output
-def main():
-    """
-    Streamlit page: upload a PDF, preview its first pages, run the extraction
-    and offer the result as JSON and XLSX downloads.
-    The sidebar shows up to five rendered pages of the uploaded file. Pressing
-    "Start Extraction" copies the upload into a temporary directory, calls
-    extract_text_images() on it and displays the returned data.
-    """
-    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
-    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
-    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
-    pdf_file = st.file_uploader("Upload PDF", type="pdf")
-    if pdf_file is not None:
-        num_pages_to_preview = st.sidebar.slider(
-            "Select number of pages to preview:",
-            min_value=1, max_value=5, value=1
-        )
-        # getvalue() is used here as well as for the extraction below, so the
-        # preview does not depend on (or move) the upload's read position.
-        # The context manager closes the preview document once rendering is done.
-        with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as pdf_document:
-            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
-                page = pdf_document.load_page(page_num)
-                pix = page.get_pixmap()
-                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
-    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
-    extraction_type = st.selectbox(
-        "Choose extraction type:",
-        ("text", "images", "both")
-    )
-    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
-    minimum_font_size = st.number_input(
-        "Minimum font size to extract:",
-        min_value=1, value=2
-    )
-    if st.button("Start Extraction"):
-        if pdf_file is not None:
-            # NOTE: image paths recorded in the extraction data point into this
-            # temporary directory, which is removed when the with-block exits.
-            with tempfile.TemporaryDirectory() as output_folder:
-                # The file name comes from the client; keep only its final
-                # component so it cannot address a path outside output_folder.
-                safe_name = os.path.basename(pdf_file.name) or "uploaded.pdf"
-                temp_pdf_path = os.path.join(output_folder, safe_name)
-                with open(temp_pdf_path, "wb") as f:
-                    f.write(pdf_file.getvalue())
-                extraction_data = extract_text_images(
-                    temp_pdf_path,
-                    output_folder,
-                    minimum_font_size,
-                    extraction_type
-                )
-                st.json(extraction_data)
-                xlsx_data = convert_to_xlsx(extraction_data)
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.download_button(
-                        label="Download JSON",
-                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
-                        file_name='extraction_data.json',
-                        mime='application/json')
-                with col2:
-                    st.download_button(
-                        label="Download XLSX",
-                        data=xlsx_data,
-                        file_name='extraction_data.xlsx',
-                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
-        else:
-            st.error("Please upload a PDF file.")
-    # Fixed footer injected as raw HTML/CSS at the bottom of the page.
-    st.markdown(
-        """
-        <style>
-        .footer {
-            position: fixed;
-            bottom: 0;
-            left: 0;
-            width: 100%;
-            background-color: #F0F0F0;
-            font-family:cursive;
-            text-align: right;
-            padding: 5px 0;
-            font-size:20px;
-            font-weight: bold;
-            color: #FF0000;
-        }
-        </style>
-        <div class="footer">
-            CREATED BY: CHINMAY BHALERAO
-        </div>
-        """,
-        unsafe_allow_html=True
-    )
-# Script entry point: build the page only when this file is executed directly, not on import.
-if __name__ == "__main__":
-    main()