import gradio as gr
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
from io import BytesIO
import pandas as pd
import os
import numpy as np
import google.generativeai as genai
import openai
import base64
import requests
import tempfile
import ast

gemini_api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=gemini_api_key)

openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key


# Define the model extraction functions
def extract_bounding_box_pymupdf(pdf_content):
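    """Return one list of image bounding boxes per page, each box as [x0, y0, x1, y1] in PDF points."""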
    bounding_boxes = []
    pdf_file = fitz.open(stream=pdf_content, filetype="pdf")

    for page_index in range(len(pdf_file)):
        page_bbox = []
        page = pdf_file[page_index]
        image_list = page.get_images(full=True)

        for img in image_list:
            # img[7] is the image name; get_image_bbox() accepts it directly
            rect = page.get_image_bbox(img[7])
            page_bbox.append(list(rect))
        bounding_boxes.append(page_bbox)

    pdf_file.close()  # Close the PDF file after use
    return bounding_boxes


def extract_bounding_boxes_gemini(api_key, images):
    # Placeholder for Gemini API integration
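    # A possible real implementation is sketched below (assumptions: model name,
    # prompt wording, and that the model answers with a plain "[x0, y0, x1, y1]"
    # list). genai is already configured from GEMINI_API_KEY at module level, and
    # generate_content() accepts PIL images directly:
    #
    #   model = genai.GenerativeModel("gemini-1.5-flash")
    #   boxes = []
    #   for image in images:
    #       response = model.generate_content(
    #           ["Return the bounding box of the main figure as [x0, y0, x1, y1].", image]
    #       )
    #       boxes.append([ast.literal_eval(response.text.strip())])
    #   return boxes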
    bounding_boxes = [[(0, 0, 100, 100)]] * len(images)  # Dummy bounding boxes
    return bounding_boxes


def extract_bounding_box_gpt(api_key, pdf_content):
    # Placeholder for GPT-4 API integration
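    # A possible real implementation is sketched below (assumptions: model name,
    # prompt wording, response parsing, and the pre-1.0 openai client style used
    # above). Each page would be rendered with page.get_pixmap() and sent as a
    # base64 data URL:
    #
    #   png_b64 = base64.b64encode(pix.tobytes("png")).decode()
    #   response = openai.ChatCompletion.create(
    #       model="gpt-4o",
    #       messages=[{"role": "user", "content": [
    #           {"type": "text", "text": "Return the figure bounding box as [x0, y0, x1, y1]."},
    #           {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
    #       ]}],
    #   )
    #   box = ast.literal_eval(response.choices[0].message["content"].strip())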
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        page_count = len(doc)
    bounding_boxes = [[(0, 0, 100, 100)]] * page_count  # Dummy bounding boxes, one per page
    return bounding_boxes


def extract_images_and_tables(pdf_file, model_option):
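    """Extract embedded images, a crude CSV built from comma-separated text lines,
    and model-specific bounding boxes from a PDF given as a file path or raw bytes."""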
    if isinstance(pdf_file, str):
        # File path, which is what gr.File(type="filepath") passes
        with open(pdf_file, "rb") as f:
            pdf_bytes = f.read()
    elif isinstance(pdf_file, bytes):
        # Raw PDF bytes (e.g. when calling the function directly in tests)
        pdf_bytes = pdf_file
    else:
        raise TypeError("Unsupported input type for pdf_file.")

    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(BytesIO(image_bytes))
            images.append(image)

    # Crude table heuristic: if a page contains any comma-separated lines,
    # treat every non-empty line on that page as a CSV row.
    tables = []
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text = page.get_text("text")

        lines = [line.strip() for line in text.split("\n") if line.strip()]

        if any("," in line for line in lines):
            rows = [line.split(",") for line in lines]
            tables.extend(rows)

    table_content = ""
    if tables:
        max_columns = max(len(row) for row in tables)
        tables = [row + [""] * (max_columns - len(row)) for row in tables]
        df = pd.DataFrame(tables[1:], columns=tables[0])
        table_content = df.to_csv(index=False)

    pdf_document.close()

    if model_option == "PyMuPDF":
        bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
    elif model_option == "Gemini":
        bounding_boxes = extract_bounding_boxes_gemini(gemini_api_key, images)
    elif model_option == "GPT-4":
        bounding_boxes = extract_bounding_box_gpt(openai_api_key, pdf_bytes)
    else:
        bounding_boxes = []

    return images, table_content, bounding_boxes


def handle_model_selection(pdf_file, model_option):
    return extract_images_and_tables(pdf_file, model_option)


# Define the Gradio interface
interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[
        gr.File(type="filepath", label="Upload PDF"),
        gr.Dropdown(
            label="Select Model",
            choices=["PyMuPDF", "Gemini", "GPT-4"],
            value="PyMuPDF",
        ),
    ],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Textbox(label="Extracted Tables"),
        gr.JSON(label="Extracted Bounding Boxes"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

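# share=True also creates a temporary public Gradio link in addition to the local server.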
interface.launch(share=True)