# PDF Image and Table Extractor — Gradio app (Hugging Face Space).
# (Page-scrape metadata removed: Space status, file size, commit hashes,
# and the line-number gutter were residue from the hosting page, not code.)
import gradio as gr
import fitz # PyMuPDF
from PIL import Image, ImageDraw
from io import BytesIO
import pandas as pd
import os
import numpy as np
import google.generativeai as genai
import openai
import base64
import requests
import tempfile
import ast
# Read API keys from the environment and configure both SDK clients at
# import time.
# NOTE(review): there is no guard for missing keys — genai.configure() and
# openai.api_key may receive None when the env vars are unset; confirm the
# desired failure mode before deploying.
gemini_api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=gemini_api_key)
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
import tempfile
# Define the model extraction functions
def extract_bounding_box_pymupdf(pdf_content):
    """Extract per-page bounding boxes for every image embedded in a PDF.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        A list with one entry per page; each entry is a list of
        ``[x0, y0, x1, y1]`` rectangles, one per image on that page.
    """
    bounding_boxes = []
    pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
    try:
        for page in pdf_file:
            # img[7] is the image's reference name, which get_image_bbox
            # accepts. The original called page.get_images(full=True) twice
            # per page and kept unused locals; both are gone here.
            page_bbox = [
                list(page.get_image_bbox(img[7]))
                for img in page.get_images(full=True)
            ]
            bounding_boxes.append(page_bbox)
    finally:
        # Release the document even if a bbox lookup raises.
        pdf_file.close()
    return bounding_boxes
def extract_bounding_boxes_gemini(api_key, images):
    """Placeholder for Gemini-based bounding-box extraction.

    Args:
        api_key: Gemini API key (unused until the real integration lands).
        images: Sequence of extracted images, one box list is produced per
            image.

    Returns:
        One dummy ``[(0, 0, 100, 100)]`` list per image. Each inner list is
        a distinct object, so mutating one entry cannot affect the others.
    """
    # Bug fix: `[[...]] * len(images)` repeated a reference to a SINGLE
    # inner list, so appending to one page's boxes would appear on every
    # page. Build a fresh list per image instead.
    return [[(0, 0, 100, 100)] for _ in images]
def extract_bounding_box_gpt(api_key, pdf_content):
    """Placeholder for GPT-4-based bounding-box extraction.

    Args:
        api_key: OpenAI API key (unused until the real integration lands).
        pdf_content: Raw PDF bytes; used only to count pages.

    Returns:
        One dummy ``[(0, 0, 100, 100)]`` list per PDF page, each a distinct
        list object.
    """
    # Bug fix: the original opened the document inline and never closed it,
    # leaking the handle. Open, count pages, and always close.
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    try:
        page_count = len(doc)
    finally:
        doc.close()
    return [[(0, 0, 100, 100)] for _ in range(page_count)]
def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images, comma-delimited table text, and bounding boxes.

    Args:
        pdf_file: Either a filesystem path (str) or raw PDF bytes.
        model_option: ``"PyMuPDF"``, ``"Gemini"``, or ``"GPT-4"`` — selects
            the bounding-box backend; any other value yields no boxes.

    Returns:
        Tuple of (list of PIL images, CSV string of detected table rows,
        per-page bounding boxes).

    Raises:
        TypeError: If ``pdf_file`` is neither a path nor bytes.
    """
    pdf_bytes = _read_pdf_bytes(pdf_file)
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = _extract_images(pdf_document)
        table_content = _extract_table_csv(pdf_document)
    finally:
        # Close the document even if extraction raises (original closed it
        # only on the success path).
        pdf_document.close()
    if model_option == "PyMuPDF":
        bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
    elif model_option == "Gemini":
        bounding_boxes = extract_bounding_boxes_gemini(
            "your_gemini_api_key_here", images
        )
    elif model_option == "GPT-4":
        bounding_boxes = extract_bounding_box_gpt("your_gpt4_api_key_here", pdf_bytes)
    else:
        bounding_boxes = []
    return images, table_content, bounding_boxes


def _read_pdf_bytes(pdf_file):
    """Normalize a path-or-bytes input to raw PDF bytes."""
    if isinstance(pdf_file, str):
        # File path (Gradio's type="filepath" upload, or local testing).
        with open(pdf_file, "rb") as f:
            return f.read()
    if isinstance(pdf_file, bytes):
        return pdf_file
    raise TypeError("Unsupported input type for pdf_file.")


def _extract_images(pdf_document):
    """Return every embedded image in the document as a PIL Image."""
    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]  # first tuple entry is the image's xref
            image_bytes = pdf_document.extract_image(xref)["image"]
            images.append(Image.open(BytesIO(image_bytes)))
    return images


def _extract_table_csv(pdf_document):
    """Heuristically collect comma-delimited text lines as a CSV table.

    Any page containing at least one comma contributes ALL of its non-empty
    lines as rows; the first collected row becomes the header. Returns ""
    when nothing comma-delimited is found.
    """
    tables = []
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text = page.get_text("text")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if any("," in line for line in lines):
            tables.extend(line.split(",") for line in lines)
    if not tables:
        return ""
    # Pad ragged rows so pandas can build a rectangular frame.
    max_columns = max(len(row) for row in tables)
    tables = [row + [""] * (max_columns - len(row)) for row in tables]
    df = pd.DataFrame(tables[1:], columns=tables[0])
    return df.to_csv(index=False)
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: forward the upload and model choice to the extractor."""
    result = extract_images_and_tables(pdf_file, model_option)
    return result
# Define the Gradio interface
# Wires handle_model_selection into a simple upload-and-select UI.
interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[
        # type="filepath" hands the callback a path string, matching the
        # str branch of extract_images_and_tables.
        gr.File(type="filepath", label="Upload PDF"),
        gr.Dropdown(
            label="Select Model",
            choices=["PyMuPDF", "Gemini", "GPT-4"],
            value="PyMuPDF",
        ),
    ],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Textbox(label="Extracted Tables"),
        gr.JSON(label="Extracted Bounding Boxes"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)
# share=True additionally exposes a temporary public Gradio link.
interface.launch(share=True)