Spaces:
Sleeping
Sleeping
import pymupdf | |
from io import BytesIO | |
from PIL import Image | |
import pdfplumber | |
import ast | |
import google.generativeai as genai | |
from PIL import Image, ImageDraw | |
import openai | |
import requests | |
import os | |
# from constants import GEMINI_API_KEY, OPENAI_API_KEY | |
from utils import ( | |
draw_boxes, | |
pdf_to_images, | |
parse_bboxs_gemini_flash, | |
convert_pdf_to_images, | |
encode_image_to_base64, | |
) | |
def extract_images_pymupdf(pdf_file):
    """Extract the embedded images of a PDF using PyMuPDF.

    The PDF bytes are written to a fixed scratch path, the document is
    opened from that path, and every image reference listed for each page
    is extracted and wrapped in a Pillow image.

    Args:
        pdf_file: Bytes-like content of the PDF (written with mode ``"wb"``).

    Returns:
        A list of ``PIL.Image.Image`` objects in page order, or ``None`` when
        no image was found.
    """
    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    # Use the document as a context manager so its handle is released even
    # when extracting or decoding one of the images raises.
    with pymupdf.open(pdf_path) as doc:
        for page_idx, _page in enumerate(doc):
            for img in doc.get_page_images(page_idx):
                xref = img[0]  # first item of each entry is passed on as the xref
                image_bytes = doc.extract_image(xref)["image"]
                # The BytesIO buffer owns a copy of the data, so the Pillow
                # image does not depend on the document staying open.
                images.append(Image.open(BytesIO(image_bytes)))
    return images or None
def extract_images_pdfplumber(pdf_file):
    """Extract the image regions of a PDF by cropping pages with pdfplumber.

    The PDF bytes are written to a fixed scratch path. For every image entry
    pdfplumber reports on a page, the page is cropped to that entry's box,
    rendered at 400 DPI, saved as a PNG in the output directory and reopened
    with Pillow.

    Args:
        pdf_file: Bytes-like content of the PDF (written with mode ``"wb"``).

    Returns:
        A list of ``PIL.Image.Image`` objects in page order, or ``None`` when
        no image was found.
    """
    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    output_dir = "extract_tables/table_outputs"
    # Context manager guarantees the PDF handle is closed, including on error.
    with pdfplumber.open(pdf_path) as pdf_obj:
        for page_idx, page in enumerate(pdf_obj.pages):
            page_height = page.height  # invariant for every image on the page
            for image_idx, image in enumerate(page.images):
                # The vertical coordinates are flipped against the page
                # height; presumably y0/y1 are measured from the bottom edge
                # while crop() expects top-based values (confirm against the
                # pdfplumber docs).
                image_bbox = (
                    image["x0"],
                    page_height - image["y1"],
                    image["x1"],
                    page_height - image["y0"],
                )
                cropped_page = page.crop(image_bbox)
                image_obj = cropped_page.to_image(resolution=400)
                image_path = os.path.join(
                    output_dir, f"image-{page_idx + 1}-{image_idx}.png"
                )
                image_obj.save(image_path)
                images.append(Image.open(image_path))
    return images or None
def extract_images_gemini(model, pdf_file):
    """Ask a Gemini vision model for image bounding boxes on every PDF page.

    The PDF bytes are written to a fixed scratch path and converted to page
    images with ``pdf_to_images``. Each page is sent to the model together
    with a bounding-box prompt; the reply is parsed and the boxes are drawn
    onto the page via ``draw_boxes``.

    Args:
        model: Gemini model name. Only ``"gemini-pro-vision"`` and
            ``"gemini-1.5-flash-latest"`` have a response parser.
        pdf_file: Bytes-like content of the PDF (written with mode ``"wb"``).

    Returns:
        A list with one ``draw_boxes`` result per page.

    Raises:
        ValueError: If ``model`` is not one of the supported names. The
            ``ast.literal_eval`` parser may also raise ``ValueError`` or
            ``SyntaxError`` when the reply is not a Python literal.
    """
    supported_models = ("gemini-pro-vision", "gemini-1.5-flash-latest")
    # Fail fast: without a matching parser the boxes would never be assigned,
    # and the error would only surface after an API request had been spent.
    if model not in supported_models:
        raise ValueError(
            f"Unsupported Gemini model {model!r}; expected one of {supported_models}"
        )

    gemini_api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(model)
    prompt = "Extract the bounding boxes of all the images present in this page. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"

    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    pdf_images = pdf_to_images(pdf_path)
    for page in pdf_images:
        # convert() returns an independent copy, so the source handle can be
        # closed right away instead of leaking one per page.
        with Image.open(page) as raw_page:
            img = raw_page.convert("RGB")
        response = gemini_model.generate_content([img, prompt], stream=False)
        response.resolve()
        print(response.text)
        if model == "gemini-pro-vision":
            # literal_eval only accepts Python literals, so it is safe to run
            # on model output, but it raises on malformed replies.
            page_bbox = ast.literal_eval(response.text)
        else:
            page_bbox = parse_bboxs_gemini_flash(response.text)
        images.append(draw_boxes(page, page_bbox))
    return images
def extract_images_gpt(model, pdf_file):
    """Ask an OpenAI chat model for image bounding boxes on every PDF page.

    The PDF bytes are written to a fixed scratch path and converted to page
    images twice: ``convert_pdf_to_images`` supplies the objects that are
    base64-encoded for the request, ``pdf_to_images`` supplies the entries
    handed to ``draw_boxes``. Each page is posted to the chat completions
    endpoint and the returned text is parsed with ``ast.literal_eval``.

    Args:
        model: Name of the OpenAI model placed in the request payload.
        pdf_file: Bytes-like content of the PDF (written with mode ``"wb"``).

    Returns:
        A list of ``draw_boxes`` results, one per page that yielded at least
        one choice. Pages whose response carries no choices are skipped.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the model
            reply is not a Python literal.
    """
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai.api_key = openai_api_key

    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = convert_pdf_to_images(pdf_path)
    image_paths = pdf_to_images(pdf_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }

    extracted_images = []
    for page_idx, image in enumerate(images):
        base64_string = encode_image_to_base64(image)
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract bounding boxes of all the images present in this page. Return bounding boxes as liat of lists and don't provide any other text in the response.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # NOTE(review): the URL declares JPEG data;
                                # confirm this matches what
                                # encode_image_to_base64 produces.
                                "url": f"data:image/jpeg;base64,{base64_string}"
                            },
                        },
                    ],
                }
            ],
        }
        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
        )
        response_json = response.json()

        # Check for choices BEFORE reading them: an error response has no
        # "choices" key, and dereferencing it first made the guard useless.
        choices = response_json.get("choices")
        if not choices:
            print(f"No choices returned for page {page_idx + 1}: {response_json}")
            continue

        content = choices[0]["message"]["content"]
        print(content)
        # literal_eval only accepts Python literals, so it is safe to run on
        # model output, but it raises on malformed replies.
        extracted_images.append(
            draw_boxes(image_paths[page_idx], ast.literal_eval(content))
        )
    return extracted_images