# Page-header residue from the hosting site (not part of the program): "Spaces: Running"
import base64
import mimetypes
import os

import requests
import yaml
from openai import OpenAI
from PIL import Image

from general_utils import calculate_cost
# Alternative prompt (kept for reference, not active): asks the model to return
# printed and handwritten text separately in a JSON-style dictionary.
# PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""

# Instruction text sent as the "text" part of the user message alongside the image.
PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""
class GPT4oMiniOCR:
    """Send an image to OpenAI's ``gpt-4o-mini`` chat endpoint and return its OCR text.

    The request is made directly with ``requests`` against the chat-completions
    REST endpoint. Token usage reported by the API is passed to
    ``calculate_cost`` together with the path to ``api_cost/api_cost.yaml``.
    """

    def __init__(self, api_key):
        """Store the API key and work out where the cost table lives.

        Args:
            api_key: Key sent as the ``Authorization: Bearer`` token.
        """
        self.api_key = api_key
        # <parent of this file's directory>/api_cost/api_cost.yaml
        self.path_api_cost = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml'
        )

    def encode_image(self, image_path):
        """Return the contents of ``image_path`` as a base64-encoded ``str``.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _image_mime_type(image_path):
        """Guess the image MIME type from the file extension.

        Falls back to ``image/jpeg`` (the value that used to be hard-coded) when
        the extension is unknown or does not map to an ``image/*`` type.
        """
        guessed, _encoding = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return "image/jpeg"

    @staticmethod
    def _parse_response(response_json):
        """Extract ``(parsed_answer, tokens_in, tokens_out)`` from a response body.

        Error responses carry neither ``choices`` nor ``usage``; in that case the
        answer is ``None`` and both token counts are ``0`` rather than raising.
        """
        if not isinstance(response_json, dict):
            response_json = {}

        choices = response_json.get("choices") or []
        parsed_answer = choices[0]["message"]["content"] if choices else None

        usage_report = response_json.get("usage") or {}
        tokens_in = usage_report.get("prompt_tokens", 0)
        tokens_out = usage_report.get("completion_tokens", 0)
        return parsed_answer, tokens_in, tokens_out

    def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512, timeout=120):
        """Run OCR on one image and report the text plus token usage and cost.

        Args:
            image_path: Path of the image file to upload.
            resolution: Value sent as the ``detail`` field of the image part
                (the demo in this file uses ``"low"`` and ``"high"``).
            max_tokens: Value sent as ``max_tokens`` in the request payload.
            timeout: Seconds passed to ``requests.post`` so a stalled
                connection cannot hang the caller indefinitely. Pass ``None``
                to wait forever.

        Returns:
            An 8-tuple ``(parsed_answer, cost_in, cost_out, total_cost,
            rates_in, rates_out, tokens_in, tokens_out)``. ``parsed_answer`` is
            ``None`` when the response has no choices (for example an API
            error). The cost and rate values are whatever ``calculate_cost``
            returns for the reported token counts.

        Raises:
            OSError: If ``image_path`` cannot be read.
            requests.exceptions.RequestException: On connection failure or
                timeout.
        """
        base64_image = self.encode_image(image_path)
        mime_type = self._image_mime_type(image_path)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                                "detail": resolution,
                            },
                        },
                    ],
                }
            ],
            "max_tokens": max_tokens,
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )

        try:
            response_json = response.json()
        except ValueError:
            # Non-JSON body (e.g. a gateway error page): treat like any other
            # failed request so the caller gets parsed_answer=None.
            response_json = {}

        parsed_answer, tokens_in, tokens_out = self._parse_response(response_json)

        cost_in, cost_out, total_cost, rates_in, rates_out = calculate_cost(
            'GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out
        )

        return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out
def main():
    """Run the OCR demo on one sample label image at low, then high, detail.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    (PowerShell: ``$env:OPENAI_API_KEY="KEY"``); it falls back to an empty
    string when the variable is not set.
    """
    # Alternate sample path (Linux machine):
    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'

    api_key = os.environ.get("OPENAI_API_KEY", "")
    ocr = GPT4oMiniOCR(api_key)

    # Same image, both detail levels, so output and cost can be compared.
    for resolution in ("low", "high"):
        (
            parsed_answer,
            _cost_in,
            _cost_out,
            total_cost,
            _rates_in,
            _rates_out,
            _tokens_in,
            _tokens_out,
        ) = ocr.ocr_gpt4o(img_path, resolution=resolution, max_tokens=512)
        print(f"Parsed Answer: {parsed_answer}")
        print(f"Total Cost: {total_cost}")


if __name__ == '__main__':
    main()