# multimodal-ai/app.py
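# Gradio demo: sends a text prompt plus an uploaded image to OpenAI's gpt-4o
# chat completions API (the image is embedded as a base64 data URL) and
# renders the model's reply as Markdown.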
import gradio as gr
import base64, os, threading
from openai import OpenAI
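# Serializes requests: the API key is handed to the OpenAI client through a
# shared environment variable, so concurrent calls must not overwrite each
# other's key.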
lock = threading.Lock()
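# Request settings: gpt-4o, deterministic output (temperature 0), replies
# capped at 1000 tokens.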
config = {
    "max_tokens": 1000,
    "model": "gpt-4o",
    "temperature": 0
}
def get_img_b64(img_path):
    """Return the contents of the file at img_path as a base64-encoded string."""
    with open(img_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")
def invoke(openai_api_key, prompt, image):
    """Send the prompt and the image to the model and return the text of its reply."""
    if not openai_api_key:
        raise gr.Error("OpenAI API Key is required.")
    if not prompt:
        raise gr.Error("Prompt is required.")
    if not image:
        raise gr.Error("Image is required.")
    with lock:
        os.environ["OPENAI_API_KEY"] = openai_api_key
        content = ""
        try:
            client = OpenAI()  # picks up OPENAI_API_KEY from the environment
            img_b64 = get_img_b64(image)
            completion = client.chat.completions.create(
                max_tokens = config["max_tokens"],
                messages = [{"role": "user",
                             "content": [{"type": "text",
                                          "text": prompt},
                                         {"type": "image_url",
                                          "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}]}],
                model = config["model"],
                temperature = config["temperature"]
            )
            content = completion.choices[0].message.content
        except Exception as e:
            raise gr.Error(str(e))
        finally:
            # Never leave the user's key in the process environment.
            del os.environ["OPENAI_API_KEY"]
        return content
gr.close_all()
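# OUTPUT and DESCRIPTION below are read from environment variables, presumably
# defined in the Space's settings; running locally without them raises a KeyError.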
demo = gr.Interface(
    fn = invoke,
    inputs = [gr.Textbox(label = "OpenAI API Key", type = "password", lines = 1),
              gr.Textbox(label = "Prompt", lines = 1, value = "Describe the diagram"),
              gr.Image(label = "Image", type = "filepath", sources = ["upload"],
                       value = "https://raw.githubusercontent.com/bstraehle/ai-ml-dl/main/hugging-face/multimodal-ai/architecture.png")],
    outputs = [gr.Markdown(label = "Completion", value = os.environ["OUTPUT"])],
    title = "Multimodal Reasoning Application",
    description = os.environ["DESCRIPTION"]
)

demo.launch()
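# A minimal local run might look like this (sketch; the values for OUTPUT and
# DESCRIPTION are placeholders, the Space normally provides them):
#
#   OUTPUT="" DESCRIPTION="Multimodal reasoning demo" python app.py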