"""VLM Helper Functions."""

import base64

import numpy as np
from openai import OpenAI

from azure_openai_gpt4o import call_llm


def _build_user_messages(prompt_seq):
    """Convert a mixed text/image prompt sequence into chat messages.

    Each ``str`` element becomes a ``text`` content part. Each ``np.ndarray``
    element becomes an ``image_url`` content part whose URL is a
    ``data:image/jpeg;base64,...`` URI built from the array's underlying
    buffer. Elements of any other type are skipped without error.

    NOTE(review): the array buffer is base64-encoded as-is, with no image
    encoding step, so callers presumably pass arrays that already hold
    JPEG-encoded bytes -- confirm against callers.

    Args:
        prompt_seq: Iterable of ``str`` and/or ``np.ndarray`` elements, in
            the order they should appear in the prompt.

    Returns:
        A one-element list containing a single ``user`` message whose
        ``content`` is the list of converted parts.
    """
    content = []
    for elem in prompt_seq:
        if isinstance(elem, str):
            content.append({'type': 'text', 'text': elem})
        elif isinstance(elem, np.ndarray):
            base64_image_str = base64.b64encode(elem).decode('utf-8')
            image_url = f'data:image/jpeg;base64,{base64_image_str}'
            content.append(
                {'type': 'image_url', 'image_url': {'url': image_url}}
            )
    return [{'role': 'user', 'content': content}]


class GPT4V:
    """GPT4V VLM."""

    def __init__(self, openai_api_key, model='gpt-4-vision-preview'):
        """Create an OpenAI client for subsequent queries.

        Args:
            openai_api_key: API key passed to the ``OpenAI`` client.
            model: Name of the chat-completions model to query. Defaults to
                the previously hard-coded ``'gpt-4-vision-preview'``.
                NOTE(review): confirm this default model is still served
                by the API before relying on it.
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.model = model

    def query(self, prompt_seq, temperature=0, max_tokens=512):
        """Queries GPT-4V.

        Args:
            prompt_seq: Iterable of ``str`` and/or ``np.ndarray`` prompt
                elements (see ``_build_user_messages``).
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.

        Returns:
            The message content of the first choice in the response.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=_build_user_messages(prompt_seq),
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content


class GPT4Azure:
    """GPT4V VLM via Azure API."""

    def __init__(self, openai_api_key=None):
        """Dummy interface kept for signature parity with ``GPT4V``.

        The Azure API key is read from the .env file, so there is no need
        to pass it here; ``openai_api_key`` is accepted and ignored.
        """

    def query(self, prompt_seq, temperature=0, max_tokens=512):
        """Queries the Azure-hosted model through ``call_llm``.

        Args:
            prompt_seq: Iterable of ``str`` and/or ``np.ndarray`` prompt
                elements (see ``_build_user_messages``).
            temperature: Sampling temperature forwarded to ``call_llm``.
            max_tokens: Completion token limit forwarded to ``call_llm``.

        Returns:
            Whatever ``call_llm`` returns, unmodified.
            NOTE(review): presumably the completion text -- confirm
            against ``azure_openai_gpt4o.call_llm``.
        """
        # azure_deployment_model=None is passed through unchanged from the
        # original call; how call_llm interprets None is not visible here.
        return call_llm(
            _build_user_messages(prompt_seq),
            azure_deployment_model=None,
            max_tokens=max_tokens,
            temperature=temperature,
        )