"""VLM Helper Functions.""" import base64 import numpy as np from openai import OpenAI class GPT4V: """GPT4V VLM.""" def __init__(self, openai_api_key): self.client = OpenAI(api_key=openai_api_key) def query(self, prompt_seq, temperature=0, max_tokens=512): """Queries GPT-4V.""" content = [] for elem in prompt_seq: if isinstance(elem, str): content.append({'type': 'text', 'text': elem}) elif isinstance(elem, np.ndarray): base64_image_str = base64.b64encode(elem).decode('utf-8') image_url = f'data:image/jpeg;base64,{base64_image_str}' content.append({'type': 'image_url', 'image_url': {'url': image_url}}) messages = [{'role': 'user', 'content': content}] response = self.client.chat.completions.create( model='gpt-4-vision-preview', messages=messages, temperature=temperature, max_tokens=max_tokens ) return response.choices[0].message.content