import base64

import gradio as gr
from anthropic import Anthropic
from openai import OpenAI


def create_image_content(image, media_type, detail="low"):
    """Build an OpenAI-style image content block from a base64-encoded image."""
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{media_type};base64,{image}", "detail": detail},
    }


def image_to_base64(image_path):
    """Convert the image to base64."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def get_media_type(image_name):
    """Get the media type of the uploaded image based on its file extension."""
    name = image_name.lower()
    if name.endswith((".jpg", ".jpeg")):
        return "image/jpeg"
    elif name.endswith(".png"):
        return "image/png"
    # Extend this function based on the image formats you expect to handle.
    return None


def set_system_message(sysmsg):
    """Wrap a system prompt in the OpenAI chat message format."""
    return [{"role": "system", "content": sysmsg}]


def describe_image(image_path, claude_api_key, openai_api_key, model, prompt):
    """Send the image to the selected model and return its description."""
    try:
        media_type = get_media_type(image_path)
        if media_type is None:
            return "Unsupported image format: only .jpg/.jpeg and .png are handled."

        if model.startswith("claude"):
            # Anthropic Claude models take the image inline as a base64 source block.
            if not claude_api_key:
                return "Claude API key is required for Claude models."
            client = Anthropic(api_key=claude_api_key)
            message = client.messages.create(
                model=model,
                max_tokens=1024,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": media_type,
                                    "data": image_to_base64(image_path),
                                },
                            },
                            {"type": "text", "text": prompt},
                        ],
                    }
                ],
            )
            return message.content[0].text
        elif model in ("gpt-4-vision Low", "gpt-4-vision High"):
            # OpenAI GPT-4 Vision takes the image as a data URL; "detail" controls
            # how much of the token budget is spent analyzing the image.
            if not openai_api_key:
                return "OpenAI API key is required for GPT-4 Vision."
            client = OpenAI(api_key=openai_api_key)
            processed_image = image_to_base64(image_path)
            detail = "low" if model == "gpt-4-vision Low" else "high"
            image_content = create_image_content(processed_image, media_type, detail)
            system_message = set_system_message("You are GPT-4.")
            response = client.chat.completions.create(
                model="gpt-4-vision-preview",
                messages=system_message
                + [
                    {"role": "user", "content": [image_content]},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=1024,
            )
            return response.choices[0].message.content
        return f"Unknown model: {model}"
    except Exception as e:
        return f"Error: {str(e)}"


def main(image_path, claude_api_key, openai_api_key, model_a, model_b, prompt):
    if claude_api_key or openai_api_key:
        description_a = describe_image(image_path, claude_api_key, openai_api_key, model_a, prompt)
        description_b = describe_image(image_path, claude_api_key, openai_api_key, model_b, prompt)
        return description_a, description_b
    return "Please enter a valid API key.", "Please enter a valid API key."
model_options = [
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "gpt-4-vision Low",
    "gpt-4-vision High",
]

with gr.Blocks() as iface:
    gr.Markdown("# Image Description with Claude Models and GPT-4 Vision")
    gr.Markdown("Drag and drop an image to get descriptions from different models.")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
            claude_api_key_input = gr.Textbox(type="password", label="Enter your Claude API Key")
            openai_api_key_input = gr.Textbox(type="password", label="Enter your OpenAI API Key")
        with gr.Column():
            model_a_dropdown = gr.Dropdown(choices=model_options, label="Model A")
            model_b_dropdown = gr.Dropdown(choices=model_options, label="Model B")
    with gr.Row():
        output_a = gr.Textbox(label="Description from Model A")
        output_b = gr.Textbox(label="Description from Model B")
    prompt_input = gr.Textbox(
        label="Custom Prompt",
        value=(
            "As an AI image tagging expert, please provide precise tags for these images "
            "to enhance the CLIP model's understanding of the content. Employ succinct "
            "keywords or phrases, steering clear of elaborate sentences and extraneous "
            "conjunctions. Prioritize the tags by relevance. Your tags should capture key "
            "elements such as the main subject, setting, artistic style, composition, "
            "image quality, color tone, filter, and camera specifications, and any other "
            "tags crucial for the image. When tagging photos of people, include specific "
            "details like gender, nationality, attire, actions, pose, expressions, "
            "accessories, makeup, composition type, age, etc. For other image categories, "
            "apply appropriate and common descriptive tags as well. Recognize and tag any "
            "celebrities, well-known landmarks, or IPs if clearly featured in the image. "
            "Your tags should be accurate, non-duplicative, and within a 20-75 word count "
            "range. These tags will be used for image re-creation, so the closer the "
            "resemblance to the original image, the better the tag quality. Tags should "
            "be comma-separated. Exceptional tagging will be rewarded with $10 per image."
        ),
    )
    run_button = gr.Button("Run")
    # main's signature matches the input order, so it can be passed directly;
    # the wrapping lambda was redundant.
    run_button.click(
        fn=main,
        inputs=[
            image_input,
            claude_api_key_input,
            openai_api_key_input,
            model_a_dropdown,
            model_b_dropdown,
            prompt_input,
        ],
        outputs=[output_a, output_b],
    )

if __name__ == "__main__":
    iface.launch()
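# Usage note (a sketch, not part of the original script): run this file with
# Python after installing the assumed dependencies
# (`pip install gradio anthropic openai`) and open the printed local URL in a
# browser. If you want a temporary public URL instead, Gradio's launch()
# accepts share=True:
#
#     iface.launch(share=True)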