markllego and ychen committed
Commit
6d09e4d
1 Parent(s): 71ff575

UI & other improvements (#1)


- Add custom options (7d0ff4471d4cb45c4514a47ac84250a9515dbf02)


Co-authored-by: ychen <ychen@users.noreply.huggingface.co>

Files changed (2)
  1. .gitignore +2 -0
  2. app.py +59 -24
.gitignore ADDED
@@ -0,0 +1,2 @@
+venv/
+flagged/
app.py CHANGED
@@ -2,29 +2,31 @@
 import gradio as gr
 import openai
 import base64
-from PIL import Image
 import io
 import requests
-import os
 
-# Consider using environment variables or a configuration file for API keys.
-# WARNING: Do not hardcode API keys in your code, especially if sharing or using version control.
-openai.api_key = os.getenv('OPENAI_API_KEY')
-if openai.api_key is None:
-    raise ValueError("Please set the OPENAI_API_KEY environment variable.")
 
 # Function to encode the image to base64
 def encode_image_to_base64(image):
     buffered = io.BytesIO()
     image.save(buffered, format="JPEG")
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
     return img_str
 
+
 # Function to send the image to the OpenAI API and get a response
-def ask_openai_with_image(image):
+def ask_openai_with_image(api_key, instruction, json_prompt, low_quality_mode, image):
+    # Set the OpenAI API key
+    openai.api_key = api_key
+
     # Encode the uploaded image to base64
     base64_image = encode_image_to_base64(image)
-
+
+    instruction = instruction.strip()
+
+    if json_prompt.strip() != "":
+        instruction = f"{instruction}\n\nReturn in JSON format and include the following attributes:\n\n{json_prompt.strip()}"
+
     # Create the payload with the base64 encoded image
     payload = {
         "model": "gpt-4-vision-preview",
@@ -34,25 +36,28 @@ def ask_openai_with_image(image):
                 "content": [
                     {
                         "type": "text",
-                        "text": "I've uploaded an image and I'd like to know what it depicts and any interesting details you can provide."
+                        "text": instruction,
                     },
                     {
                         "type": "image_url",
-                        "image_url": f"data:image/jpeg;base64,{base64_image}"
-                    }
-                ]
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "detail": "low" if low_quality_mode else "high",
+                        },
+                    },
+                ],
             }
         ],
-        "max_tokens": 4095
+        "max_tokens": 4095,
     }
-
+
     # Send the request to the OpenAI API
     response = requests.post(
         "https://api.openai.com/v1/chat/completions",
         headers={"Authorization": f"Bearer {openai.api_key}"},
-        json=payload
+        json=payload,
     )
-
+
     # Check if the request was successful
     if response.status_code == 200:
        response_json = response.json()
@@ -69,14 +74,44 @@ def ask_openai_with_image(image):
        # If an error occurred, return the error message
        return f"Error: {response.text}"
 
+
+json_schema = gr.Textbox(
+    label="JSON Attributes",
+    info="Define a list of attributes to force the model to respond in valid json format. Leave blank to disable json formatting.",
+    lines=3,
+    placeholder="""Example:
+- name: Name of the object
+- color: Color of the object
+""",
+)
+
+instructions = gr.Textbox(
+    label="Instructions",
+    info="Instructions for the vision model to follow. Leave blank to use default.",
+    lines=2,
+    placeholder="""Default:
+I've uploaded an image and I'd like to know what it depicts and any interesting details you can provide.""",
+)
+
+low_quality_mode = gr.Checkbox(
+    label="Low Quality Mode",
+    info="See here: https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding.",
+)
+
 # Create a Gradio interface
-iface = gr.Interface(
+vision_playground = gr.Interface(
     fn=ask_openai_with_image,
-    inputs=gr.Image(type="pil"),
-    outputs="text",
-    title="GPT-4 with Vision",
-    description="Upload an image and get a description from GPT-4 with Vision."
+    inputs=[
+        gr.Textbox(label="API Key"),
+        instructions,
+        json_schema,
+        low_quality_mode,
+        gr.Image(type="pil", label="Image"),
+    ],
+    outputs=[gr.Markdown()],
+    title="GPT-4-Vision Playground",
+    description="Upload an image and get a description from GPT-4 with Vision.",
 )
 
 # Launch the app
-iface.launch()
+vision_playground.launch()
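
For reference, a minimal standalone sketch of the request the updated app.py now sends to the vision endpoint; the image path and API key below are placeholders and are not part of this commit:

import base64
import requests

# Encode a local image the same way encode_image_to_base64 does in app.py.
with open("example.jpg", "rb") as f:  # placeholder image path
    base64_image = base64.b64encode(f.read()).decode("utf-8")

# Mirror the payload built in ask_openai_with_image: the instruction text plus
# an image_url object whose "detail" field is "low" when Low Quality Mode is
# checked and "high" otherwise.
payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "low",
                    },
                },
            ],
        }
    ],
    "max_tokens": 4095,
}

response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_OPENAI_API_KEY"},  # placeholder key
    json=payload,
)
print(response.json())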