Spaces:

Roboflow
/

SoM

Sleeping

App Files Community

SkalskiP committed on Nov 21, 2023

Commit

0757835

•

1 Parent(s): 0e39589

Update README file and code refactoring

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +25 -17
gpt4v.py +6 -6

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: SoM
-emoji: 👁
 colorFrom: pink
 colorTo: yellow
 sdk: docker

 ---
+title: Set of Marks
+emoji: ✅
 colorFrom: pink
 colorTo: yellow
 sdk: docker

app.py CHANGED Viewed

@@ -9,10 +9,10 @@ import supervision as sv
 from typing import List
 from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
 from utils import postprocess_masks, Visualizer
 HOME = os.getenv("HOME")
 DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-MINIMUM_AREA_THRESHOLD = 0.01
 SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
 # SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
@@ -27,13 +27,6 @@ MARKDOWN = """
     Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
 </h1>
-## 🚀 How To
-- Upload an image.
-- Click the `Run` button to generate the image with marks.
-- Pass OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
-- Ask GPT-4V questions about the image in the chatbot.
 ## 🚧 Roadmap
 - [ ] Support for alphabetic labels
@@ -55,8 +48,7 @@ def inference(
     result = mask_generator.generate(image=image)
     detections = sv.Detections.from_sam(result)
     detections = postprocess_masks(
-        detections=detections,
-        area_threshold=MINIMUM_AREA_THRESHOLD)
     bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
     annotated_image = visualizer.visualize(
         image=bgr_image,
@@ -68,8 +60,16 @@ def inference(
     return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
-def prompt(message, history):
-    return "response"
 image_input = gr.Image(
@@ -89,8 +89,10 @@ image_output = gr.Image(
     label="SoM Visual Prompt",
     type="numpy",
     height=512)
-textbox_api_key = gr.Textbox(
-    label="OpenAI API KEY",
     type="password")
 chatbot = gr.Chatbot(
     label="GPT-4V + SoM",
@@ -102,7 +104,9 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             image_input.render()
-            with gr.Accordion(label="Detailed prompt settings (e.g., mark type)", open=False):
                 with gr.Row():
                     checkbox_annotation_mode.render()
                 with gr.Row():
@@ -110,9 +114,13 @@ with gr.Blocks() as demo:
         with gr.Column():
             image_output.render()
             run_button.render()
-    textbox_api_key.render()
     with gr.Row():
-        gr.ChatInterface(chatbot=chatbot, fn=prompt)
     run_button.click(
         fn=inference,

 from typing import List
 from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
 from utils import postprocess_masks, Visualizer
+from gpt4v import prompt_image
 HOME = os.getenv("HOME")
 DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
 # SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
     Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
 </h1>
 ## 🚧 Roadmap
 - [ ] Support for alphabetic labels
     result = mask_generator.generate(image=image)
     detections = sv.Detections.from_sam(result)
     detections = postprocess_masks(
+        detections=detections)
     bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
     annotated_image = visualizer.visualize(
         image=bgr_image,
     return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
+def prompt(message, history, image: np.ndarray, api_key: str) -> str:
+    if api_key == "":
+        return "⚠️ Please set your OpenAI API key first"
+    if image is None:
+        return "⚠️ Please generate SoM visual prompt first"
+    return prompt_image(
+        api_key=api_key,
+        image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
+        prompt=message
+    )
 image_input = gr.Image(
     label="SoM Visual Prompt",
     type="numpy",
     height=512)
+openai_api_key = gr.Textbox(
+    show_label=False,
+    placeholder="Before you start chatting, set your OpenAI API key here",
+    lines=1,
     type="password")
 chatbot = gr.Chatbot(
     label="GPT-4V + SoM",
     with gr.Row():
         with gr.Column():
             image_input.render()
+            with gr.Accordion(
+                    label="Detailed prompt settings (e.g., mark type)",
+                    open=False):
                 with gr.Row():
                     checkbox_annotation_mode.render()
                 with gr.Row():
         with gr.Column():
             image_output.render()
             run_button.render()
     with gr.Row():
+        openai_api_key.render()
+    with gr.Row():
+        gr.ChatInterface(
+            chatbot=chatbot,
+            fn=prompt,
+            additional_inputs=[image_output, openai_api_key])
     run_button.click(
         fn=inference,

gpt4v.py CHANGED Viewed

@@ -42,15 +42,15 @@ def compose_payload(image: np.ndarray, prompt: str) -> dict:
     return {
         "model": "gpt-4-vision-preview",
         "messages": [
             {
                 "role": "user",
                 "content": [
-                    {
-                        "role": "system",
-                        "content": [
-                            META_PROMPT
-                        ]
-                    },
                     {
                         "type": "text",
                         "text": prompt

     return {
         "model": "gpt-4-vision-preview",
         "messages": [
+            {
+                "role": "system",
+                "content": [
+                    META_PROMPT
+                ]
+            },
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "text",
                         "text": prompt