Update README file and code refactoring
Browse files
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: pink
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
|
|
1 |
---
|
2 |
+
title: Set of Marks
|
3 |
+
emoji: ✅
|
4 |
colorFrom: pink
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
app.py
CHANGED
@@ -9,10 +9,10 @@ import supervision as sv
|
|
9 |
from typing import List
|
10 |
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
|
11 |
from utils import postprocess_masks, Visualizer
|
|
|
12 |
|
13 |
HOME = os.getenv("HOME")
|
14 |
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
|
15 |
-
MINIMUM_AREA_THRESHOLD = 0.01
|
16 |
|
17 |
SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
|
18 |
# SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
|
@@ -27,13 +27,6 @@ MARKDOWN = """
|
|
27 |
Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
|
28 |
</h1>
|
29 |
|
30 |
-
## 🚀 How To
|
31 |
-
|
32 |
-
- Upload an image.
|
33 |
-
- Click the `Run` button to generate the image with marks.
|
34 |
-
- Pass OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
|
35 |
-
- Ask GPT-4V questions about the image in the chatbot.
|
36 |
-
|
37 |
## 🚧 Roadmap
|
38 |
|
39 |
- [ ] Support for alphabetic labels
|
@@ -55,8 +48,7 @@ def inference(
|
|
55 |
result = mask_generator.generate(image=image)
|
56 |
detections = sv.Detections.from_sam(result)
|
57 |
detections = postprocess_masks(
|
58 |
-
detections=detections
|
59 |
-
area_threshold=MINIMUM_AREA_THRESHOLD)
|
60 |
bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
|
61 |
annotated_image = visualizer.visualize(
|
62 |
image=bgr_image,
|
@@ -68,8 +60,16 @@ def inference(
|
|
68 |
return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
|
69 |
|
70 |
|
71 |
-
def prompt(message, history):
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
|
75 |
image_input = gr.Image(
|
@@ -89,8 +89,10 @@ image_output = gr.Image(
|
|
89 |
label="SoM Visual Prompt",
|
90 |
type="numpy",
|
91 |
height=512)
|
92 |
-
|
93 |
-
|
|
|
|
|
94 |
type="password")
|
95 |
chatbot = gr.Chatbot(
|
96 |
label="GPT-4V + SoM",
|
@@ -102,7 +104,9 @@ with gr.Blocks() as demo:
|
|
102 |
with gr.Row():
|
103 |
with gr.Column():
|
104 |
image_input.render()
|
105 |
-
with gr.Accordion(
|
|
|
|
|
106 |
with gr.Row():
|
107 |
checkbox_annotation_mode.render()
|
108 |
with gr.Row():
|
@@ -110,9 +114,13 @@ with gr.Blocks() as demo:
|
|
110 |
with gr.Column():
|
111 |
image_output.render()
|
112 |
run_button.render()
|
113 |
-
textbox_api_key.render()
|
114 |
with gr.Row():
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
run_button.click(
|
118 |
fn=inference,
|
|
|
9 |
from typing import List
|
10 |
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
|
11 |
from utils import postprocess_masks, Visualizer
|
12 |
+
from gpt4v import prompt_image
|
13 |
|
14 |
HOME = os.getenv("HOME")
|
15 |
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
|
|
|
16 |
|
17 |
SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
|
18 |
# SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
|
|
|
27 |
Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
|
28 |
</h1>
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
## 🚧 Roadmap
|
31 |
|
32 |
- [ ] Support for alphabetic labels
|
|
|
48 |
result = mask_generator.generate(image=image)
|
49 |
detections = sv.Detections.from_sam(result)
|
50 |
detections = postprocess_masks(
|
51 |
+
detections=detections)
|
|
|
52 |
bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
|
53 |
annotated_image = visualizer.visualize(
|
54 |
image=bgr_image,
|
|
|
60 |
return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
|
61 |
|
62 |
|
63 |
+
def prompt(message, history, image: np.ndarray, api_key: str) -> str:
|
64 |
+
if api_key == "":
|
65 |
+
return "⚠️ Please set your OpenAI API key first"
|
66 |
+
if image is None:
|
67 |
+
return "⚠️ Please generate SoM visual prompt first"
|
68 |
+
return prompt_image(
|
69 |
+
api_key=api_key,
|
70 |
+
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
|
71 |
+
prompt=message
|
72 |
+
)
|
73 |
|
74 |
|
75 |
image_input = gr.Image(
|
|
|
89 |
label="SoM Visual Prompt",
|
90 |
type="numpy",
|
91 |
height=512)
|
92 |
+
openai_api_key = gr.Textbox(
|
93 |
+
show_label=False,
|
94 |
+
placeholder="Before you start chatting, set your OpenAI API key here",
|
95 |
+
lines=1,
|
96 |
type="password")
|
97 |
chatbot = gr.Chatbot(
|
98 |
label="GPT-4V + SoM",
|
|
|
104 |
with gr.Row():
|
105 |
with gr.Column():
|
106 |
image_input.render()
|
107 |
+
with gr.Accordion(
|
108 |
+
label="Detailed prompt settings (e.g., mark type)",
|
109 |
+
open=False):
|
110 |
with gr.Row():
|
111 |
checkbox_annotation_mode.render()
|
112 |
with gr.Row():
|
|
|
114 |
with gr.Column():
|
115 |
image_output.render()
|
116 |
run_button.render()
|
|
|
117 |
with gr.Row():
|
118 |
+
openai_api_key.render()
|
119 |
+
with gr.Row():
|
120 |
+
gr.ChatInterface(
|
121 |
+
chatbot=chatbot,
|
122 |
+
fn=prompt,
|
123 |
+
additional_inputs=[image_output, openai_api_key])
|
124 |
|
125 |
run_button.click(
|
126 |
fn=inference,
|
gpt4v.py
CHANGED
@@ -42,15 +42,15 @@ def compose_payload(image: np.ndarray, prompt: str) -> dict:
|
|
42 |
return {
|
43 |
"model": "gpt-4-vision-preview",
|
44 |
"messages": [
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
{
|
46 |
"role": "user",
|
47 |
"content": [
|
48 |
-
{
|
49 |
-
"role": "system",
|
50 |
-
"content": [
|
51 |
-
META_PROMPT
|
52 |
-
]
|
53 |
-
},
|
54 |
{
|
55 |
"type": "text",
|
56 |
"text": prompt
|
|
|
42 |
return {
|
43 |
"model": "gpt-4-vision-preview",
|
44 |
"messages": [
|
45 |
+
{
|
46 |
+
"role": "system",
|
47 |
+
"content": [
|
48 |
+
META_PROMPT
|
49 |
+
]
|
50 |
+
},
|
51 |
{
|
52 |
"role": "user",
|
53 |
"content": [
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
{
|
55 |
"type": "text",
|
56 |
"text": prompt
|