SkalskiP committed
Commit 0757835
1 Parent(s): 0e39589

Update README file and code refactoring
Files changed (3)
  1. README.md +2 -2
  2. app.py +25 -17
  3. gpt4v.py +6 -6
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: SoM
-emoji: 👁
+title: Set of Marks
+emoji:
 colorFrom: pink
 colorTo: yellow
 sdk: docker
app.py CHANGED
@@ -9,10 +9,10 @@ import supervision as sv
 from typing import List
 from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
 from utils import postprocess_masks, Visualizer
+from gpt4v import prompt_image
 
 HOME = os.getenv("HOME")
 DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-MINIMUM_AREA_THRESHOLD = 0.01
 
 SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
 # SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
@@ -27,13 +27,6 @@ MARKDOWN = """
 Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
 </h1>
 
-## 🚀 How To
-
-- Upload an image.
-- Click the `Run` button to generate the image with marks.
-- Pass OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
-- Ask GPT-4V questions about the image in the chatbot.
-
 ## 🚧 Roadmap
 
 - [ ] Support for alphabetic labels
@@ -55,8 +48,7 @@ def inference(
     result = mask_generator.generate(image=image)
     detections = sv.Detections.from_sam(result)
     detections = postprocess_masks(
-        detections=detections,
-        area_threshold=MINIMUM_AREA_THRESHOLD)
+        detections=detections)
     bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
     annotated_image = visualizer.visualize(
         image=bgr_image,
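The dropped `area_threshold` argument, together with the `MINIMUM_AREA_THRESHOLD` constant removed in the first hunk, suggests the threshold now lives as a default inside `postprocess_masks`. `utils.py` is not part of this commit, so the following is only a sketch of what the refactored helper might look like; the default value and the filtering logic are assumptions.

```python
import supervision as sv


def postprocess_masks(
    detections: sv.Detections,
    area_threshold: float = 0.01  # assumed default; was MINIMUM_AREA_THRESHOLD
) -> sv.Detections:
    # Hypothetical reconstruction: drop masks whose area is below
    # `area_threshold` of the full image. The real utils.py is not shown here.
    if len(detections) == 0 or detections.mask is None:
        return detections
    image_area = detections.mask[0].size           # H * W
    mask_areas = detections.mask.sum(axis=(1, 2))  # pixels per mask
    return detections[mask_areas / image_area > area_threshold]
```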
@@ -68,8 +60,16 @@
     return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
 
 
-def prompt(message, history):
-    return "response"
+def prompt(message, history, image: np.ndarray, api_key: str) -> str:
+    if api_key == "":
+        return "⚠️ Please set your OpenAI API key first"
+    if image is None:
+        return "⚠️ Please generate SoM visual prompt first"
+    return prompt_image(
+        api_key=api_key,
+        image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
+        prompt=message
+    )
 
 
 image_input = gr.Image(
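`prompt_image` is the new entry point imported from `gpt4v.py` at the top of the file. Only `compose_payload` from that module appears in this commit, so the sketch below is a hedged guess at how `prompt_image` plausibly wraps it: build the payload, POST it to OpenAI's chat completions endpoint, and return the first choice's text. The endpoint URL and response handling follow standard OpenAI API usage and are not code from the commit.

```python
import numpy as np
import requests

API_URL = "https://api.openai.com/v1/chat/completions"


def prompt_image(api_key: str, image: np.ndarray, prompt: str) -> str:
    # Hypothetical wrapper around compose_payload (defined in gpt4v.py,
    # see the hunk below); only its signature is confirmed by this diff.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = compose_payload(image=image, prompt=prompt)
    response = requests.post(API_URL, headers=headers, json=payload).json()
    if "error" in response:
        raise ValueError(response["error"]["message"])
    return response["choices"][0]["message"]["content"]
```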
@@ -89,8 +89,10 @@ image_output = gr.Image(
     label="SoM Visual Prompt",
     type="numpy",
     height=512)
-textbox_api_key = gr.Textbox(
-    label="OpenAI API KEY",
+openai_api_key = gr.Textbox(
+    show_label=False,
+    placeholder="Before you start chatting, set your OpenAI API key here",
+    lines=1,
     type="password")
 chatbot = gr.Chatbot(
     label="GPT-4V + SoM",
@@ -102,7 +104,9 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             image_input.render()
-            with gr.Accordion(label="Detailed prompt settings (e.g., mark type)", open=False):
+            with gr.Accordion(
+                    label="Detailed prompt settings (e.g., mark type)",
+                    open=False):
                 with gr.Row():
                     checkbox_annotation_mode.render()
                 with gr.Row():
@@ -110,9 +114,13 @@
         with gr.Column():
             image_output.render()
             run_button.render()
-            textbox_api_key.render()
     with gr.Row():
-        gr.ChatInterface(chatbot=chatbot, fn=prompt)
+        openai_api_key.render()
+    with gr.Row():
+        gr.ChatInterface(
+            chatbot=chatbot,
+            fn=prompt,
+            additional_inputs=[image_output, openai_api_key])
 
 run_button.click(
     fn=inference,
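The wiring change above explains the new `prompt` signature: `gr.ChatInterface` passes the current values of any `additional_inputs` components as extra positional arguments after the usual `(message, history)` pair, which is how `image_output` and `openai_api_key` reach the callback. A minimal standalone illustration of the same mechanism (the names here are invented for the example):

```python
import gradio as gr


def reply(message, history, suffix):
    # `suffix` is the live value of the Textbox registered below via
    # `additional_inputs`; Gradio appends it after (message, history).
    return f"{message} {suffix}"


suffix_box = gr.Textbox(value="-- sent from the demo", label="Suffix")
demo = gr.ChatInterface(fn=reply, additional_inputs=[suffix_box])

if __name__ == "__main__":
    demo.launch()
```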
gpt4v.py CHANGED
@@ -42,15 +42,15 @@ def compose_payload(image: np.ndarray, prompt: str) -> dict:
     return {
         "model": "gpt-4-vision-preview",
         "messages": [
+            {
+                "role": "system",
+                "content": [
+                    META_PROMPT
+                ]
+            },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "role": "system",
-                        "content": [
-                            META_PROMPT
-                        ]
-                    },
                     {
                         "type": "text",
                         "text": prompt
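This hunk is a bug fix rather than a pure refactor: the system message had been nested inside the user message's `content` array, where the Chat Completions API accepts only content parts such as `text` and `image_url` objects, not role-tagged messages. After the change the payload matches the structure the API expects. Roughly, with values abbreviated (the `image_url` part is presumably appended elsewhere in `compose_payload` and is not shown in this diff):

```python
# Shape of the corrected payload; "..." stands for abbreviated values.
payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {"role": "system", "content": ["..."]},  # META_PROMPT
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "..."},  # the user's question
                # presumably an image part follows, e.g.:
                # {"type": "image_url",
                #  "image_url": {"url": "data:image/jpeg;base64,..."}},
            ],
        },
    ],
}
```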