maxiw committed on
Commit
0da7bd3
1 Parent(s): bbaff07

Update app.py

Files changed (1)
  1. app.py +4 -3
app.py CHANGED
@@ -52,7 +52,7 @@ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scal
 
 
 @spaces.GPU
-def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     model = models[model_id].eval()
     processor = processors[model_id]
 
@@ -61,7 +61,7 @@ def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
             "role": "user",
             "content": [
                 {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
-                {"type": "text", "text": "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."},
+                {"type": "text", "text": system_prompt},
                 {"type": "text", "text": f"detect {text_input}"},
             ],
         }
@@ -109,6 +109,7 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             input_img = gr.Image(label="Input Picture", type="pil")
             model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")
             text_input = gr.Textbox(label="Description of Localization Target")
             submit_btn = gr.Button(value="Submit")
         with gr.Column():
@@ -116,6 +117,6 @@ with gr.Blocks(css=css) as demo:
             parsed_boxes = gr.Textbox(label="Parsed Boxes")
             annotated_image = gr.Image(label="Annotated Picture")
 
-    submit_btn.click(run_example, [input_img, text_input, model_selector], [model_output_text, parsed_boxes, annotated_image])
+    submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector], [model_output_text, parsed_boxes, annotated_image])
 
 demo.launch(debug=True)
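
For context, a minimal sketch of the wiring this commit introduces: the system prompt moves out of a hard-coded message and into a gr.Textbox whose value Gradio forwards to run_example as an extra positional argument. Everything below is illustrative, not the real app.py: run_example is a stub, and the model_selector dropdown, the models/processors dicts, the Qwen2-VL call, and the css are omitted.

import gradio as gr

DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful assistant that detects objects in images. "
    "Return bounding boxes as [xmin, ymin, xmax, ymax] scaled to 1000x1000 pixels; "
    "for multiple matches, return a list of boxes."
)

def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
    # Stand-in for the real Qwen2-VL inference: only shows how the prompt is assembled
    # from the editable system prompt plus the "detect ..." instruction.
    prompt = f"{system_prompt}\ndetect {text_input}"
    return prompt, "[]", image

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_img = gr.Image(label="Input Picture", type="pil")
            system_prompt = gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT)
            text_input = gr.Textbox(label="Description of Localization Target")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Model Output")
            parsed_boxes = gr.Textbox(label="Parsed Boxes")
            annotated_image = gr.Image(label="Annotated Picture")

    # Gradio passes component values to run_example in the order listed here,
    # so the new Textbox simply becomes the third positional argument.
    submit_btn.click(
        run_example,
        [input_img, text_input, system_prompt],
        [model_output_text, parsed_boxes, annotated_image],
    )

if __name__ == "__main__":
    demo.launch()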