gokaygokay and leoxiaobin committed
Commit 4d69588 (1 parent: beec895)

add caption with grounding tasks (#3)


- add caption + grounding tasks (2ad8ae101342d52f346e30112c7db5242dbf976f)
- add radio for single task and cascaded task (15ad26fe99cc2270c1402008f664d759dfdbfc3b)


Co-authored-by: Bin Xiao <leoxiaobin@users.noreply.huggingface.co>

Files changed (1)
  1. app.py (+53 −8)
app.py CHANGED
@@ -135,6 +135,33 @@ def process_image(image, task_prompt, text_input=None, model_id='microsoft/Florence-2-large'):
         task_prompt = '<MORE_DETAILED_CAPTION>'
         results = run_example(task_prompt, image, model_id=model_id)
         return results, None
+    elif task_prompt == 'Caption + Grounding':
+        task_prompt = '<CAPTION>'
+        results = run_example(task_prompt, image, model_id=model_id)
+        text_input = results[task_prompt]
+        task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
+        results = run_example(task_prompt, image, text_input, model_id)
+        results['<CAPTION>'] = text_input
+        fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
+        return results, fig_to_pil(fig)
+    elif task_prompt == 'Detailed Caption + Grounding':
+        task_prompt = '<DETAILED_CAPTION>'
+        results = run_example(task_prompt, image, model_id=model_id)
+        text_input = results[task_prompt]
+        task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
+        results = run_example(task_prompt, image, text_input, model_id)
+        results['<DETAILED_CAPTION>'] = text_input
+        fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
+        return results, fig_to_pil(fig)
+    elif task_prompt == 'More Detailed Caption + Grounding':
+        task_prompt = '<MORE_DETAILED_CAPTION>'
+        results = run_example(task_prompt, image, model_id=model_id)
+        text_input = results[task_prompt]
+        task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
+        results = run_example(task_prompt, image, text_input, model_id)
+        results['<MORE_DETAILED_CAPTION>'] = text_input
+        fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
+        return results, fig_to_pil(fig)
     elif task_prompt == 'Object Detection':
         task_prompt = '<OD>'
         results = run_example(task_prompt, image, model_id=model_id)
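Note on the hunk above: each cascaded branch runs the chosen captioning prompt, feeds the resulting caption back in as the text input for `<CAPTION_TO_PHRASE_GROUNDING>`, and keeps the intermediate caption in the result dict so both appear in the JSON output. The three branches differ only in the caption token, so they could be collapsed into one helper. A sketch built on the `run_example`, `plot_bbox`, and `fig_to_pil` helpers already in app.py; the helper itself is hypothetical, not part of this commit:

```python
# Hypothetical refactor, not part of this commit: one helper for all
# three 'X + Grounding' branches, using app.py's existing helpers.
def caption_then_ground(image, caption_task, model_id):
    # Stage 1: caption at the requested detail level,
    # e.g. '<CAPTION>', '<DETAILED_CAPTION>', '<MORE_DETAILED_CAPTION>'.
    caption = run_example(caption_task, image, model_id=model_id)[caption_task]
    # Stage 2: ground each phrase of that caption to boxes in the image.
    results = run_example('<CAPTION_TO_PHRASE_GROUNDING>', image, caption, model_id)
    results[caption_task] = caption  # keep the intermediate caption too
    fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
    return results, fig_to_pil(fig)
```

Each `elif` above would then reduce to a single call, e.g. `return caption_then_ground(image, '<DETAILED_CAPTION>', model_id)`.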
 
@@ -202,6 +229,28 @@ css = """
 }
 """
 
+
+single_task_list = [
+    'Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection',
+    'Dense Region Caption', 'Region Proposal', 'Caption to Phrase Grounding',
+    'Referring Expression Segmentation', 'Region to Segmentation',
+    'Open Vocabulary Detection', 'Region to Category', 'Region to Description',
+    'OCR', 'OCR with Region'
+]
+
+cascaded_task_list = [
+    'Caption + Grounding', 'Detailed Caption + Grounding', 'More Detailed Caption + Grounding'
+]
+
+
+def update_task_dropdown(choice):
+    if choice == 'Cascaded task':
+        return gr.Dropdown(choices=cascaded_task_list, value='Caption + Grounding')
+    else:
+        return gr.Dropdown(choices=single_task_list, value='Caption')
+
+
+
 with gr.Blocks(css=css) as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Tab(label="Florence-2 Image Captioning"):
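Note: returning a fresh `gr.Dropdown(...)` from the handler is the Gradio 4.x idiom for updating a live component's `choices` and `value` in place. If this Space were pinned to Gradio 3.x instead (an assumption; the requirements are not shown in this diff), the handler would need the older `.update()` form, roughly:

```python
# Hypothetical Gradio 3.x variant of update_task_dropdown; the code in
# the diff returns a component instance, which is the 4.x pattern.
def update_task_dropdown(choice):
    if choice == 'Cascaded task':
        return gr.Dropdown.update(choices=cascaded_task_list, value='Caption + Grounding')
    return gr.Dropdown.update(choices=single_task_list, value='Caption')
```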
 
@@ -209,13 +258,9 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             input_img = gr.Image(label="Input Picture")
             model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value='microsoft/Florence-2-large')
-            task_prompt = gr.Dropdown(choices=[
-                'Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection',
-                'Dense Region Caption', 'Region Proposal', 'Caption to Phrase Grounding',
-                'Referring Expression Segmentation', 'Region to Segmentation',
-                'Open Vocabulary Detection', 'Region to Category', 'Region to Description',
-                'OCR', 'OCR with Region'
-            ], label="Task Prompt", value= 'Caption')
+            task_type = gr.Radio(choices=['Single task', 'Cascaded task'], label='Task type selector', value='Single task')
+            task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt")
+            task_type.change(fn=update_task_dropdown, inputs=task_type, outputs=task_prompt)
             text_input = gr.Textbox(label="Text Input (optional)")
             submit_btn = gr.Button(value="Submit")
         with gr.Column():
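Note: unlike the removed dropdown, the new one no longer passes `value='Caption'`, so nothing appears to be preselected until the radio's change event first fires; restoring the default is a one-argument fix. The radio-to-dropdown wiring can also be exercised without loading any model. A minimal self-contained sketch, with names mirroring app.py and behavior assumed for Gradio 4.x:

```python
import gradio as gr

single_task_list = ['Caption', 'Detailed Caption', 'More Detailed Caption']
cascaded_task_list = ['Caption + Grounding', 'Detailed Caption + Grounding']

def update_task_dropdown(choice):
    # Swap the dropdown's choices when the radio selection changes.
    if choice == 'Cascaded task':
        return gr.Dropdown(choices=cascaded_task_list, value='Caption + Grounding')
    return gr.Dropdown(choices=single_task_list, value='Caption')

with gr.Blocks() as demo:
    task_type = gr.Radio(choices=['Single task', 'Cascaded task'],
                         label='Task type selector', value='Single task')
    task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt",
                              value='Caption')  # explicit default, unlike the diff
    task_type.change(fn=update_task_dropdown, inputs=task_type, outputs=task_prompt)

demo.launch()
```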
 
@@ -236,4 +281,4 @@ with gr.Blocks(css=css) as demo:
 
     submit_btn.click(process_image, [input_img, task_prompt, text_input, model_selector], [output_text, output_img])
 
-demo.launch(debug=True)
+demo.launch(debug=True)
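Note: the final hunk removes and re-adds `demo.launch(debug=True)` unchanged, most likely a trailing-newline fix rather than a code change. For reference, the cascaded branches hand `plot_bbox` a grounding dict; per the Florence-2 model card, the post-processed result has roughly the shape below (caption and coordinates invented for illustration):

```python
# Illustrative result dict for 'Caption + Grounding'; the structure
# follows the Florence-2 model card, the values are made up.
results = {
    '<CAPTION>': 'A green car parked in front of a yellow building.',
    '<CAPTION_TO_PHRASE_GROUNDING>': {
        'bboxes': [[34.2, 160.1, 597.4, 371.8], [1.5, 0.3, 639.0, 305.0]],
        'labels': ['A green car', 'a yellow building'],
    },
}
# plot_bbox draws one rectangle per (bbox, label) pair over the image,
# and fig_to_pil converts the Matplotlib figure for the Gradio image output.
```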