JustinLin610 committed
Commit 179180d
1 Parent(s): a1ed4e4

reorder inputs

Files changed (1)
  1. app.py +7 -7
app.py CHANGED
@@ -85,17 +85,17 @@ def video_text_zeroshot(video, text_list):
 
 def inference(
     task,
+    text_list=None,
     image=None,
     audio=None,
     video=None,
-    text_list=None,
 ):
     if task == "image-text":
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
     elif task == "video-text":
-        result = audio_text_zeroshot(audio, text_list)
+        result = video_text_zeroshot(video, text_list)
     else:
         raise NotImplementedError
     return result
@@ -113,10 +113,10 @@ def main():
             default="image-text",
             label="Task",
         ),
+        gr.inputs.Textbox(lines=1, label="Candidate texts"),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
         gr.inputs.Video(type="filepath", label="Input video"),
-        gr.inputs.Textbox(lines=1, label="Candidate texts"),
     ]
 
     iface = gr.Interface(
@@ -124,10 +124,10 @@ def main():
         inputs,
         "label",
         examples=[
-            ["image-text", "assets/dog_image.jpg", None, None, "A dog|A car|A bird"],
-            ["image-text", "assets/car_image.jpg", None, None, "A dog|A car|A bird"],
-            ["audio-text", None, "assets/bird_audio.wav", None, "A dog|A car|A bird"],
-            ["video-text", None, "assets/dog_video.mp4", None, "A dog|A car|A bird"],
+            ["image-text", "A dog|A car|A bird", "assets/dog_image.jpg", None, None],
+            ["image-text", "A dog|A car|A bird", "assets/car_image.jpg", None, None],
+            ["audio-text", "A dog|A car|A bird", None, "assets/bird_audio.wav", None],
+            ["video-text", "A dog|A car|A bird", None, "assets/dog_video.mp4", None],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification, audio classification, and video classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
         To test your own cases, you can upload an image, an audio or a video, and provide the candidate texts separated by "|".<br>
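
Note on the change: gr.Interface hands the values collected from the components in `inputs` to the wrapped function positionally, and each row in `examples` is read in that same order. That is why the candidate-text field has to move to second position in three places at once: the `inference` signature, the `inputs` list, and every example row. The diff also shows the video-text branch now dispatching to `video_text_zeroshot(video, text_list)` instead of the audio function. Below is a minimal, self-contained sketch of that positional contract; it is not the demo itself, it uses the current `gr.Radio`/`gr.Textbox` component names rather than the deprecated `gr.inputs.*` namespace from this file, and `classify` is a hypothetical stand-in for `inference`.

# sketch.py -- minimal illustration of Gradio's positional input mapping (assumed setup)
import gradio as gr

def classify(task, text_list=None, image=None, audio=None, video=None):
    # Stand-in for inference(): the 1st component feeds `task`, the 2nd feeds
    # `text_list`, and so on, purely by position in the `inputs` list.
    return f"task={task}, candidates={text_list}, image={image}, audio={audio}, video={video}"

iface = gr.Interface(
    fn=classify,
    inputs=[
        gr.Radio(choices=["image-text", "audio-text", "video-text"],
                 value="image-text", label="Task"),
        gr.Textbox(lines=1, label="Candidate texts"),        # 2nd component -> 2nd parameter
        gr.Image(type="filepath", label="Input image"),
        gr.Audio(type="filepath", label="Input audio"),
        gr.Video(label="Input video"),
    ],
    outputs="text",
    examples=[
        # Each example row must follow the same order: task, candidate texts, image, audio, video.
        ["image-text", "A dog|A car|A bird", None, None, None],
    ],
)

if __name__ == "__main__":
    iface.launch()

Clicking the example row fills the components in the listed order, which is the behavior the reordered `examples` entries in this commit rely on.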