JustinLin610 committed
Commit 9dd993b
Parent(s): 5648cf2
Files changed (1)
  1. app.py +32 -4
app.py CHANGED
@@ -59,6 +59,30 @@ def audio_text_zeroshot(audio, text_list):
     return score_dict
 
 
+def video_text_zeroshot(video, text_list):
+    video_paths = [video]
+    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
+    inputs = {
+        ModalityType.TEXT: data.load_and_transform_text(labels, device),
+        ModalityType.VIDEO: data.load_and_transform_video_data(video_paths, device),
+    }
+
+    with torch.no_grad():
+        embeddings = model(inputs)
+
+    scores = (
+        torch.softmax(
+            embeddings[ModalityType.VIDEO] @ embeddings[ModalityType.TEXT].T, dim=-1
+        )
+        .squeeze(0)
+        .tolist()
+    )
+
+    score_dict = {label: score for label, score in zip(labels, scores)}
+
+    return score_dict
+
+
 def inference(
     task,
     image=None,
@@ -69,6 +93,8 @@ def inference(
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
+    elif task == "video-text":
+        result = video_text_zeroshot(video, text_list)
     else:
         raise NotImplementedError
     return result
@@ -80,6 +106,7 @@ def main():
             choices=[
                 "image-text",
                 "audio-text",
+                "video-text",
             ],
             type="value",
             default="image-text",
@@ -87,6 +114,7 @@ def main():
         ),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
+        gr.inputs.Video(type="filepath", label="Input video"),
        gr.inputs.Textbox(lines=1, label="Candidate texts"),
     ]
 
@@ -95,10 +123,10 @@ def main():
         inputs,
         "label",
         examples=[
-            ["image-text", "assets/dog_image.jpg", None, "A dog|A car|A bird"],
-            ["image-text", "assets/car_image.jpg", None, "A dog|A car|A bird"],
-            ["audio-text", None, "assets/bird_audio.wav", "A dog|A car|A bird"],
-            ["audio-text", None, "assets/dog_audio.wav", "A dog|A car|A bird"],
+            ["image-text", "assets/dog_image.jpg", None, None, "A dog|A car|A bird"],
+            ["image-text", "assets/car_image.jpg", None, None, "A dog|A car|A bird"],
+            ["audio-text", None, "assets/bird_audio.wav", None, "A dog|A car|A bird"],
+            ["video-text", None, None, "assets/dog_video.mp4", "A dog|A car|A bird"],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification and audio classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
         To test your own cases, you can upload an image or an audio, and provide the candidate texts separated by "|".<br>
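For reference, a minimal sketch (not part of the commit) of how the new video-text path could be exercised directly, assuming app.py is importable without launching the Gradio interface and that the assets/dog_video.mp4 example shipped with the Space is present; the helper name video_text_zeroshot and the candidate-text string come from the diff above, everything else is illustrative.

# Hypothetical smoke test for the new video-text branch (illustrative only).
# Importing app triggers the module-level ImageBind model/device setup that
# video_text_zeroshot() relies on; the asset path is taken from the demo examples.
from app import video_text_zeroshot

scores = video_text_zeroshot("assets/dog_video.mp4", "A dog|A car|A bird")
print(scores)                       # dict mapping each candidate text to a softmax score
print(max(scores, key=scores.get))  # expected to be "A dog" for this clip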