JustinLin610 committed on
Commit
970c656
1 Parent(s): 39ae392
Files changed (1)
  1. app.py +84 -24
app.py CHANGED
@@ -22,32 +22,92 @@ def image_text_zeroshot(image, text_list):
     with torch.no_grad():
         embeddings = model(inputs)
 
-    scores = torch.softmax(
-        embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T,
-        dim=-1
-    ).squeeze(0).tolist()
+    scores = (
+        torch.softmax(
+            embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1
+        )
+        .squeeze(0)
+        .tolist()
+    )
 
-    score_dict = {label:score for label, score in zip(labels, scores)}
+    score_dict = {label: score for label, score in zip(labels, scores)}
 
     return score_dict
 
 
-inputs = [
-    gr.inputs.Image(type='filepath',
-                    label="Input image"),
-    gr.inputs.Textbox(lines=1,
-                      label="Candidate texts"),
-]
-
-iface = gr.Interface(image_text_zeroshot,
-                     inputs,
-                     "label",
-                     examples=[["assets/dog_image.jpg", "A dog|A car|A bird"],
-                               ["assets/car_image.jpg", "A dog|A car|A bird"],
-                               ["assets/bird_image.jpg", "A dog|A car|A bird"]],
-                     description="""<p>This is a simple demo of ImageBind for zeroshot image classification. Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
-To test your own cases, you can upload an image, and provide the candidate texts separated by "|".<br>
-You can duplicate this space and run it privately: <a href='https://huggingface.co/spaces/OFA-Sys/chinese-clip-zero-shot-image-classification?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>""",
-                     title="ImageBind: Zero-shot Image Classification")
-
-iface.launch()
+def audio_text_zeroshot(audio, text_list):
+    audio_paths = [audio]
+    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
+    inputs = {
+        ModalityType.TEXT: data.load_and_transform_text(labels, device),
+        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
+    }
+
+    with torch.no_grad():
+        embeddings = model(inputs)
+
+    scores = (
+        torch.softmax(
+            embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1
+        )
+        .squeeze(0)
+        .tolist()
+    )
+
+    score_dict = {label: score for label, score in zip(labels, scores)}
+
+    return score_dict
+
+
+def inference(
+    task,
+    image=None,
+    audio=None,
+    text_list=None,
+):
+    if task == "image-text":
+        result = image_text_zeroshot(image, text_list)
+    elif task == "audio-text":
+        result = audio_text_zeroshot(audio, text_list)
+    else:
+        raise NotImplementedError
+    return result
+
+
+def main():
+    inputs = [
+        gr.inputs.Radio(
+            choices=[
+                "image-text",
+                "audio-text",
+            ],
+            type="value",
+            default="image-text",
+            label="Task",
+        ),
+        gr.inputs.Image(type="filepath", label="Input image"),
+        gr.inputs.Audio(type="filepath", label="Input audio"),
+        gr.inputs.Textbox(lines=1, label="Candidate texts"),
+    ]
+
+    iface = gr.Interface(
+        inference,
+        inputs,
+        "label",
+        examples=[
+            ["image-text", "assets/dog_image.jpg", None, "A dog|A car|A bird"],
+            ["image-text", "assets/car_image.jpg", None, "A dog|A car|A bird"],
+            ["audio-text", None, "assets/bird_audio.wav", "A dog|A car|A bird"],
+            ["audio-text", None, "assets/dog_audio.wav", "A dog|A car|A bird"],
+        ],
+        description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification and audio classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
+To test your own cases, you can upload an image or an audio clip, and provide the candidate texts separated by "|".<br>
+You can duplicate this space and run it privately: <a href='https://huggingface.co/spaces/OFA-Sys/chinese-clip-zero-shot-image-classification?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r059773psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>""",
+        title="ImageBind: Zero-shot Cross-modal Understanding",
+    )
+
+    iface.launch()
+
+
+if __name__ == "__main__":
+    main()
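For a quick sanity check of the new task routing, the snippet below is a minimal sketch (an editorial illustration, not part of the commit) that calls inference directly with the example assets listed above. It assumes this file is saved as app.py, that the ImageBind model, data, and device globals defined earlier in the module load successfully at import time, and that the assets/ files exist.

# Minimal smoke test for the inference() routing added in this commit
# (a sketch, not part of the commit). Importing app runs the module top
# level, which is assumed to load the ImageBind model and checkpoint.
from app import inference

# Image branch, mirroring the first example row of the interface.
print(inference(task="image-text", image="assets/dog_image.jpg",
                text_list="A dog|A car|A bird"))

# Audio branch, mirroring the third example row.
print(inference(task="audio-text", audio="assets/bird_audio.wav",
                text_list="A dog|A car|A bird"))

Each call returns a dict mapping every candidate text to its softmax probability, which is the shape of input the Gradio "label" output component expects.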