zhuohan-7 commited on
Commit
7d4aeea
1 Parent(s): 181fb78

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +18 -0
  2. examples/2AC/AudioCaps-Test/data-00000-of-00001.arrow +3 -0
  3. examples/2AC/AudioCaps-Test/dataset_info.json +160 -0
  4. examples/2AC/AudioCaps-Test/sample_0.wav +0 -0
  5. examples/2AC/AudioCaps-Test/sample_1.wav +0 -0
  6. examples/2AC/AudioCaps-Test/sample_2.wav +0 -0
  7. examples/2AC/AudioCaps-Test/state.json +24 -0
  8. examples/2AC/WavCaps-Test/data-00000-of-00001.arrow +3 -0
  9. examples/2AC/WavCaps-Test/dataset_info.json +156 -0
  10. examples/2AC/WavCaps-Test/sample_0.wav +0 -0
  11. examples/2AC/WavCaps-Test/sample_1.wav +0 -0
  12. examples/2AC/WavCaps-Test/sample_2.wav +0 -0
  13. examples/2AC/WavCaps-Test/state.json +24 -0
  14. examples/2AQA/AudioCaps-QA-Test/data-00000-of-00001.arrow +3 -0
  15. examples/2AQA/AudioCaps-QA-Test/dataset_info.json +164 -0
  16. examples/2AQA/AudioCaps-QA-Test/sample_0.wav +0 -0
  17. examples/2AQA/AudioCaps-QA-Test/sample_1.wav +0 -0
  18. examples/2AQA/AudioCaps-QA-Test/sample_2.wav +0 -0
  19. examples/2AQA/AudioCaps-QA-Test/state.json +24 -0
  20. examples/2AQA/Clotho-AQA-Test/data-00000-of-00001.arrow +3 -0
  21. examples/2AQA/Clotho-AQA-Test/dataset_info.json +147 -0
  22. examples/2AQA/Clotho-AQA-Test/sample_0.wav +0 -0
  23. examples/2AQA/Clotho-AQA-Test/sample_1.wav +0 -0
  24. examples/2AQA/Clotho-AQA-Test/sample_2.wav +0 -0
  25. examples/2AQA/Clotho-AQA-Test/state.json +24 -0
  26. examples/2AQA/WavCaps-QA-Test/data-00000-of-00001.arrow +3 -0
  27. examples/2AQA/WavCaps-QA-Test/dataset_info.json +160 -0
  28. examples/2AQA/WavCaps-QA-Test/sample_0.wav +0 -0
  29. examples/2AQA/WavCaps-QA-Test/sample_1.wav +0 -0
  30. examples/2AQA/WavCaps-QA-Test/sample_2.wav +0 -0
  31. examples/2AQA/WavCaps-QA-Test/state.json +24 -0
  32. examples/2AR/VoxCeleb-Accent-Test/data-00000-of-00001.arrow +3 -0
  33. examples/2AR/VoxCeleb-Accent-Test/dataset_info.json +168 -0
  34. examples/2AR/VoxCeleb-Accent-Test/sample_0.wav +0 -0
  35. examples/2AR/VoxCeleb-Accent-Test/sample_1.wav +0 -0
  36. examples/2AR/VoxCeleb-Accent-Test/sample_2.wav +0 -0
  37. examples/2AR/VoxCeleb-Accent-Test/state.json +24 -0
  38. examples/2ASR/Common-Voice-15-En-Test/data-00000-of-00001.arrow +3 -0
  39. examples/2ASR/Common-Voice-15-En-Test/dataset_info.json +188 -0
  40. examples/2ASR/Common-Voice-15-En-Test/sample_0.wav +0 -0
  41. examples/2ASR/Common-Voice-15-En-Test/sample_1.wav +0 -0
  42. examples/2ASR/Common-Voice-15-En-Test/sample_2.wav +0 -0
  43. examples/2ASR/Common-Voice-15-En-Test/state.json +24 -0
  44. examples/2ASR/Earnings21-Test/data-00000-of-00001.arrow +3 -0
  45. examples/2ASR/Earnings21-Test/dataset_info.json +152 -0
  46. examples/2ASR/Earnings21-Test/sample_0.wav +3 -0
  47. examples/2ASR/Earnings21-Test/sample_1.wav +3 -0
  48. examples/2ASR/Earnings21-Test/sample_2.wav +3 -0
  49. examples/2ASR/Earnings21-Test/state.json +24 -0
  50. examples/2ASR/Earnings22-Test/data-00000-of-00001.arrow +3 -0
.gitattributes CHANGED
@@ -60,3 +60,21 @@ examples/SQA/CN-College-Listen-MCQ-Test/sample_0.wav filter=lfs diff=lfs merge=l
60
  examples/SQA/Spoken-Squad-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
61
  examples/SQA/Spoken-Squad-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
62
  examples/SQA/Spoken-Squad-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  examples/SQA/Spoken-Squad-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
61
  examples/SQA/Spoken-Squad-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
62
  examples/SQA/Spoken-Squad-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
63
+ examples/2ASR/Earnings21-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
64
+ examples/2ASR/Earnings21-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
65
+ examples/2ASR/Earnings21-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
66
+ examples/2ASR/Earnings22-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
67
+ examples/2ASR/Earnings22-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
68
+ examples/2ASR/Earnings22-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
69
+ examples/2ASR/Tedlium3-Long-form-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
70
+ examples/2ASR/Tedlium3-Long-form-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
71
+ examples/2ASR/Tedlium3-Long-form-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
72
+ examples/2SQA/CN-College-Listen-MCQ-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
73
+ examples/2SQA/Public-SG-Speech-QA-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
74
+ examples/2SQA/SLUE-P2-SQA5-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
75
+ examples/2SQA/SLUE-P2-SQA5-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
76
+ examples/2SQA/SLUE-P2-SQA5-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
77
+ examples/2SQA/Spoken-Squad-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
78
+ examples/2SQA/Spoken-Squad-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
79
+ examples/2SQA/Spoken-Squad-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
80
+ examples/SQA/DREAM-TTS-MCQ-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
examples/2AC/AudioCaps-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70213ecc31962d6d8bbc0e4d7ae2dd302c851a4af00f12b07735311f5f128288
3
+ size 966216
examples/2AC/AudioCaps-Test/dataset_info.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audiocap_id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "start_time": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "youtube_id": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ }
48
+ },
49
+ "salmonn_7b": {
50
+ "answer": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "model_prediction": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "task_type": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "text": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ }
66
+ },
67
+ "wavllm_fairseq": {
68
+ "answer": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "model_prediction": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "task_type": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "text": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ }
84
+ },
85
+ "Qwen2-Audio-7B-Instruct": {
86
+ "answer": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "model_prediction": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "task_type": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "text": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ }
102
+ },
103
+ "whisper_large_v3_with_llama_3_8b_instruct": {
104
+ "answer": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "model_prediction": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "task_type": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "text": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ }
120
+ },
121
+ "mowe_audio": {
122
+ "answer": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "model_prediction": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "task_type": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "text": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ }
138
+ },
139
+ "qwen_audio_chat": {
140
+ "answer": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "model_prediction": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ },
148
+ "task_type": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ },
152
+ "text": {
153
+ "dtype": "string",
154
+ "_type": "Value"
155
+ }
156
+ }
157
+ },
158
+ "homepage": "",
159
+ "license": ""
160
+ }
examples/2AC/AudioCaps-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/2AC/AudioCaps-Test/sample_1.wav ADDED
Binary file (320 kB). View file
 
examples/2AC/AudioCaps-Test/sample_2.wav ADDED
Binary file (315 kB). View file
 
examples/2AC/AudioCaps-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "e736bf1821a473f3",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2AC/WavCaps-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9636af636286b1aedad840ccef31ca7d879e824ddc6814fcb7411b7fbdf411aa
3
+ size 812352
examples/2AC/WavCaps-Test/dataset_info.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audio_path": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "duration": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ }
44
+ },
45
+ "salmonn_7b": {
46
+ "answer": {
47
+ "dtype": "string",
48
+ "_type": "Value"
49
+ },
50
+ "model_prediction": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "task_type": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "text": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ }
62
+ },
63
+ "wavllm_fairseq": {
64
+ "answer": {
65
+ "dtype": "string",
66
+ "_type": "Value"
67
+ },
68
+ "model_prediction": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "task_type": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "text": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ }
80
+ },
81
+ "Qwen2-Audio-7B-Instruct": {
82
+ "answer": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "model_prediction": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "task_type": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "text": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ }
98
+ },
99
+ "whisper_large_v3_with_llama_3_8b_instruct": {
100
+ "answer": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "model_prediction": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "task_type": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "text": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ }
116
+ },
117
+ "mowe_audio": {
118
+ "answer": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "model_prediction": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "task_type": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "text": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ }
134
+ },
135
+ "qwen_audio_chat": {
136
+ "answer": {
137
+ "dtype": "string",
138
+ "_type": "Value"
139
+ },
140
+ "model_prediction": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "task_type": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ },
148
+ "text": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ }
152
+ }
153
+ },
154
+ "homepage": "",
155
+ "license": ""
156
+ }
examples/2AC/WavCaps-Test/sample_0.wav ADDED
Binary file (32 kB). View file
 
examples/2AC/WavCaps-Test/sample_1.wav ADDED
Binary file (268 kB). View file
 
examples/2AC/WavCaps-Test/sample_2.wav ADDED
Binary file (500 kB). View file
 
examples/2AC/WavCaps-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "742ab313af054565",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2AQA/AudioCaps-QA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92b0b2fe81ee0e3a2690a444bb9b68994d89ca53db6ce174f5802293549256d1
3
+ size 953616
examples/2AQA/AudioCaps-QA-Test/dataset_info.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audiocap_id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "caption": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "start_time": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "youtube_id": {
49
+ "dtype": "string",
50
+ "_type": "Value"
51
+ }
52
+ },
53
+ "salmonn_7b": {
54
+ "answer": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "model_prediction": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "task_type": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ },
66
+ "text": {
67
+ "dtype": "string",
68
+ "_type": "Value"
69
+ }
70
+ },
71
+ "wavllm_fairseq": {
72
+ "answer": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "model_prediction": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "task_type": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ },
84
+ "text": {
85
+ "dtype": "string",
86
+ "_type": "Value"
87
+ }
88
+ },
89
+ "Qwen2-Audio-7B-Instruct": {
90
+ "answer": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "model_prediction": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "task_type": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ },
102
+ "text": {
103
+ "dtype": "string",
104
+ "_type": "Value"
105
+ }
106
+ },
107
+ "whisper_large_v3_with_llama_3_8b_instruct": {
108
+ "answer": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "model_prediction": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "task_type": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ },
120
+ "text": {
121
+ "dtype": "string",
122
+ "_type": "Value"
123
+ }
124
+ },
125
+ "mowe_audio": {
126
+ "answer": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "model_prediction": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "task_type": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ },
138
+ "text": {
139
+ "dtype": "string",
140
+ "_type": "Value"
141
+ }
142
+ },
143
+ "qwen_audio_chat": {
144
+ "answer": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ },
148
+ "model_prediction": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ },
152
+ "task_type": {
153
+ "dtype": "string",
154
+ "_type": "Value"
155
+ },
156
+ "text": {
157
+ "dtype": "string",
158
+ "_type": "Value"
159
+ }
160
+ }
161
+ },
162
+ "homepage": "",
163
+ "license": ""
164
+ }
examples/2AQA/AudioCaps-QA-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/2AQA/AudioCaps-QA-Test/sample_1.wav ADDED
Binary file (320 kB). View file
 
examples/2AQA/AudioCaps-QA-Test/sample_2.wav ADDED
Binary file (302 kB). View file
 
examples/2AQA/AudioCaps-QA-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "52bc1dfcaf2a0f4b",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2AQA/Clotho-AQA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b4de88bf163bbfd2097769e6104ba8514380a7a097741d38e1ccc41d5b0f86
3
+ size 2035832
examples/2AQA/Clotho-AQA-Test/dataset_info.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {},
36
+ "salmonn_7b": {
37
+ "answer": {
38
+ "dtype": "string",
39
+ "_type": "Value"
40
+ },
41
+ "model_prediction": {
42
+ "dtype": "string",
43
+ "_type": "Value"
44
+ },
45
+ "task_type": {
46
+ "dtype": "string",
47
+ "_type": "Value"
48
+ },
49
+ "text": {
50
+ "dtype": "string",
51
+ "_type": "Value"
52
+ }
53
+ },
54
+ "wavllm_fairseq": {
55
+ "answer": {
56
+ "dtype": "string",
57
+ "_type": "Value"
58
+ },
59
+ "model_prediction": {
60
+ "dtype": "string",
61
+ "_type": "Value"
62
+ },
63
+ "task_type": {
64
+ "dtype": "string",
65
+ "_type": "Value"
66
+ },
67
+ "text": {
68
+ "dtype": "string",
69
+ "_type": "Value"
70
+ }
71
+ },
72
+ "Qwen2-Audio-7B-Instruct": {
73
+ "answer": {
74
+ "dtype": "string",
75
+ "_type": "Value"
76
+ },
77
+ "model_prediction": {
78
+ "dtype": "string",
79
+ "_type": "Value"
80
+ },
81
+ "task_type": {
82
+ "dtype": "string",
83
+ "_type": "Value"
84
+ },
85
+ "text": {
86
+ "dtype": "string",
87
+ "_type": "Value"
88
+ }
89
+ },
90
+ "whisper_large_v3_with_llama_3_8b_instruct": {
91
+ "answer": {
92
+ "dtype": "string",
93
+ "_type": "Value"
94
+ },
95
+ "model_prediction": {
96
+ "dtype": "string",
97
+ "_type": "Value"
98
+ },
99
+ "task_type": {
100
+ "dtype": "string",
101
+ "_type": "Value"
102
+ },
103
+ "text": {
104
+ "dtype": "string",
105
+ "_type": "Value"
106
+ }
107
+ },
108
+ "mowe_audio": {
109
+ "answer": {
110
+ "dtype": "string",
111
+ "_type": "Value"
112
+ },
113
+ "model_prediction": {
114
+ "dtype": "string",
115
+ "_type": "Value"
116
+ },
117
+ "task_type": {
118
+ "dtype": "string",
119
+ "_type": "Value"
120
+ },
121
+ "text": {
122
+ "dtype": "string",
123
+ "_type": "Value"
124
+ }
125
+ },
126
+ "qwen_audio_chat": {
127
+ "answer": {
128
+ "dtype": "string",
129
+ "_type": "Value"
130
+ },
131
+ "model_prediction": {
132
+ "dtype": "string",
133
+ "_type": "Value"
134
+ },
135
+ "task_type": {
136
+ "dtype": "string",
137
+ "_type": "Value"
138
+ },
139
+ "text": {
140
+ "dtype": "string",
141
+ "_type": "Value"
142
+ }
143
+ }
144
+ },
145
+ "homepage": "",
146
+ "license": ""
147
+ }
examples/2AQA/Clotho-AQA-Test/sample_0.wav ADDED
Binary file (615 kB). View file
 
examples/2AQA/Clotho-AQA-Test/sample_1.wav ADDED
Binary file (576 kB). View file
 
examples/2AQA/Clotho-AQA-Test/sample_2.wav ADDED
Binary file (835 kB). View file
 
examples/2AQA/Clotho-AQA-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "e2e76326f448d7c4",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2AQA/WavCaps-QA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1736d5bcc9ca0d8d4847d7d000e6c6e63c73f6262177ea0391d180c40649da39
3
+ size 837920
examples/2AQA/WavCaps-QA-Test/dataset_info.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audio_path": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "caption": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "duration": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ }
48
+ },
49
+ "salmonn_7b": {
50
+ "answer": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "model_prediction": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "task_type": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "text": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ }
66
+ },
67
+ "wavllm_fairseq": {
68
+ "answer": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "model_prediction": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "task_type": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "text": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ }
84
+ },
85
+ "Qwen2-Audio-7B-Instruct": {
86
+ "answer": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "model_prediction": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "task_type": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "text": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ }
102
+ },
103
+ "whisper_large_v3_with_llama_3_8b_instruct": {
104
+ "answer": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "model_prediction": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "task_type": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "text": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ }
120
+ },
121
+ "mowe_audio": {
122
+ "answer": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "model_prediction": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "task_type": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "text": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ }
138
+ },
139
+ "qwen_audio_chat": {
140
+ "answer": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "model_prediction": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ },
148
+ "task_type": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ },
152
+ "text": {
153
+ "dtype": "string",
154
+ "_type": "Value"
155
+ }
156
+ }
157
+ },
158
+ "homepage": "",
159
+ "license": ""
160
+ }
examples/2AQA/WavCaps-QA-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/2AQA/WavCaps-QA-Test/sample_1.wav ADDED
Binary file (251 kB). View file
 
examples/2AQA/WavCaps-QA-Test/sample_2.wav ADDED
Binary file (256 kB). View file
 
examples/2AQA/WavCaps-QA-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "40995a6cc1fe3dc7",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2AR/VoxCeleb-Accent-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5307150e5b08f9bfd7b81d56ca7ffbee1d731d002417d20fc3ec0713bc764533
3
+ size 730864
examples/2AR/VoxCeleb-Accent-Test/dataset_info.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "Gender": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "Nationality": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "VGGFace1 ID": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "VoxCeleb1 ID": {
49
+ "dtype": "string",
50
+ "_type": "Value"
51
+ },
52
+ "index": {
53
+ "dtype": "string",
54
+ "_type": "Value"
55
+ }
56
+ },
57
+ "salmonn_7b": {
58
+ "answer": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "model_prediction": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ },
66
+ "task_type": {
67
+ "dtype": "string",
68
+ "_type": "Value"
69
+ },
70
+ "text": {
71
+ "dtype": "string",
72
+ "_type": "Value"
73
+ }
74
+ },
75
+ "wavllm_fairseq": {
76
+ "answer": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "model_prediction": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ },
84
+ "task_type": {
85
+ "dtype": "string",
86
+ "_type": "Value"
87
+ },
88
+ "text": {
89
+ "dtype": "string",
90
+ "_type": "Value"
91
+ }
92
+ },
93
+ "Qwen2-Audio-7B-Instruct": {
94
+ "answer": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "model_prediction": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ },
102
+ "task_type": {
103
+ "dtype": "string",
104
+ "_type": "Value"
105
+ },
106
+ "text": {
107
+ "dtype": "string",
108
+ "_type": "Value"
109
+ }
110
+ },
111
+ "whisper_large_v3_with_llama_3_8b_instruct": {
112
+ "answer": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "model_prediction": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ },
120
+ "task_type": {
121
+ "dtype": "string",
122
+ "_type": "Value"
123
+ },
124
+ "text": {
125
+ "dtype": "string",
126
+ "_type": "Value"
127
+ }
128
+ },
129
+ "mowe_audio": {
130
+ "answer": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "model_prediction": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ },
138
+ "task_type": {
139
+ "dtype": "string",
140
+ "_type": "Value"
141
+ },
142
+ "text": {
143
+ "dtype": "string",
144
+ "_type": "Value"
145
+ }
146
+ },
147
+ "qwen_audio_chat": {
148
+ "answer": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ },
152
+ "model_prediction": {
153
+ "dtype": "string",
154
+ "_type": "Value"
155
+ },
156
+ "task_type": {
157
+ "dtype": "string",
158
+ "_type": "Value"
159
+ },
160
+ "text": {
161
+ "dtype": "string",
162
+ "_type": "Value"
163
+ }
164
+ }
165
+ },
166
+ "homepage": "",
167
+ "license": ""
168
+ }
examples/2AR/VoxCeleb-Accent-Test/sample_0.wav ADDED
Binary file (268 kB). View file
 
examples/2AR/VoxCeleb-Accent-Test/sample_1.wav ADDED
Binary file (152 kB). View file
 
examples/2AR/VoxCeleb-Accent-Test/sample_2.wav ADDED
Binary file (301 kB). View file
 
examples/2AR/VoxCeleb-Accent-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "fa91a59f90c22c3c",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2ASR/Common-Voice-15-En-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9a561ce9aed8ba4c02f86c90883029e1bb566e2b66986b17874f3bb5884d67d
3
+ size 489552
examples/2ASR/Common-Voice-15-En-Test/dataset_info.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "accents": {
37
+ "dtype": "null",
38
+ "_type": "Value"
39
+ },
40
+ "age": {
41
+ "dtype": "null",
42
+ "_type": "Value"
43
+ },
44
+ "client_id": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "down_votes": {
49
+ "dtype": "int64",
50
+ "_type": "Value"
51
+ },
52
+ "gender": {
53
+ "dtype": "null",
54
+ "_type": "Value"
55
+ },
56
+ "language": {
57
+ "dtype": "string",
58
+ "_type": "Value"
59
+ },
60
+ "locale": {
61
+ "dtype": "string",
62
+ "_type": "Value"
63
+ },
64
+ "segment": {
65
+ "dtype": "null",
66
+ "_type": "Value"
67
+ },
68
+ "up_votes": {
69
+ "dtype": "int64",
70
+ "_type": "Value"
71
+ },
72
+ "variant": {
73
+ "dtype": "null",
74
+ "_type": "Value"
75
+ }
76
+ },
77
+ "salmonn_7b": {
78
+ "answer": {
79
+ "dtype": "string",
80
+ "_type": "Value"
81
+ },
82
+ "model_prediction": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "task_type": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "text": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ }
94
+ },
95
+ "wavllm_fairseq": {
96
+ "answer": {
97
+ "dtype": "string",
98
+ "_type": "Value"
99
+ },
100
+ "model_prediction": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "task_type": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "text": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ }
112
+ },
113
+ "Qwen2-Audio-7B-Instruct": {
114
+ "answer": {
115
+ "dtype": "string",
116
+ "_type": "Value"
117
+ },
118
+ "model_prediction": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "task_type": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "text": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ }
130
+ },
131
+ "whisper_large_v3_with_llama_3_8b_instruct": {
132
+ "answer": {
133
+ "dtype": "string",
134
+ "_type": "Value"
135
+ },
136
+ "model_prediction": {
137
+ "dtype": "string",
138
+ "_type": "Value"
139
+ },
140
+ "task_type": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "text": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ }
148
+ },
149
+ "mowe_audio": {
150
+ "answer": {
151
+ "dtype": "string",
152
+ "_type": "Value"
153
+ },
154
+ "model_prediction": {
155
+ "dtype": "string",
156
+ "_type": "Value"
157
+ },
158
+ "task_type": {
159
+ "dtype": "string",
160
+ "_type": "Value"
161
+ },
162
+ "text": {
163
+ "dtype": "string",
164
+ "_type": "Value"
165
+ }
166
+ },
167
+ "qwen_audio_chat": {
168
+ "answer": {
169
+ "dtype": "string",
170
+ "_type": "Value"
171
+ },
172
+ "model_prediction": {
173
+ "dtype": "string",
174
+ "_type": "Value"
175
+ },
176
+ "task_type": {
177
+ "dtype": "string",
178
+ "_type": "Value"
179
+ },
180
+ "text": {
181
+ "dtype": "string",
182
+ "_type": "Value"
183
+ }
184
+ }
185
+ },
186
+ "homepage": "",
187
+ "license": ""
188
+ }
examples/2ASR/Common-Voice-15-En-Test/sample_0.wav ADDED
Binary file (158 kB). View file
 
examples/2ASR/Common-Voice-15-En-Test/sample_1.wav ADDED
Binary file (172 kB). View file
 
examples/2ASR/Common-Voice-15-En-Test/sample_2.wav ADDED
Binary file (148 kB). View file
 
examples/2ASR/Common-Voice-15-En-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "468db91ad949e4d4",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2ASR/Earnings21-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fbca261ae8ac8ccc96993ea11f17836bdcfef1070835784f159b79990a5a298
3
+ size 429108160
examples/2ASR/Earnings21-Test/dataset_info.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "salmonn_7b": {
42
+ "answer": {
43
+ "dtype": "string",
44
+ "_type": "Value"
45
+ },
46
+ "model_prediction": {
47
+ "dtype": "string",
48
+ "_type": "Value"
49
+ },
50
+ "task_type": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "text": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ }
58
+ },
59
+ "wavllm_fairseq": {
60
+ "answer": {
61
+ "dtype": "string",
62
+ "_type": "Value"
63
+ },
64
+ "model_prediction": {
65
+ "dtype": "string",
66
+ "_type": "Value"
67
+ },
68
+ "task_type": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "text": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ }
76
+ },
77
+ "Qwen2-Audio-7B-Instruct": {
78
+ "answer": {
79
+ "dtype": "string",
80
+ "_type": "Value"
81
+ },
82
+ "model_prediction": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "task_type": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "text": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ }
94
+ },
95
+ "whisper_large_v3_with_llama_3_8b_instruct": {
96
+ "answer": {
97
+ "dtype": "string",
98
+ "_type": "Value"
99
+ },
100
+ "model_prediction": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "task_type": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "text": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ }
112
+ },
113
+ "mowe_audio": {
114
+ "answer": {
115
+ "dtype": "string",
116
+ "_type": "Value"
117
+ },
118
+ "model_prediction": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "task_type": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "text": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ }
130
+ },
131
+ "qwen_audio_chat": {
132
+ "answer": {
133
+ "dtype": "string",
134
+ "_type": "Value"
135
+ },
136
+ "model_prediction": {
137
+ "dtype": "string",
138
+ "_type": "Value"
139
+ },
140
+ "task_type": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "text": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ }
148
+ }
149
+ },
150
+ "homepage": "",
151
+ "license": ""
152
+ }
examples/2ASR/Earnings21-Test/sample_0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8fb994964e1b0df9f4675ceaa73d55da55a096f5b94d002d9f7b07c997fc83e
3
+ size 97593644
examples/2ASR/Earnings21-Test/sample_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd6ba77731011a6dc02e5854a600a2036713be4c2d71abf63fd6a89b86083c4f
3
+ size 178791280
examples/2ASR/Earnings21-Test/sample_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a1d15425069b003730e79f0df467103f4ac7670f87a0539a97c82973a02943e
3
+ size 150700076
examples/2ASR/Earnings21-Test/state.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "8cc0ad99446f1aba",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "mowe_audio",
18
+ "qwen_audio_chat"
19
+ ],
20
+ "_format_kwargs": {},
21
+ "_format_type": null,
22
+ "_output_all_columns": false,
23
+ "_split": null
24
+ }
examples/2ASR/Earnings22-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:497dd6d287df9a8be5194b7875ae88f275127986d3fc538601382e80244bbb7b
3
+ size 332277848