zhuohan-7 commited on
Commit
8a77d82
1 Parent(s): b2caef0

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +18 -0
  2. examples/AC/AudioCaps-Test/data-00000-of-00001.arrow +3 -0
  3. examples/AC/AudioCaps-Test/dataset_info.json +142 -0
  4. examples/AC/AudioCaps-Test/sample_0.wav +0 -0
  5. examples/AC/AudioCaps-Test/sample_1.wav +0 -0
  6. examples/AC/AudioCaps-Test/sample_2.wav +0 -0
  7. examples/AC/AudioCaps-Test/state.json +23 -0
  8. examples/AC/WavCaps-Test/data-00000-of-00001.arrow +3 -0
  9. examples/AC/WavCaps-Test/dataset_info.json +138 -0
  10. examples/AC/WavCaps-Test/sample_0.wav +0 -0
  11. examples/AC/WavCaps-Test/sample_1.wav +0 -0
  12. examples/AC/WavCaps-Test/sample_2.wav +0 -0
  13. examples/AC/WavCaps-Test/state.json +23 -0
  14. examples/AR/VoxCeleb-Accent-Test/data-00000-of-00001.arrow +3 -0
  15. examples/AR/VoxCeleb-Accent-Test/dataset_info.json +150 -0
  16. examples/AR/VoxCeleb-Accent-Test/sample_0.wav +0 -0
  17. examples/AR/VoxCeleb-Accent-Test/sample_1.wav +0 -0
  18. examples/AR/VoxCeleb-Accent-Test/sample_2.wav +0 -0
  19. examples/AR/VoxCeleb-Accent-Test/state.json +23 -0
  20. examples/ASQA/AudioCaps-QA-Test/data-00000-of-00001.arrow +3 -0
  21. examples/ASQA/AudioCaps-QA-Test/dataset_info.json +146 -0
  22. examples/ASQA/AudioCaps-QA-Test/sample_0.wav +0 -0
  23. examples/ASQA/AudioCaps-QA-Test/sample_1.wav +0 -0
  24. examples/ASQA/AudioCaps-QA-Test/sample_2.wav +0 -0
  25. examples/ASQA/AudioCaps-QA-Test/state.json +23 -0
  26. examples/ASQA/Clotho-AQA-Test/data-00000-of-00001.arrow +3 -0
  27. examples/ASQA/Clotho-AQA-Test/dataset_info.json +129 -0
  28. examples/ASQA/Clotho-AQA-Test/sample_0.wav +0 -0
  29. examples/ASQA/Clotho-AQA-Test/sample_1.wav +0 -0
  30. examples/ASQA/Clotho-AQA-Test/sample_2.wav +0 -0
  31. examples/ASQA/Clotho-AQA-Test/state.json +23 -0
  32. examples/ASQA/WavCaps-QA-Test/data-00000-of-00001.arrow +3 -0
  33. examples/ASQA/WavCaps-QA-Test/dataset_info.json +142 -0
  34. examples/ASQA/WavCaps-QA-Test/sample_0.wav +0 -0
  35. examples/ASQA/WavCaps-QA-Test/sample_1.wav +0 -0
  36. examples/ASQA/WavCaps-QA-Test/sample_2.wav +0 -0
  37. examples/ASQA/WavCaps-QA-Test/state.json +23 -0
  38. examples/ASR/Common-Voice-15-En-Test/data-00000-of-00001.arrow +3 -0
  39. examples/ASR/Common-Voice-15-En-Test/dataset_info.json +170 -0
  40. examples/ASR/Common-Voice-15-En-Test/sample_0.wav +0 -0
  41. examples/ASR/Common-Voice-15-En-Test/sample_1.wav +0 -0
  42. examples/ASR/Common-Voice-15-En-Test/sample_2.wav +0 -0
  43. examples/ASR/Common-Voice-15-En-Test/state.json +23 -0
  44. examples/ASR/Earnings21-Test/data-00000-of-00001.arrow +3 -0
  45. examples/ASR/Earnings21-Test/dataset_info.json +134 -0
  46. examples/ASR/Earnings21-Test/sample_0.wav +3 -0
  47. examples/ASR/Earnings21-Test/sample_1.wav +3 -0
  48. examples/ASR/Earnings21-Test/sample_2.wav +3 -0
  49. examples/ASR/Earnings21-Test/state.json +23 -0
  50. examples/ASR/Earnings22-Test/data-00000-of-00001.arrow +3 -0
.gitattributes CHANGED
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
+ examples/ASR/Earnings21-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
37
+ examples/ASR/Earnings21-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/ASR/Earnings21-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
39
+ examples/ASR/Earnings22-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
40
+ examples/ASR/Earnings22-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
41
+ examples/ASR/Earnings22-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
42
+ examples/ASR/Tedlium3-Longform-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
43
+ examples/ASR/Tedlium3-Longform-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
44
+ examples/ASR/Tedlium3-Longform-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
45
+ examples/SQA/DREAM-TTS-MCQ-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
46
+ examples/SQA/Public-SG-Speech-QA-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
47
+ examples/SQA/Public-SG-Speech-QA-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
48
+ examples/SQA/SLUE-P2-SQA5-Test/sample_0.wav filter=lfs diff=lfs merge=lfs -text
49
+ examples/SQA/SLUE-P2-SQA5-Test/sample_1.wav filter=lfs diff=lfs merge=lfs -text
50
+ examples/SQA/SLUE-P2-SQA5-Test/sample_2.wav filter=lfs diff=lfs merge=lfs -text
51
+ examples/SQA/Spoken-Squad-v1/sample_0.wav filter=lfs diff=lfs merge=lfs -text
52
+ examples/SQA/Spoken-Squad-v1/sample_1.wav filter=lfs diff=lfs merge=lfs -text
53
+ examples/SQA/Spoken-Squad-v1/sample_2.wav filter=lfs diff=lfs merge=lfs -text
examples/AC/AudioCaps-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff53d7bcff873fea97f02e971836f24adfbfba9ebdcfc7554423bf65cb110e1d
3
+ size 970312
examples/AC/AudioCaps-Test/dataset_info.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audiocap_id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "start_time": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "youtube_id": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ }
48
+ },
49
+ "salmonn_7b": {
50
+ "answer": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "model_prediction": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "task_type": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "text": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ }
66
+ },
67
+ "wavllm_fairseq": {
68
+ "answer": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "model_prediction": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "task_type": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "text": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ }
84
+ },
85
+ "Qwen2-Audio-7B-Instruct": {
86
+ "answer": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "model_prediction": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "task_type": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "text": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ }
102
+ },
103
+ "whisper_large_v3_with_llama_3_8b_instruct": {
104
+ "answer": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "model_prediction": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "task_type": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "text": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ }
120
+ },
121
+ "qwen_audio_chat": {
122
+ "answer": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "model_prediction": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "task_type": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "text": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ }
138
+ }
139
+ },
140
+ "homepage": "",
141
+ "license": ""
142
+ }
examples/AC/AudioCaps-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/AC/AudioCaps-Test/sample_1.wav ADDED
Binary file (320 kB). View file
 
examples/AC/AudioCaps-Test/sample_2.wav ADDED
Binary file (320 kB). View file
 
examples/AC/AudioCaps-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "0e301916c3676d35",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/AC/WavCaps-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c824145183da91da498c0338860b6daeaed94d7c619ec9ae22a041918c61c902
3
+ size 985360
examples/AC/WavCaps-Test/dataset_info.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audio_path": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "duration": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ }
44
+ },
45
+ "salmonn_7b": {
46
+ "answer": {
47
+ "dtype": "string",
48
+ "_type": "Value"
49
+ },
50
+ "model_prediction": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "task_type": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "text": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ }
62
+ },
63
+ "wavllm_fairseq": {
64
+ "answer": {
65
+ "dtype": "string",
66
+ "_type": "Value"
67
+ },
68
+ "model_prediction": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "task_type": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "text": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ }
80
+ },
81
+ "Qwen2-Audio-7B-Instruct": {
82
+ "answer": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "model_prediction": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "task_type": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "text": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ }
98
+ },
99
+ "whisper_large_v3_with_llama_3_8b_instruct": {
100
+ "answer": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "model_prediction": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "task_type": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "text": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ }
116
+ },
117
+ "qwen_audio_chat": {
118
+ "answer": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "model_prediction": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "task_type": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "text": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ }
134
+ }
135
+ },
136
+ "homepage": "",
137
+ "license": ""
138
+ }
examples/AC/WavCaps-Test/sample_0.wav ADDED
Binary file (559 kB). View file
 
examples/AC/WavCaps-Test/sample_1.wav ADDED
Binary file (129 kB). View file
 
examples/AC/WavCaps-Test/sample_2.wav ADDED
Binary file (287 kB). View file
 
examples/AC/WavCaps-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "1e570096603c2a32",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/AR/VoxCeleb-Accent-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c07c0e241fb703dda0b7544925d0ca59bba57417daa5e08727a9edd9312e64
3
+ size 578840
examples/AR/VoxCeleb-Accent-Test/dataset_info.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "Gender": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "Nationality": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "VGGFace1 ID": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "VoxCeleb1 ID": {
49
+ "dtype": "string",
50
+ "_type": "Value"
51
+ },
52
+ "index": {
53
+ "dtype": "string",
54
+ "_type": "Value"
55
+ }
56
+ },
57
+ "salmonn_7b": {
58
+ "answer": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "model_prediction": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ },
66
+ "task_type": {
67
+ "dtype": "string",
68
+ "_type": "Value"
69
+ },
70
+ "text": {
71
+ "dtype": "string",
72
+ "_type": "Value"
73
+ }
74
+ },
75
+ "wavllm_fairseq": {
76
+ "answer": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "model_prediction": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ },
84
+ "task_type": {
85
+ "dtype": "string",
86
+ "_type": "Value"
87
+ },
88
+ "text": {
89
+ "dtype": "string",
90
+ "_type": "Value"
91
+ }
92
+ },
93
+ "Qwen2-Audio-7B-Instruct": {
94
+ "answer": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "model_prediction": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ },
102
+ "task_type": {
103
+ "dtype": "string",
104
+ "_type": "Value"
105
+ },
106
+ "text": {
107
+ "dtype": "string",
108
+ "_type": "Value"
109
+ }
110
+ },
111
+ "whisper_large_v3_with_llama_3_8b_instruct": {
112
+ "answer": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "model_prediction": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ },
120
+ "task_type": {
121
+ "dtype": "string",
122
+ "_type": "Value"
123
+ },
124
+ "text": {
125
+ "dtype": "string",
126
+ "_type": "Value"
127
+ }
128
+ },
129
+ "qwen_audio_chat": {
130
+ "answer": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "model_prediction": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ },
138
+ "task_type": {
139
+ "dtype": "string",
140
+ "_type": "Value"
141
+ },
142
+ "text": {
143
+ "dtype": "string",
144
+ "_type": "Value"
145
+ }
146
+ }
147
+ },
148
+ "homepage": "",
149
+ "license": ""
150
+ }
examples/AR/VoxCeleb-Accent-Test/sample_0.wav ADDED
Binary file (209 kB). View file
 
examples/AR/VoxCeleb-Accent-Test/sample_1.wav ADDED
Binary file (227 kB). View file
 
examples/AR/VoxCeleb-Accent-Test/sample_2.wav ADDED
Binary file (134 kB). View file
 
examples/AR/VoxCeleb-Accent-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "7017504c8eeb5d71",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASQA/AudioCaps-QA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f33a60a23ffc8fce090307d530ef03b2b8cf9852fa70418e76ed6a1d5dd978
3
+ size 954480
examples/ASQA/AudioCaps-QA-Test/dataset_info.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audiocap_id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "caption": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "start_time": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "youtube_id": {
49
+ "dtype": "string",
50
+ "_type": "Value"
51
+ }
52
+ },
53
+ "salmonn_7b": {
54
+ "answer": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "model_prediction": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "task_type": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ },
66
+ "text": {
67
+ "dtype": "string",
68
+ "_type": "Value"
69
+ }
70
+ },
71
+ "wavllm_fairseq": {
72
+ "answer": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "model_prediction": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "task_type": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ },
84
+ "text": {
85
+ "dtype": "string",
86
+ "_type": "Value"
87
+ }
88
+ },
89
+ "Qwen2-Audio-7B-Instruct": {
90
+ "answer": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "model_prediction": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "task_type": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ },
102
+ "text": {
103
+ "dtype": "string",
104
+ "_type": "Value"
105
+ }
106
+ },
107
+ "whisper_large_v3_with_llama_3_8b_instruct": {
108
+ "answer": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "model_prediction": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "task_type": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ },
120
+ "text": {
121
+ "dtype": "string",
122
+ "_type": "Value"
123
+ }
124
+ },
125
+ "qwen_audio_chat": {
126
+ "answer": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "model_prediction": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "task_type": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ },
138
+ "text": {
139
+ "dtype": "string",
140
+ "_type": "Value"
141
+ }
142
+ }
143
+ },
144
+ "homepage": "",
145
+ "license": ""
146
+ }
examples/ASQA/AudioCaps-QA-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/ASQA/AudioCaps-QA-Test/sample_1.wav ADDED
Binary file (304 kB). View file
 
examples/ASQA/AudioCaps-QA-Test/sample_2.wav ADDED
Binary file (320 kB). View file
 
examples/ASQA/AudioCaps-QA-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "b4d0bc420173574a",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASQA/Clotho-AQA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a76c13e88e89bb39d6a4dedc9f3bb30e128cdaaa5f68d8c0e1d804d9af5cbf68
3
+ size 2181384
examples/ASQA/Clotho-AQA-Test/dataset_info.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {},
36
+ "salmonn_7b": {
37
+ "answer": {
38
+ "dtype": "string",
39
+ "_type": "Value"
40
+ },
41
+ "model_prediction": {
42
+ "dtype": "string",
43
+ "_type": "Value"
44
+ },
45
+ "task_type": {
46
+ "dtype": "string",
47
+ "_type": "Value"
48
+ },
49
+ "text": {
50
+ "dtype": "string",
51
+ "_type": "Value"
52
+ }
53
+ },
54
+ "wavllm_fairseq": {
55
+ "answer": {
56
+ "dtype": "string",
57
+ "_type": "Value"
58
+ },
59
+ "model_prediction": {
60
+ "dtype": "string",
61
+ "_type": "Value"
62
+ },
63
+ "task_type": {
64
+ "dtype": "string",
65
+ "_type": "Value"
66
+ },
67
+ "text": {
68
+ "dtype": "string",
69
+ "_type": "Value"
70
+ }
71
+ },
72
+ "Qwen2-Audio-7B-Instruct": {
73
+ "answer": {
74
+ "dtype": "string",
75
+ "_type": "Value"
76
+ },
77
+ "model_prediction": {
78
+ "dtype": "string",
79
+ "_type": "Value"
80
+ },
81
+ "task_type": {
82
+ "dtype": "string",
83
+ "_type": "Value"
84
+ },
85
+ "text": {
86
+ "dtype": "string",
87
+ "_type": "Value"
88
+ }
89
+ },
90
+ "whisper_large_v3_with_llama_3_8b_instruct": {
91
+ "answer": {
92
+ "dtype": "string",
93
+ "_type": "Value"
94
+ },
95
+ "model_prediction": {
96
+ "dtype": "string",
97
+ "_type": "Value"
98
+ },
99
+ "task_type": {
100
+ "dtype": "string",
101
+ "_type": "Value"
102
+ },
103
+ "text": {
104
+ "dtype": "string",
105
+ "_type": "Value"
106
+ }
107
+ },
108
+ "qwen_audio_chat": {
109
+ "answer": {
110
+ "dtype": "string",
111
+ "_type": "Value"
112
+ },
113
+ "model_prediction": {
114
+ "dtype": "string",
115
+ "_type": "Value"
116
+ },
117
+ "task_type": {
118
+ "dtype": "string",
119
+ "_type": "Value"
120
+ },
121
+ "text": {
122
+ "dtype": "string",
123
+ "_type": "Value"
124
+ }
125
+ }
126
+ },
127
+ "homepage": "",
128
+ "license": ""
129
+ }
examples/ASQA/Clotho-AQA-Test/sample_0.wav ADDED
Binary file (868 kB). View file
 
examples/ASQA/Clotho-AQA-Test/sample_1.wav ADDED
Binary file (668 kB). View file
 
examples/ASQA/Clotho-AQA-Test/sample_2.wav ADDED
Binary file (636 kB). View file
 
examples/ASQA/Clotho-AQA-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "3f05c76553bf311d",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASQA/WavCaps-QA-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:183517b37463c54c088f5e1011721003d14587380bc111e8ad7f7cfc60fcd9e5
3
+ size 970376
examples/ASQA/WavCaps-QA-Test/dataset_info.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "audio_path": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "caption": {
41
+ "dtype": "string",
42
+ "_type": "Value"
43
+ },
44
+ "duration": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ }
48
+ },
49
+ "salmonn_7b": {
50
+ "answer": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "model_prediction": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ },
58
+ "task_type": {
59
+ "dtype": "string",
60
+ "_type": "Value"
61
+ },
62
+ "text": {
63
+ "dtype": "string",
64
+ "_type": "Value"
65
+ }
66
+ },
67
+ "wavllm_fairseq": {
68
+ "answer": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "model_prediction": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ },
76
+ "task_type": {
77
+ "dtype": "string",
78
+ "_type": "Value"
79
+ },
80
+ "text": {
81
+ "dtype": "string",
82
+ "_type": "Value"
83
+ }
84
+ },
85
+ "Qwen2-Audio-7B-Instruct": {
86
+ "answer": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "model_prediction": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ },
94
+ "task_type": {
95
+ "dtype": "string",
96
+ "_type": "Value"
97
+ },
98
+ "text": {
99
+ "dtype": "string",
100
+ "_type": "Value"
101
+ }
102
+ },
103
+ "whisper_large_v3_with_llama_3_8b_instruct": {
104
+ "answer": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "model_prediction": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ },
112
+ "task_type": {
113
+ "dtype": "string",
114
+ "_type": "Value"
115
+ },
116
+ "text": {
117
+ "dtype": "string",
118
+ "_type": "Value"
119
+ }
120
+ },
121
+ "qwen_audio_chat": {
122
+ "answer": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "model_prediction": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ },
130
+ "task_type": {
131
+ "dtype": "string",
132
+ "_type": "Value"
133
+ },
134
+ "text": {
135
+ "dtype": "string",
136
+ "_type": "Value"
137
+ }
138
+ }
139
+ },
140
+ "homepage": "",
141
+ "license": ""
142
+ }
examples/ASQA/WavCaps-QA-Test/sample_0.wav ADDED
Binary file (320 kB). View file
 
examples/ASQA/WavCaps-QA-Test/sample_1.wav ADDED
Binary file (320 kB). View file
 
examples/ASQA/WavCaps-QA-Test/sample_2.wav ADDED
Binary file (320 kB). View file
 
examples/ASQA/WavCaps-QA-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "f88396310248e252",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASR/Common-Voice-15-En-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44587f333a31075993f3cc8832f8946fd59605246769dd8bf86887dcf3bd889c
3
+ size 543912
examples/ASR/Common-Voice-15-En-Test/dataset_info.json ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "accents": {
37
+ "dtype": "null",
38
+ "_type": "Value"
39
+ },
40
+ "age": {
41
+ "dtype": "null",
42
+ "_type": "Value"
43
+ },
44
+ "client_id": {
45
+ "dtype": "string",
46
+ "_type": "Value"
47
+ },
48
+ "down_votes": {
49
+ "dtype": "int64",
50
+ "_type": "Value"
51
+ },
52
+ "gender": {
53
+ "dtype": "null",
54
+ "_type": "Value"
55
+ },
56
+ "language": {
57
+ "dtype": "string",
58
+ "_type": "Value"
59
+ },
60
+ "locale": {
61
+ "dtype": "string",
62
+ "_type": "Value"
63
+ },
64
+ "segment": {
65
+ "dtype": "null",
66
+ "_type": "Value"
67
+ },
68
+ "up_votes": {
69
+ "dtype": "int64",
70
+ "_type": "Value"
71
+ },
72
+ "variant": {
73
+ "dtype": "null",
74
+ "_type": "Value"
75
+ }
76
+ },
77
+ "salmonn_7b": {
78
+ "answer": {
79
+ "dtype": "string",
80
+ "_type": "Value"
81
+ },
82
+ "model_prediction": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "task_type": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "text": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ }
94
+ },
95
+ "wavllm_fairseq": {
96
+ "answer": {
97
+ "dtype": "string",
98
+ "_type": "Value"
99
+ },
100
+ "model_prediction": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "task_type": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "text": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ }
112
+ },
113
+ "Qwen2-Audio-7B-Instruct": {
114
+ "answer": {
115
+ "dtype": "string",
116
+ "_type": "Value"
117
+ },
118
+ "model_prediction": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "task_type": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "text": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ }
130
+ },
131
+ "whisper_large_v3_with_llama_3_8b_instruct": {
132
+ "answer": {
133
+ "dtype": "string",
134
+ "_type": "Value"
135
+ },
136
+ "model_prediction": {
137
+ "dtype": "string",
138
+ "_type": "Value"
139
+ },
140
+ "task_type": {
141
+ "dtype": "string",
142
+ "_type": "Value"
143
+ },
144
+ "text": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ }
148
+ },
149
+ "qwen_audio_chat": {
150
+ "answer": {
151
+ "dtype": "string",
152
+ "_type": "Value"
153
+ },
154
+ "model_prediction": {
155
+ "dtype": "string",
156
+ "_type": "Value"
157
+ },
158
+ "task_type": {
159
+ "dtype": "string",
160
+ "_type": "Value"
161
+ },
162
+ "text": {
163
+ "dtype": "string",
164
+ "_type": "Value"
165
+ }
166
+ }
167
+ },
168
+ "homepage": "",
169
+ "license": ""
170
+ }
examples/ASR/Common-Voice-15-En-Test/sample_0.wav ADDED
Binary file (202 kB). View file
 
examples/ASR/Common-Voice-15-En-Test/sample_1.wav ADDED
Binary file (118 kB). View file
 
examples/ASR/Common-Voice-15-En-Test/sample_2.wav ADDED
Binary file (214 kB). View file
 
examples/ASR/Common-Voice-15-En-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "30218d56801da2e8",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASR/Earnings21-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33dce5920f131621276ca79ef824fc52d5d5bb043d7672458fdbe037c7fb60fd
3
+ size 310572568
examples/ASR/Earnings21-Test/dataset_info.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "context": {
6
+ "text": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "audio": {
11
+ "sampling_rate": 16000,
12
+ "_type": "Audio"
13
+ }
14
+ },
15
+ "instruction": {
16
+ "audio": {
17
+ "dtype": "null",
18
+ "_type": "Value"
19
+ },
20
+ "text": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ }
24
+ },
25
+ "answer": {
26
+ "audio": {
27
+ "dtype": "null",
28
+ "_type": "Value"
29
+ },
30
+ "text": {
31
+ "dtype": "string",
32
+ "_type": "Value"
33
+ }
34
+ },
35
+ "other_attributes": {
36
+ "id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "salmonn_7b": {
42
+ "answer": {
43
+ "dtype": "string",
44
+ "_type": "Value"
45
+ },
46
+ "model_prediction": {
47
+ "dtype": "string",
48
+ "_type": "Value"
49
+ },
50
+ "task_type": {
51
+ "dtype": "string",
52
+ "_type": "Value"
53
+ },
54
+ "text": {
55
+ "dtype": "string",
56
+ "_type": "Value"
57
+ }
58
+ },
59
+ "wavllm_fairseq": {
60
+ "answer": {
61
+ "dtype": "string",
62
+ "_type": "Value"
63
+ },
64
+ "model_prediction": {
65
+ "dtype": "string",
66
+ "_type": "Value"
67
+ },
68
+ "task_type": {
69
+ "dtype": "string",
70
+ "_type": "Value"
71
+ },
72
+ "text": {
73
+ "dtype": "string",
74
+ "_type": "Value"
75
+ }
76
+ },
77
+ "Qwen2-Audio-7B-Instruct": {
78
+ "answer": {
79
+ "dtype": "string",
80
+ "_type": "Value"
81
+ },
82
+ "model_prediction": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "task_type": {
87
+ "dtype": "string",
88
+ "_type": "Value"
89
+ },
90
+ "text": {
91
+ "dtype": "string",
92
+ "_type": "Value"
93
+ }
94
+ },
95
+ "whisper_large_v3_with_llama_3_8b_instruct": {
96
+ "answer": {
97
+ "dtype": "string",
98
+ "_type": "Value"
99
+ },
100
+ "model_prediction": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "task_type": {
105
+ "dtype": "string",
106
+ "_type": "Value"
107
+ },
108
+ "text": {
109
+ "dtype": "string",
110
+ "_type": "Value"
111
+ }
112
+ },
113
+ "qwen_audio_chat": {
114
+ "answer": {
115
+ "dtype": "string",
116
+ "_type": "Value"
117
+ },
118
+ "model_prediction": {
119
+ "dtype": "string",
120
+ "_type": "Value"
121
+ },
122
+ "task_type": {
123
+ "dtype": "string",
124
+ "_type": "Value"
125
+ },
126
+ "text": {
127
+ "dtype": "string",
128
+ "_type": "Value"
129
+ }
130
+ }
131
+ },
132
+ "homepage": "",
133
+ "license": ""
134
+ }
examples/ASR/Earnings21-Test/sample_0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e88eccc61c5e8a5e62867c52a8ccfc4e108a5f48459f7df0eb1e9ae7f16d4f
3
+ size 139072236
examples/ASR/Earnings21-Test/sample_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcd3ce7847b7fb0f600bb3faeca3cd2a8f4992fd1de17bd831ef6ccfb2623f33
3
+ size 55065644
examples/ASR/Earnings21-Test/sample_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:607725d2a33f206a0f00668f1907f4d997fc7dbe7d1e1f9c82045496412bd8bd
3
+ size 115039268
examples/ASR/Earnings21-Test/state.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "2e3dea299b387757",
8
+ "_format_columns": [
9
+ "context",
10
+ "instruction",
11
+ "answer",
12
+ "other_attributes",
13
+ "salmonn_7b",
14
+ "wavllm_fairseq",
15
+ "Qwen2-Audio-7B-Instruct",
16
+ "whisper_large_v3_with_llama_3_8b_instruct",
17
+ "qwen_audio_chat"
18
+ ],
19
+ "_format_kwargs": {},
20
+ "_format_type": null,
21
+ "_output_all_columns": false,
22
+ "_split": null
23
+ }
examples/ASR/Earnings22-Test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51a45b6f2b6f1eaefc76f48f4778b5d5d46793e0a0fc846b1c2dd4b639dd173f
3
+ size 299513312