narugo committed on
Commit
ab5e723
1 Parent(s): d19276c

Export ONNX version of model 'sanchit-gandhi/whisper-base-ft-common-language-id', on 2024-08-30 01:35:14 CST

Browse files
Files changed (4) hide show
  1. README.md +17 -0
  2. config.json +246 -0
  3. model.onnx +3 -0
  4. preprocessor_config.json +14 -0
README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: sanchit-gandhi/whisper-base-ft-common-language-id
3
+ datasets:
4
+ - common_language
5
+ license: apache-2.0
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: whisper-base-ft-common-language-id
10
+ results: []
11
+ tags:
12
+ - audio-classification
13
+ - generated_from_trainer
14
+ ---
15
+
16
+ This is the ONNX-exported version of [sanchit-gandhi/whisper-base-ft-common-language-id](https://huggingface.co/sanchit-gandhi/whisper-base-ft-common-language-id).
17
+
config.json ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sanchit-gandhi/whisper-base-ft-common-language-id",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperForAudioClassification"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 512,
17
+ "decoder_attention_heads": 8,
18
+ "decoder_ffn_dim": 2048,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 6,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 50257,
28
+ "finetuning_task": "audio-classification",
29
+ "forced_decoder_ids": [
30
+ [
31
+ 1,
32
+ 50259
33
+ ],
34
+ [
35
+ 2,
36
+ 50359
37
+ ],
38
+ [
39
+ 3,
40
+ 50363
41
+ ]
42
+ ],
43
+ "id2label": {
44
+ "0": "Arabic",
45
+ "1": "Basque",
46
+ "2": "Breton",
47
+ "3": "Catalan",
48
+ "4": "Chinese_China",
49
+ "5": "Chinese_Hongkong",
50
+ "6": "Chinese_Taiwan",
51
+ "7": "Chuvash",
52
+ "8": "Czech",
53
+ "9": "Dhivehi",
54
+ "10": "Dutch",
55
+ "11": "English",
56
+ "12": "Esperanto",
57
+ "13": "Estonian",
58
+ "14": "French",
59
+ "15": "Frisian",
60
+ "16": "Georgian",
61
+ "17": "German",
62
+ "18": "Greek",
63
+ "19": "Hakha_Chin",
64
+ "20": "Indonesian",
65
+ "21": "Interlingua",
66
+ "22": "Italian",
67
+ "23": "Japanese",
68
+ "24": "Kabyle",
69
+ "25": "Kinyarwanda",
70
+ "26": "Kyrgyz",
71
+ "27": "Latvian",
72
+ "28": "Maltese",
73
+ "29": "Mangolian",
74
+ "30": "Persian",
75
+ "31": "Polish",
76
+ "32": "Portuguese",
77
+ "33": "Romanian",
78
+ "34": "Romansh_Sursilvan",
79
+ "35": "Russian",
80
+ "36": "Sakha",
81
+ "37": "Slovenian",
82
+ "38": "Spanish",
83
+ "39": "Swedish",
84
+ "40": "Tamil",
85
+ "41": "Tatar",
86
+ "42": "Turkish",
87
+ "43": "Ukranian",
88
+ "44": "Welsh"
89
+ },
90
+ "init_std": 0.02,
91
+ "is_encoder_decoder": true,
92
+ "label2id": {
93
+ "Arabic": "0",
94
+ "Basque": "1",
95
+ "Breton": "2",
96
+ "Catalan": "3",
97
+ "Chinese_China": "4",
98
+ "Chinese_Hongkong": "5",
99
+ "Chinese_Taiwan": "6",
100
+ "Chuvash": "7",
101
+ "Czech": "8",
102
+ "Dhivehi": "9",
103
+ "Dutch": "10",
104
+ "English": "11",
105
+ "Esperanto": "12",
106
+ "Estonian": "13",
107
+ "French": "14",
108
+ "Frisian": "15",
109
+ "Georgian": "16",
110
+ "German": "17",
111
+ "Greek": "18",
112
+ "Hakha_Chin": "19",
113
+ "Indonesian": "20",
114
+ "Interlingua": "21",
115
+ "Italian": "22",
116
+ "Japanese": "23",
117
+ "Kabyle": "24",
118
+ "Kinyarwanda": "25",
119
+ "Kyrgyz": "26",
120
+ "Latvian": "27",
121
+ "Maltese": "28",
122
+ "Mangolian": "29",
123
+ "Persian": "30",
124
+ "Polish": "31",
125
+ "Portuguese": "32",
126
+ "Romanian": "33",
127
+ "Romansh_Sursilvan": "34",
128
+ "Russian": "35",
129
+ "Sakha": "36",
130
+ "Slovenian": "37",
131
+ "Spanish": "38",
132
+ "Swedish": "39",
133
+ "Tamil": "40",
134
+ "Tatar": "41",
135
+ "Turkish": "42",
136
+ "Ukranian": "43",
137
+ "Welsh": "44"
138
+ },
139
+ "mask_feature_length": 10,
140
+ "mask_feature_min_masks": 0,
141
+ "mask_feature_prob": 0.0,
142
+ "mask_time_length": 10,
143
+ "mask_time_min_masks": 2,
144
+ "mask_time_prob": 0.05,
145
+ "max_length": 448,
146
+ "max_source_positions": 1500,
147
+ "max_target_positions": 448,
148
+ "median_filter_width": 7,
149
+ "model_type": "whisper",
150
+ "num_hidden_layers": 6,
151
+ "num_mel_bins": 80,
152
+ "pad_token_id": 50257,
153
+ "scale_embedding": false,
154
+ "suppress_tokens": [
155
+ 1,
156
+ 2,
157
+ 7,
158
+ 8,
159
+ 9,
160
+ 10,
161
+ 14,
162
+ 25,
163
+ 26,
164
+ 27,
165
+ 28,
166
+ 29,
167
+ 31,
168
+ 58,
169
+ 59,
170
+ 60,
171
+ 61,
172
+ 62,
173
+ 63,
174
+ 90,
175
+ 91,
176
+ 92,
177
+ 93,
178
+ 359,
179
+ 503,
180
+ 522,
181
+ 542,
182
+ 873,
183
+ 893,
184
+ 902,
185
+ 918,
186
+ 922,
187
+ 931,
188
+ 1350,
189
+ 1853,
190
+ 1982,
191
+ 2460,
192
+ 2627,
193
+ 3246,
194
+ 3253,
195
+ 3268,
196
+ 3536,
197
+ 3846,
198
+ 3961,
199
+ 4183,
200
+ 4667,
201
+ 6585,
202
+ 6647,
203
+ 7273,
204
+ 9061,
205
+ 9383,
206
+ 10428,
207
+ 10929,
208
+ 11938,
209
+ 12033,
210
+ 12331,
211
+ 12562,
212
+ 13793,
213
+ 14157,
214
+ 14635,
215
+ 15265,
216
+ 15618,
217
+ 16553,
218
+ 16604,
219
+ 18362,
220
+ 18956,
221
+ 20075,
222
+ 21675,
223
+ 22520,
224
+ 26130,
225
+ 26161,
226
+ 26435,
227
+ 28279,
228
+ 29464,
229
+ 31650,
230
+ 32302,
231
+ 32470,
232
+ 36865,
233
+ 42863,
234
+ 47425,
235
+ 49870,
236
+ 50254,
237
+ 50258,
238
+ 50360,
239
+ 50361,
240
+ 50362
241
+ ],
242
+ "transformers_version": "4.43.4",
243
+ "use_cache": true,
244
+ "use_weighted_layer_sum": false,
245
+ "vocab_size": 51865
246
+ }
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d2203033195dd4033b067b5efecef7d9d5e67321072cabac07a1f9e075e1b94
3
+ size 83029854
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 80,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }