csukuangfj committed on
Commit
1d48bfe
1 Parent(s): 6ffadcd

Add Tibetan, Arabic and German models

Browse files
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Automatic Speech Recognition With Next Gen Kaldi
3
- emoji: 📚
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.0.26
8
  app_file: app.py
1
  ---
2
+ title: Automatic Speech Recognition
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 3.0.26
8
  app_file: app.py
app.py CHANGED
@@ -25,6 +25,7 @@ import time
25
  from datetime import datetime
26
 
27
  import gradio as gr
 
28
  import torchaudio
29
 
30
  from examples import examples
@@ -37,7 +38,7 @@ def convert_to_wav(in_filename: str) -> str:
37
  """Convert the input audio file to a wave file"""
38
  out_filename = in_filename + ".wav"
39
  logging.info(f"Converting '{in_filename}' to '{out_filename}'")
40
- _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' '{out_filename}'")
41
  return out_filename
42
 
43
 
@@ -108,6 +109,7 @@ def process_microphone(
108
  return "", build_html_output(str(e), "result_item_error")
109
 
110
 
 
111
  def process(
112
  language: str,
113
  repo_id: str,
@@ -123,36 +125,32 @@ def process(
123
 
124
  filename = convert_to_wav(in_filename)
125
 
 
 
 
126
  now = datetime.now()
127
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
128
  logging.info(f"Started at {date_time}")
129
 
130
  start = time.time()
131
- wave, wave_sample_rate = torchaudio.load(filename)
132
-
133
- if wave_sample_rate != sample_rate:
134
- logging.info(
135
- f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
136
- f"Resampling to {sample_rate}."
137
- )
138
-
139
- wave = torchaudio.functional.resample(
140
- wave,
141
- orig_freq=wave_sample_rate,
142
- new_freq=sample_rate,
143
- )
144
- wave = wave[0] # use only the first channel.
145
 
146
- hyp = get_pretrained_model(repo_id).decode_waves(
147
- [wave],
148
  decoding_method=decoding_method,
149
  num_active_paths=num_active_paths,
150
- )[0]
 
 
 
 
 
 
151
 
152
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
153
  end = time.time()
154
 
155
- duration = wave.shape[0] / sample_rate
 
156
  rtf = (end - start) / duration
157
 
158
  logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
@@ -164,14 +162,14 @@ def process(
164
  """
165
  if rtf > 1:
166
  info += (
167
- f"<br/>We are loading the model for the first run. "
168
  "Please run again to measure the real RTF.<br/>"
169
  )
170
 
171
  logging.info(info)
172
- logging.info(f"hyp:\n{hyp}")
173
 
174
- return hyp, build_html_output(info)
175
 
176
 
177
  title = "# Automatic Speech Recognition with Next-gen Kaldi"
25
  from datetime import datetime
26
 
27
  import gradio as gr
28
+ import torch
29
  import torchaudio
30
 
31
  from examples import examples
38
  """Convert the input audio file to a wave file"""
39
  out_filename = in_filename + ".wav"
40
  logging.info(f"Converting '{in_filename}' to '{out_filename}'")
41
+ _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
42
  return out_filename
43
 
44
 
109
  return "", build_html_output(str(e), "result_item_error")
110
 
111
 
112
+ @torch.no_grad()
113
  def process(
114
  language: str,
115
  repo_id: str,
125
 
126
  filename = convert_to_wav(in_filename)
127
 
128
+ logging.info(f"filename: {in_filename}")
129
+ os.system(f"ffprobe {filename}")
130
+
131
  now = datetime.now()
132
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
133
  logging.info(f"Started at {date_time}")
134
 
135
  start = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ recognizer = get_pretrained_model(
138
+ repo_id,
139
  decoding_method=decoding_method,
140
  num_active_paths=num_active_paths,
141
+ )
142
+ s = recognizer.create_stream()
143
+
144
+ s.accept_wave_file(filename)
145
+ recognizer.decode_stream(s)
146
+
147
+ text = s.result.text
148
 
149
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
150
  end = time.time()
151
 
152
+ metadata = torchaudio.info(filename)
153
+ duration = metadata.num_frames / sample_rate
154
  rtf = (end - start) / duration
155
 
156
  logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
162
  """
163
  if rtf > 1:
164
  info += (
165
+ "<br/>We are loading the model for the first run. "
166
  "Please run again to measure the real RTF.<br/>"
167
  )
168
 
169
  logging.info(info)
170
+ logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
171
 
172
+ return text, build_html_output(info)
173
 
174
 
175
  title = "# Automatic Speech Recognition with Next-gen Kaldi"
examples.py CHANGED
@@ -176,4 +176,61 @@ examples = [
176
  4,
177
  "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
178
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  ]
176
  4,
177
  "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
178
  ],
179
+ [
180
+ "Tibetan",
181
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
182
+ "greedy_search",
183
+ 4,
184
+ "./test_wavs/tibetan/a_0_cacm-A70_31116.wav",
185
+ ],
186
+ [
187
+ "Tibetan",
188
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
189
+ "greedy_search",
190
+ 4,
191
+ "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
192
+ ],
193
+ [
194
+ "Tibetan",
195
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
196
+ "greedy_search",
197
+ 4,
198
+ "./test_wavs/tibetan/a_0_cacm-A70_31118.wav",
199
+ ],
200
+ # arabic
201
+ [
202
+ "Arabic",
203
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
204
+ "greedy_search",
205
+ 4,
206
+ "./test_wavs/arabic/a.wav",
207
+ ],
208
+ [
209
+ "Arabic",
210
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
211
+ "greedy_search",
212
+ 4,
213
+ "./test_wavs/arabic/b.wav",
214
+ ],
215
+ [
216
+ "Arabic",
217
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
218
+ "greedy_search",
219
+ 4,
220
+ "./test_wavs/arabic/c.wav",
221
+ ],
222
+ [
223
+ "German",
224
+ "csukuangfj/wav2vec2.0-torchaudio",
225
+ "greedy_search",
226
+ 4,
227
+ "./test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav",
228
+ ],
229
+ [
230
+ "German",
231
+ "csukuangfj/wav2vec2.0-torchaudio",
232
+ "greedy_search",
233
+ 4,
234
+ "./test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav",
235
+ ],
236
  ]
giga-tokens.txt ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <blk> 0
2
+ <sos/eos> 1
3
+ <unk> 2
4
+ S 3
5
+ T 4
6
+ ▁THE 5
7
+ ▁A 6
8
+ E 7
9
+ ▁AND 8
10
+ ▁TO 9
11
+ N 10
12
+ D 11
13
+ ▁OF 12
14
+ ' 13
15
+ ING 14
16
+ ▁I 15
17
+ Y 16
18
+ ▁IN 17
19
+ ED 18
20
+ ▁THAT 19
21
+ ▁ 20
22
+ P 21
23
+ R 22
24
+ ▁YOU 23
25
+ M 24
26
+ RE 25
27
+ ER 26
28
+ C 27
29
+ O 28
30
+ ▁IT 29
31
+ L 30
32
+ A 31
33
+ U 32
34
+ G 33
35
+ ▁WE 34
36
+ ▁IS 35
37
+ ▁SO 36
38
+ AL 37
39
+ I 38
40
+ ▁S 39
41
+ ▁RE 40
42
+ AR 41
43
+ B 42
44
+ ▁FOR 43
45
+ ▁C 44
46
+ ▁BE 45
47
+ LE 46
48
+ F 47
49
+ W 48
50
+ ▁E 49
51
+ ▁HE 50
52
+ LL 51
53
+ ▁WAS 52
54
+ LY 53
55
+ OR 54
56
+ IN 55
57
+ ▁F 56
58
+ VE 57
59
+ ▁THIS 58
60
+ TH 59
61
+ K 60
62
+ ▁ON 61
63
+ IT 62
64
+ ▁B 63
65
+ ▁WITH 64
66
+ ▁BUT 65
67
+ EN 66
68
+ CE 67
69
+ RI 68
70
+ ▁DO 69
71
+ UR 70
72
+ ▁HAVE 71
73
+ ▁DE 72
74
+ ▁ME 73
75
+ ▁T 74
76
+ ENT 75
77
+ CH 76
78
+ ▁THEY 77
79
+ ▁NOT 78
80
+ ES 79
81
+ V 80
82
+ ▁AS 81
83
+ RA 82
84
+ ▁P 83
85
+ ON 84
86
+ TER 85
87
+ ▁ARE 86
88
+ ▁WHAT 87
89
+ IC 88
90
+ ▁ST 89
91
+ ▁LIKE 90
92
+ ATION 91
93
+ ▁OR 92
94
+ ▁CA 93
95
+ ▁AT 94
96
+ H 95
97
+ ▁KNOW 96
98
+ ▁G 97
99
+ AN 98
100
+ ▁CON 99
101
+ IL 100
102
+ ND 101
103
+ RO 102
104
+ ▁HIS 103
105
+ ▁CAN 104
106
+ ▁ALL 105
107
+ TE 106
108
+ ▁THERE 107
109
+ ▁SU 108
110
+ ▁MO 109
111
+ ▁MA 110
112
+ LI 111
113
+ ▁ONE 112
114
+ ▁ABOUT 113
115
+ LA 114
116
+ ▁CO 115
117
+ - 116
118
+ ▁MY 117
119
+ ▁HAD 118
120
+ CK 119
121
+ NG 120
122
+ ▁NO 121
123
+ MENT 122
124
+ AD 123
125
+ LO 124
126
+ ME 125
127
+ ▁AN 126
128
+ ▁FROM 127
129
+ NE 128
130
+ ▁IF 129
131
+ VER 130
132
+ ▁JUST 131
133
+ ▁PRO 132
134
+ ION 133
135
+ ▁PA 134
136
+ ▁WHO 135
137
+ ▁SE 136
138
+ EL 137
139
+ IR 138
140
+ ▁US 139
141
+ ▁UP 140
142
+ ▁YOUR 141
143
+ CI 142
144
+ RY 143
145
+ ▁GO 144
146
+ ▁SHE 145
147
+ ▁LE 146
148
+ ▁OUT 147
149
+ ▁PO 148
150
+ ▁HO 149
151
+ ATE 150
152
+ ▁BO 151
153
+ ▁BY 152
154
+ ▁FA 153
155
+ ▁MI 154
156
+ AS 155
157
+ MP 156
158
+ ▁HER 157
159
+ VI 158
160
+ ▁THINK 159
161
+ ▁SOME 160
162
+ ▁WHEN 161
163
+ ▁AH 162
164
+ ▁PEOPLE 163
165
+ IG 164
166
+ ▁WA 165
167
+ ▁TE 166
168
+ ▁LA 167
169
+ ▁WERE 168
170
+ ▁LI 169
171
+ ▁WOULD 170
172
+ ▁SEE 171
173
+ ▁WHICH 172
174
+ DE 173
175
+ GE 174
176
+ ▁K 175
177
+ IGHT 176
178
+ ▁HA 177
179
+ ▁OUR 178
180
+ UN 179
181
+ ▁HOW 180
182
+ ▁GET 181
183
+ IS 182
184
+ UT 183
185
+ Z 184
186
+ CO 185
187
+ ET 186
188
+ UL 187
189
+ IES 188
190
+ IVE 189
191
+ AT 190
192
+ ▁O 191
193
+ ▁DON 192
194
+ LU 193
195
+ ▁TIME 194
196
+ ▁WILL 195
197
+ ▁MORE 196
198
+ ▁SP 197
199
+ ▁NOW 198
200
+ RU 199
201
+ ▁THEIR 200
202
+ ▁UN 201
203
+ ITY 202
204
+ OL 203
205
+ X 204
206
+ TI 205
207
+ US 206
208
+ ▁VERY 207
209
+ TION 208
210
+ ▁FI 209
211
+ ▁SAY 210
212
+ ▁BECAUSE 211
213
+ ▁EX 212
214
+ ▁RO 213
215
+ ERS 214
216
+ IST 215
217
+ ▁DA 216
218
+ TING 217
219
+ ▁EN 218
220
+ OM 219
221
+ ▁BA 220
222
+ ▁BEEN 221
223
+ ▁LO 222
224
+ ▁UM 223
225
+ AGE 224
226
+ ABLE 225
227
+ ▁WO 226
228
+ ▁RA 227
229
+ ▁OTHER 228
230
+ ▁REALLY 229
231
+ ENCE 230
232
+ ▁GOING 231
233
+ ▁HIM 232
234
+ ▁HAS 233
235
+ ▁THEM 234
236
+ ▁DIS 235
237
+ ▁WANT 236
238
+ ID 237
239
+ TA 238
240
+ ▁LOOK 239
241
+ KE 240
242
+ ▁DID 241
243
+ ▁SA 242
244
+ ▁VI 243
245
+ ▁SAID 244
246
+ ▁RIGHT 245
247
+ ▁THESE 246
248
+ ▁WORK 247
249
+ ▁COM 248
250
+ ALLY 249
251
+ FF 250
252
+ QU 251
253
+ AC 252
254
+ ▁DR 253
255
+ ▁WAY 254
256
+ ▁INTO 255
257
+ MO 256
258
+ TED 257
259
+ EST 258
260
+ ▁HERE 259
261
+ OK 260
262
+ ▁COULD 261
263
+ ▁WELL 262
264
+ MA 263
265
+ ▁PRE 264
266
+ ▁DI 265
267
+ MAN 266
268
+ ▁COMP 267
269
+ ▁THEN 268
270
+ IM 269
271
+ ▁PER 270
272
+ ▁NA 271
273
+ ▁WHERE 272
274
+ ▁TWO 273
275
+ ▁WI 274
276
+ ▁FE 275
277
+ INE 276
278
+ ▁ANY 277
279
+ TURE 278
280
+ ▁OVER 279
281
+ BO 280
282
+ ACH 281
283
+ OW 282
284
+ ▁MAKE 283
285
+ ▁TRA 284
286
+ HE 285
287
+ UND 286
288
+ ▁EVEN 287
289
+ ANCE 288
290
+ ▁YEAR 289
291
+ HO 290
292
+ AM 291
293
+ ▁CHA 292
294
+ ▁BACK 293
295
+ VO 294
296
+ ANT 295
297
+ DI 296
298
+ ▁ALSO 297
299
+ ▁THOSE 298
300
+ ▁MAN 299
301
+ CTION 300
302
+ ICAL 301
303
+ ▁JO 302
304
+ ▁OP 303
305
+ ▁NEW 304
306
+ ▁MU 305
307
+ ▁HU 306
308
+ ▁KIND 307
309
+ ▁NE 308
310
+ CA 309
311
+ END 310
312
+ TIC 311
313
+ FUL 312
314
+ ▁YEAH 313
315
+ SH 314
316
+ ▁APP 315
317
+ ▁THINGS 316
318
+ SIDE 317
319
+ ▁GOOD 318
320
+ ONE 319
321
+ ▁TAKE 320
322
+ CU 321
323
+ ▁EVERY 322
324
+ ▁MEAN 323
325
+ ▁FIRST 324
326
+ OP 325
327
+ ▁TH 326
328
+ ▁MUCH 327
329
+ ▁PART 328
330
+ UGH 329
331
+ ▁COME 330
332
+ J 331
333
+ ▁THAN 332
334
+ ▁EXP 333
335
+ ▁AGAIN 334
336
+ ▁LITTLE 335
337
+ MB 336
338
+ ▁NEED 337
339
+ ▁TALK 338
340
+ IF 339
341
+ FOR 340
342
+ ▁SH 341
343
+ ISH 342
344
+ ▁STA 343
345
+ ATED 344
346
+ ▁GU 345
347
+ ▁LET 346
348
+ IA 347
349
+ ▁MAR 348
350
+ ▁DOWN 349
351
+ ▁DAY 350
352
+ ▁GA 351
353
+ ▁SOMETHING 352
354
+ ▁BU 353
355
+ DUC 354
356
+ HA 355
357
+ ▁LOT 356
358
+ ▁RU 357
359
+ ▁THOUGH 358
360
+ ▁GREAT 359
361
+ AIN 360
362
+ ▁THROUGH 361
363
+ ▁THING 362
364
+ OUS 363
365
+ ▁PRI 364
366
+ ▁GOT 365
367
+ ▁SHOULD 366
368
+ ▁AFTER 367
369
+ ▁HEAR 368
370
+ ▁TA 369
371
+ ▁ONLY 370
372
+ ▁CHI 371
373
+ IOUS 372
374
+ ▁SHA 373
375
+ ▁MOST 374
376
+ ▁ACTUALLY 375
377
+ ▁START 376
378
+ LIC 377
379
+ ▁VA 378
380
+ ▁RI 379
381
+ DAY 380
382
+ IAN 381
383
+ ▁DOES 382
384
+ ROW 383
385
+ ▁GRA 384
386
+ ITION 385
387
+ ▁MANY 386
388
+ ▁BEFORE 387
389
+ ▁GIVE 388
390
+ PORT 389
391
+ QUI 390
392
+ ▁LIFE 391
393
+ ▁WORLD 392
394
+ ▁PI 393
395
+ ▁LONG 394
396
+ ▁THREE 395
397
+ IZE 396
398
+ NESS 397
399
+ ▁SHOW 398
400
+ PH 399
401
+ ▁WHY 400
402
+ ▁QUESTION 401
403
+ WARD 402
404
+ ▁THANK 403
405
+ ▁PH 404
406
+ ▁DIFFERENT 405
407
+ ▁OWN 406
408
+ ▁FEEL 407
409
+ ▁MIGHT 408
410
+ ▁HAPPEN 409
411
+ ▁MADE 410
412
+ ▁BRO 411
413
+ IBLE 412
414
+ ▁HI 413
415
+ ▁STATE 414
416
+ ▁HAND 415
417
+ ▁NEVER 416
418
+ ▁PLACE 417
419
+ ▁LOVE 418
420
+ ▁DU 419
421
+ ▁POINT 420
422
+ ▁HELP 421
423
+ ▁COUNT 422
424
+ ▁STILL 423
425
+ ▁MR 424
426
+ ▁FIND 425
427
+ ▁PERSON 426
428
+ ▁CAME 427
429
+ ▁SAME 428
430
+ ▁LAST 429
431
+ ▁HIGH 430
432
+ ▁OLD 431
433
+ ▁UNDER 432
434
+ ▁FOUR 433
435
+ ▁AROUND 434
436
+ ▁SORT 435
437
+ ▁CHANGE 436
438
+ ▁YES 437
439
+ SHIP 438
440
+ ▁ANOTHER 439
441
+ ATIVE 440
442
+ ▁FOUND 441
443
+ ▁JA 442
444
+ ▁ALWAYS 443
445
+ ▁NEXT 444
446
+ ▁TURN 445
447
+ ▁JU 446
448
+ ▁SIX 447
449
+ ▁FACT 448
450
+ ▁INTEREST 449
451
+ ▁WORD 450
452
+ ▁THOUSAND 451
453
+ ▁HUNDRED 452
454
+ ▁NUMBER 453
455
+ ▁IDEA 454
456
+ ▁PLAN 455
457
+ ▁COURSE 456
458
+ ▁SCHOOL 457
459
+ ▁HOUSE 458
460
+ ▁TWENTY 459
461
+ ▁JE 460
462
+ ▁PLAY 461
463
+ ▁AWAY 462
464
+ ▁LEARN 463
465
+ ▁HARD 464
466
+ ▁WEEK 465
467
+ ▁BETTER 466
468
+ ▁WHILE 467
469
+ ▁FRIEND 468
470
+ ▁OKAY 469
471
+ ▁NINE 470
472
+ ▁UNDERSTAND 471
473
+ ▁KEEP 472
474
+ ▁GONNA 473
475
+ ▁SYSTEM 474
476
+ ▁AMERICA 475
477
+ ▁POWER 476
478
+ ▁IMPORTANT 477
479
+ ▁WITHOUT 478
480
+ ▁MAYBE 479
481
+ ▁SEVEN 480
482
+ ▁BETWEEN 481
483
+ ▁BUILD 482
484
+ ▁CERTAIN 483
485
+ ▁PROBLEM 484
486
+ ▁MONEY 485
487
+ ▁BELIEVE 486
488
+ ▁SECOND 487
489
+ ▁REASON 488
490
+ ▁TOGETHER 489
491
+ ▁PUBLIC 490
492
+ ▁ANYTHING 491
493
+ ▁SPEAK 492
494
+ ▁BUSINESS 493
495
+ ▁EVERYTHING 494
496
+ ▁CLOSE 495
497
+ ▁QUITE 496
498
+ ▁ANSWER 497
499
+ ▁ENOUGH 498
500
+ Q 499
model.py CHANGED
@@ -16,21 +16,49 @@
16
 
17
  from huggingface_hub import hf_hub_download
18
  from functools import lru_cache
 
19
 
 
 
 
 
 
 
20
 
21
- from offline_asr import OfflineAsr
22
 
23
  sample_rate = 16000
24
 
25
 
26
  @lru_cache(maxsize=30)
27
- def get_pretrained_model(repo_id: str) -> OfflineAsr:
 
 
 
 
28
  if repo_id in chinese_models:
29
- return chinese_models[repo_id](repo_id)
 
 
30
  elif repo_id in english_models:
31
- return english_models[repo_id](repo_id)
 
 
32
  elif repo_id in chinese_english_mixed_models:
33
- return chinese_english_mixed_models[repo_id](repo_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  else:
35
  raise ValueError(f"Unsupported repo_id: {repo_id}")
36
 
@@ -75,7 +103,11 @@ def _get_token_filename(
75
 
76
 
77
  @lru_cache(maxsize=10)
78
- def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
 
 
 
 
79
  assert repo_id in [
80
  # context-size 1
81
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", # noqa
@@ -83,172 +115,462 @@ def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
83
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", # noqa
84
  ], repo_id
85
 
86
- nn_model_filename = _get_nn_model_filename(
87
  repo_id=repo_id,
88
  filename="cpu_jit.pt",
89
  )
90
- token_filename = _get_token_filename(repo_id=repo_id)
91
-
92
- return OfflineAsr(
93
- nn_model_filename=nn_model_filename,
94
- bpe_model_filename=None,
95
- token_filename=token_filename,
96
- sample_rate=sample_rate,
97
- device="cpu",
 
 
 
 
 
 
98
  )
99
 
 
 
 
 
100
 
101
  @lru_cache(maxsize=10)
102
- def _get_gigaspeech_pre_trained_model(repo_id: str) -> OfflineAsr:
 
 
 
 
103
  assert repo_id in [
104
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
105
  ], repo_id
106
 
107
- nn_model_filename = _get_nn_model_filename(
108
  repo_id=repo_id,
109
  filename="cpu_jit-iter-3488000-avg-20.pt",
110
  )
111
- bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
112
-
113
- return OfflineAsr(
114
- nn_model_filename=nn_model_filename,
115
- bpe_model_filename=bpe_model_filename,
116
- token_filename=None,
117
- sample_rate=sample_rate,
118
- device="cpu",
 
 
 
 
 
 
119
  )
120
 
 
 
 
 
121
 
122
  @lru_cache(maxsize=10)
123
- def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
 
 
 
 
124
  assert repo_id in [
 
125
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa
 
 
126
  ], repo_id
127
 
128
- nn_model_filename = _get_nn_model_filename(
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  repo_id=repo_id,
130
- filename="cpu_jit.pt",
131
  )
132
- bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
133
-
134
- return OfflineAsr(
135
- nn_model_filename=nn_model_filename,
136
- bpe_model_filename=bpe_model_filename,
137
- token_filename=None,
138
- sample_rate=sample_rate,
139
- device="cpu",
 
 
 
 
 
 
140
  )
141
 
 
 
 
 
142
 
143
  @lru_cache(maxsize=10)
144
- def _get_wenetspeech_pre_trained_model(repo_id: str):
 
 
 
 
145
  assert repo_id in [
146
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
147
  ], repo_id
148
 
149
- nn_model_filename = _get_nn_model_filename(
150
  repo_id=repo_id,
151
  filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
152
  )
153
- token_filename = _get_token_filename(repo_id=repo_id)
154
-
155
- return OfflineAsr(
156
- nn_model_filename=nn_model_filename,
157
- bpe_model_filename=None,
158
- token_filename=token_filename,
159
- sample_rate=sample_rate,
160
- device="cpu",
 
 
 
 
 
 
161
  )
162
 
 
 
 
 
163
 
164
  @lru_cache(maxsize=10)
165
- def _get_tal_csasr_pre_trained_model(repo_id: str):
 
 
 
 
166
  assert repo_id in [
167
  "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
168
  ], repo_id
169
 
170
- nn_model_filename = _get_nn_model_filename(
171
  repo_id=repo_id,
172
  filename="cpu_jit.pt",
173
  )
174
- token_filename = _get_token_filename(repo_id=repo_id)
175
-
176
- return OfflineAsr(
177
- nn_model_filename=nn_model_filename,
178
- bpe_model_filename=None,
179
- token_filename=token_filename,
180
- sample_rate=sample_rate,
181
- device="cpu",
 
 
 
 
 
 
182
  )
183
 
 
 
 
 
184
 
185
  @lru_cache(maxsize=10)
186
- def _get_alimeeting_pre_trained_model(repo_id: str):
 
 
 
 
187
  assert repo_id in [
188
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
189
  ], repo_id
190
 
191
- nn_model_filename = _get_nn_model_filename(
192
  repo_id=repo_id,
193
  filename="cpu_jit_torch_1.7.1.pt",
194
  )
195
- token_filename = _get_token_filename(repo_id=repo_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
- return OfflineAsr(
198
- nn_model_filename=nn_model_filename,
199
- bpe_model_filename=None,
200
- token_filename=token_filename,
201
- sample_rate=sample_rate,
202
- device="cpu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  )
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  @lru_cache(maxsize=10)
207
- def _get_aidatatang_200zh_pretrained_mode(repo_id: str):
 
 
 
 
208
  assert repo_id in [
209
  "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
210
  ], repo_id
211
 
212
- nn_model_filename = _get_nn_model_filename(
213
  repo_id=repo_id,
214
  filename="cpu_jit_torch.1.7.1.pt",
215
  )
216
- token_filename = _get_token_filename(repo_id=repo_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
- return OfflineAsr(
219
- nn_model_filename=nn_model_filename,
220
- bpe_model_filename=None,
221
- token_filename=token_filename,
222
- sample_rate=sample_rate,
223
- device="cpu",
 
 
 
 
 
 
 
 
224
  )
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  chinese_models = {
228
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model, # noqa
229
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model, # noqa
230
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model, # noqa
231
- "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model, # noqa
232
  "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode, # noqa
 
 
233
  }
234
 
235
  english_models = {
236
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model, # noqa
 
 
 
237
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model, # noqa
 
238
  }
239
 
240
  chinese_english_mixed_models = {
241
  "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_tal_csasr_pre_trained_model, # noqa
242
  }
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  all_models = {
245
  **chinese_models,
246
  **english_models,
247
  **chinese_english_mixed_models,
 
 
 
248
  }
249
 
250
  language_to_models = {
251
  "Chinese": list(chinese_models.keys()),
252
  "English": list(english_models.keys()),
253
  "Chinese+English": list(chinese_english_mixed_models.keys()),
 
 
 
254
  }
16
 
17
  from huggingface_hub import hf_hub_download
18
  from functools import lru_cache
19
+ import os
20
 
21
+ os.system(
22
+ "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
23
+ )
24
+
25
+ import k2
26
+ import sherpa
27
 
 
28
 
29
  sample_rate = 16000
30
 
31
 
32
  @lru_cache(maxsize=30)
33
def get_pretrained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Return a recognizer for ``repo_id`` built by its registered factory.

    The per-language registries are searched in a fixed order
    (``chinese_models``, ``english_models``, ``chinese_english_mixed_models``,
    ``tibetan_models``, ``arabic_models``, ``german_models``). The first one
    that contains ``repo_id`` supplies the factory that is called.

    Args:
      repo_id:
        Model repository identifier; used as the key into the registries.
      decoding_method:
        Forwarded unchanged to the factory as a keyword argument.
      num_active_paths:
        Forwarded unchanged to the factory as a keyword argument.

    Returns:
      The object returned by the selected factory.

    Raises:
      ValueError: If ``repo_id`` is not a key of any registry.
    """
    registries = (
        chinese_models,
        english_models,
        chinese_english_mixed_models,
        tibetan_models,
        arabic_models,
        german_models,
    )

    for registry in registries:
        if repo_id in registry:
            factory = registry[repo_id]
            return factory(
                repo_id,
                decoding_method=decoding_method,
                num_active_paths=num_active_paths,
            )

    raise ValueError(f"Unsupported repo_id: {repo_id}")
64
 
103
 
104
 
105
  @lru_cache(maxsize=10)
106
+ def _get_aishell2_pretrained_model(
107
+ repo_id: str,
108
+ decoding_method: str,
109
+ num_active_paths: int,
110
+ ) -> sherpa.OfflineRecognizer:
111
  assert repo_id in [
112
  # context-size 1
113
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", # noqa
115
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", # noqa
116
  ], repo_id
117
 
118
+ nn_model = _get_nn_model_filename(
119
  repo_id=repo_id,
120
  filename="cpu_jit.pt",
121
  )
122
+ tokens = _get_token_filename(repo_id=repo_id)
123
+
124
+ feat_config = sherpa.FeatureConfig()
125
+ feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
126
+ feat_config.fbank_opts.mel_opts.num_bins = 80
127
+ feat_config.fbank_opts.frame_opts.dither = 0
128
+
129
+ config = sherpa.OfflineRecognizerConfig(
130
+ nn_model=nn_model,
131
+ tokens=tokens,
132
+ use_gpu=False,
133
+ feat_config=feat_config,
134
+ decoding_method=decoding_method,
135
+ num_active_paths=num_active_paths,
136
  )
137
 
138
+ recognizer = sherpa.OfflineRecognizer(config)
139
+
140
+ return recognizer
141
+
142
 
143
  @lru_cache(maxsize=10)
144
def _get_gigaspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Create a CPU ``sherpa.OfflineRecognizer`` for the GigaSpeech repo.

    Args:
      repo_id:
        Must be the single supported GigaSpeech repository id.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit-iter-3488000-avg-20.pt",
    )
    # Unlike the other factories in this file, the token table is a local
    # file addressed relative to the current working directory.
    token_path = "./giga-tokens.txt"

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
176
+
177
 
178
  @lru_cache(maxsize=10)
179
def _get_librispeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Create a CPU ``sherpa.OfflineRecognizer`` for a LibriSpeech repo.

    Args:
      repo_id:
        One of the four supported LibriSpeech repository ids.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
    )
    assert repo_id in supported_repos, repo_id

    # Two repos publish their TorchScript export under a different file
    # name; every other repo uses the default "cpu_jit.pt".
    filename_overrides = {
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": "cpu_jit-torch-1.10.0.pt",  # noqa
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": "cpu_jit-torch-1.10.pt",  # noqa
    }
    model_filename = filename_overrides.get(repo_id, "cpu_jit.pt")

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename=model_filename,
    )
    token_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_500",
    )

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
227
+
228
 
229
  @lru_cache(maxsize=10)
230
def _get_wenetspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a CPU ``sherpa.OfflineRecognizer`` for the WenetSpeech repo.

    Args:
      repo_id:
        Must be the single supported WenetSpeech repository id.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
    )
    token_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
262
+
263
 
264
  @lru_cache(maxsize=10)
265
def _get_tal_csasr_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a CPU ``sherpa.OfflineRecognizer`` for the TAL-CSASR repo.

    Args:
      repo_id:
        Must be the single supported TAL-CSASR repository id.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit.pt",
    )
    token_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
297
+
298
 
299
  @lru_cache(maxsize=10)
300
def _get_alimeeting_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a CPU ``sherpa.OfflineRecognizer`` for the AliMeeting repo.

    Args:
      repo_id:
        Must be the single supported AliMeeting repository id.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch_1.7.1.pt",
    )
    token_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
332
 
333
+
334
@lru_cache(maxsize=10)
def _get_wenet_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a CPU ``sherpa.OfflineRecognizer`` for a WeNet model repo.

    The model (``final.zip``) and the token table (``units.txt``) are both
    requested from the repository root (``subfolder="."``).

    Args:
      repo_id:
        One of the two supported WeNet repository ids.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "csukuangfj/wenet-chinese-model",
        "csukuangfj/wenet-english-model",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="final.zip",
        subfolder=".",
    )
    token_path = _get_token_filename(
        repo_id=repo_id,
        filename="units.txt",
        subfolder=".",
    )

    # NOTE(review): this is the only factory here that disables
    # normalize_samples; presumably WeNet models expect unscaled sample
    # values -- confirm against the sherpa documentation.
    features = sherpa.FeatureConfig(normalize_samples=False)
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
373
+
374
 
375
  @lru_cache(maxsize=10)
376
def _get_aidatatang_200zh_pretrained_mode(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a CPU ``sherpa.OfflineRecognizer`` for the aidatatang_200zh repo.

    Args:
      repo_id:
        Must be the single supported aidatatang_200zh repository id.
      decoding_method:
        Passed through to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OfflineRecognizerConfig``.

    Returns:
      A recognizer configured with 80-bin fbank features, no dither, and
      the module-level ``sample_rate``.

    Raises:
      AssertionError: If ``repo_id`` is not supported (only while
        assertions are enabled).
    """
    supported_repos = (
        "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
    )
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch.1.7.1.pt",
    )
    token_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
408
+
409
+
410
@lru_cache(maxsize=10)
def _get_tibetan_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline sherpa recognizer for one of the Tibetan repos.

    The result is memoized by ``lru_cache`` on the argument triple.

    Args:
      repo_id:
        Must be one of the two ``syzym/icefall-asr-xbmu-amdo31-*`` repos
        listed below; any other value fails the assertion.
      decoding_method:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
    Returns:
      The ``sherpa.OfflineRecognizer`` created from the assembled config.
    """
    stateless7_repo = (
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02"
    )
    stateless5_repo = (
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
    )
    assert repo_id in [stateless7_repo, stateless5_repo], repo_id

    # The stateless5 repo uses a different model filename than the default.
    if repo_id == stateless5_repo:
        filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
    else:
        filename = "cpu_jit.pt"

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename=filename,
    )
    token_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_500",
    )

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.dither = 0
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
452
+
453
+
454
@lru_cache(maxsize=10)
def _get_arabic_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline sherpa recognizer for the Arabic MGB2 repo.

    The result is memoized by ``lru_cache`` on the argument triple.

    Args:
      repo_id:
        Must be the single supported MGB2 conformer-CTC repo; any other
        value fails the assertion.
      decoding_method:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
    Returns:
      The ``sherpa.OfflineRecognizer`` created from the assembled config.
    """
    supported_repos = [
        "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit.pt",
    )
    token_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_5000",
    )

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.dither = 0
    features.fbank_opts.frame_opts.samp_freq = sample_rate
    features.fbank_opts.mel_opts.num_bins = 80

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=features,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
488
+
489
+
490
@lru_cache(maxsize=10)
def _get_german_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline sherpa recognizer for the German wav2vec 2.0 repo.

    The result is memoized by ``lru_cache`` on the argument triple. No
    ``feat_config`` is passed to the recognizer config for this model.

    Args:
      repo_id:
        Must be ``csukuangfj/wav2vec2.0-torchaudio``; any other value fails
        the assertion.
      decoding_method:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
      num_active_paths:
        Forwarded unchanged to ``sherpa.OfflineRecognizerConfig``.
    Returns:
      The ``sherpa.OfflineRecognizer`` created from the assembled config.
    """
    supported_repos = [
        "csukuangfj/wav2vec2.0-torchaudio",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="voxpopuli_asr_base_10k_de.pt",
        subfolder=".",
    )
    token_path = _get_token_filename(
        repo_id=repo_id,
        filename="tokens-de.txt",
        subfolder=".",
    )

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
523
+
524
 
525
# Per-language registries: repo id -> loader function that builds the
# recognizer for that repo. Insertion order is kept as written.
chinese_models = {
    "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,  # noqa
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,  # noqa
    "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode,  # noqa
    "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,  # noqa
    "csukuangfj/wenet-chinese-model": _get_wenet_model,
}

english_models = {
    "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,  # noqa
    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/wenet-english-model": _get_wenet_model,
}

chinese_english_mixed_models = {
    "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_tal_csasr_pre_trained_model,  # noqa
}

tibetan_models = {
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model,  # noqa
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model,  # noqa
}

arabic_models = {
    "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model,  # noqa
}

german_models = {
    "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
}

# Union of every per-language registry, merged in the order listed.
all_models = {
    repo: loader
    for registry in (
        chinese_models,
        english_models,
        chinese_english_mixed_models,
        tibetan_models,
        arabic_models,
        german_models,
    )
    for repo, loader in registry.items()
}

# Language name -> list of repo ids available for that language.
language_to_models = {
    "Chinese": list(chinese_models),
    "English": list(english_models),
    "Chinese+English": list(chinese_english_mixed_models),
    "Tibetan": list(tibetan_models),
    "Arabic": list(arabic_models),
    "German": list(german_models),
}
offline_asr.py DELETED
@@ -1,427 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
3
- #
4
- # Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/offline_asr.py
5
- #
6
- # See LICENSE for clarification regarding multiple authors
7
- #
8
- # Licensed under the Apache License, Version 2.0 (the "License");
9
- # you may not use this file except in compliance with the License.
10
- # You may obtain a copy of the License at
11
- #
12
- # http://www.apache.org/licenses/LICENSE-2.0
13
- #
14
- # Unless required by applicable law or agreed to in writing, software
15
- # distributed under the License is distributed on an "AS IS" BASIS,
16
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
- # See the License for the specific language governing permissions and
18
- # limitations under the License.
19
- """
20
- A standalone script for offline ASR recognition.
21
-
22
- It loads a torchscript model, decodes the given wav files, and exits.
23
-
24
- Usage:
25
- ./offline_asr.py --help
26
-
27
- For BPE based models (e.g., LibriSpeech):
28
-
29
- ./offline_asr.py \
30
- --nn-model-filename /path/to/cpu_jit.pt \
31
- --bpe-model-filename /path/to/bpe.model \
32
- --decoding-method greedy_search \
33
- ./foo.wav \
34
- ./bar.wav \
35
- ./foobar.wav
36
-
37
- For character based models (e.g., aishell):
38
-
39
- ./offline_asr.py \
40
- --nn-model-filename /path/to/cpu_jit.pt \
41
- --token-filename /path/to/lang_char/tokens.txt \
42
- --decoding-method greedy_search \
43
- ./foo.wav \
44
- ./bar.wav \
45
- ./foobar.wav
46
-
47
- Note: We provide pre-trained models for testing.
48
-
49
- (1) Pre-trained model with the LibriSpeech dataset
50
-
51
- sudo apt-get install git-lfs
52
- git lfs install
53
- git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
54
-
55
- nn_model_filename=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-1.6.0.pt
56
- bpe_model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model
57
-
58
- wav1=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav
59
- wav2=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav
60
- wav3=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav
61
-
62
- sherpa/bin/conformer_rnnt/offline_asr.py \
63
- --nn-model-filename $nn_model_filename \
64
- --bpe-model $bpe_model \
65
- $wav1 \
66
- $wav2 \
67
- $wav3
68
-
69
- (2) Pre-trained model with the aishell dataset
70
-
71
- sudo apt-get install git-lfs
72
- git lfs install
73
- git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
74
-
75
- nn_model_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt
76
- token_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt
77
-
78
- wav1=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav
79
- wav2=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav
80
- wav3=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav
81
-
82
- sherpa/bin/conformer_rnnt/offline_asr.py \
83
- --nn-model-filename $nn_model_filename \
84
- --token-filename $token_filename \
85
- $wav1 \
86
- $wav2 \
87
- $wav3
88
- """
89
- import argparse
90
- import functools
91
- import logging
92
- from typing import List, Optional, Union
93
-
94
- import k2
95
- import kaldifeat
96
- import sentencepiece as spm
97
- import torch
98
- import torchaudio
99
- from sherpa import RnntConformerModel
100
-
101
- from decode import run_model_and_do_greedy_search, run_model_and_do_modified_beam_search
102
-
103
-
104
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The ``argparse.Namespace`` produced by parsing ``sys.argv``. It has the
      attributes ``nn_model_filename``, ``bpe_model_filename``,
      ``token_filename``, ``decoding_method``, ``num_active_paths``,
      ``sample_rate`` and ``sound_files`` (a non-empty list).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--nn-model-filename",
        type=str,
        help="The torchscript model. You can use "
        "icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py "
        "--jit=1 to generate this model.",
    )

    parser.add_argument(
        "--bpe-model-filename",
        type=str,
        help="The BPE model. "
        "You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx "
        "from icefall, "
        "where xxx is the number of BPE tokens you used to train the model. "
        "Note: Use it only when your model is using BPE. You don't need to "
        "provide it if you provide `--token-filename`",
    )

    parser.add_argument(
        "--token-filename",
        type=str,
        # The help text now names the flag exactly as it is declared above.
        help="Filename for tokens.txt. "
        "You can find it in the directory "
        "egs/aishell/ASR/data/lang_char/tokens.txt from icefall. "
        "Note: You don't need to provide it if you provide "
        "`--bpe-model-filename`",
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Decoding method to use. Currently, only greedy_search and "
        "modified_beam_search are implemented.",
    )

    parser.add_argument(
        "--num-active-paths",
        type=int,
        default=4,
        help="Used only when decoding_method is modified_beam_search. "
        "It specifies number of active paths for each utterance. Due to "
        "merging paths with identical token sequences, the actual number "
        'may be less than "num_active_paths".',
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="The expected sample rate of the input sound files",
    )

    parser.add_argument(
        "sound_files",
        type=str,
        nargs="+",
        help="The input sound file(s) to transcribe. "
        "Supported formats are those supported by torchaudio.load(). "
        "For example, wav and flac are supported. "
        "The sample rate has to be equal to `--sample-rate`.",
    )

    return parser.parse_args()
179
-
180
-
181
def read_sound_files(
    filenames: List[str],
    expected_sample_rate: int,
) -> List[torch.Tensor]:
    """Load sound files and keep the first channel of each one.

    Args:
      filenames:
        Paths of the sound files to load with ``torchaudio.load``.
      expected_sample_rate:
        The sample rate every file must have; a mismatch fails an assertion.
    Returns:
      A list with one tensor per input file, in the same order, each being
      channel 0 of the loaded audio.
    """
    first_channels = []
    for path in filenames:
        samples, actual_rate = torchaudio.load(path)
        assert actual_rate == expected_sample_rate, (
            f"expected sample rate: {expected_sample_rate}. "
            f"Given: {actual_rate}"
        )
        # Only the first channel is used.
        first_channels.append(samples[0])
    return first_channels
203
-
204
-
205
class OfflineAsr(object):
    """Offline recognizer wrapping a torchscript RNN-T conformer model.

    It owns the model, a fbank feature extractor and either a sentencepiece
    processor (``self.sp``) or a token table (``self.token_table``) used to
    turn token ids into text.
    """

    def __init__(
        self,
        nn_model_filename: str,
        bpe_model_filename: Optional[str] = None,
        token_filename: Optional[str] = None,
        decoding_method: str = "greedy_search",
        num_active_paths: int = 4,
        sample_rate: int = 16000,
        device: Union[str, torch.device] = "cpu",
    ):
        """
        Args:
          nn_model_filename:
            Path to the torch script model.
          bpe_model_filename:
            Path to the BPE model. If it is None, you have to provide
            `token_filename`.
          token_filename:
            Path to tokens.txt. If it is None, you have to provide
            `bpe_model_filename`.
          decoding_method:
            Default decoding method used by `decode_waves` when the caller
            does not pass one.
          num_active_paths:
            Default number of active paths used by `decode_waves` when the
            caller does not pass one.
          sample_rate:
            Expected sample rate of the feature extractor.
          device:
            The device to use for computation.
        """
        self.model = RnntConformerModel(
            filename=nn_model_filename,
            device=device,
            optimize_for_inference=False,
        )

        if bpe_model_filename:
            self.sp = spm.SentencePieceProcessor()
            self.sp.load(bpe_model_filename)
        else:
            assert token_filename is not None, token_filename
            self.token_table = k2.SymbolTable.from_file(token_filename)

        self.feature_extractor = self._build_feature_extractor(
            sample_rate=sample_rate,
            device=device,
        )

        self.device = device

        # Previously these two arguments were accepted but discarded; keep
        # them so that decode_waves() can fall back to them.
        self.decoding_method = decoding_method
        self.num_active_paths = num_active_paths

    def _build_feature_extractor(
        self,
        sample_rate: int = 16000,
        device: Union[str, torch.device] = "cpu",
    ) -> kaldifeat.OfflineFeature:
        """Build a fbank feature extractor for extracting features.

        Args:
          sample_rate:
            Expected sample rate of the feature extractor.
          device:
            The device to use for computation.
        Returns:
          Return a fbank feature extractor.
        """
        opts = kaldifeat.FbankOptions()
        opts.device = device
        opts.frame_opts.dither = 0
        opts.frame_opts.snip_edges = False
        opts.frame_opts.samp_freq = sample_rate
        opts.mel_opts.num_bins = 80

        fbank = kaldifeat.Fbank(opts)

        return fbank

    def decode_waves(
        self,
        waves: List[torch.Tensor],
        decoding_method: Optional[str] = None,
        num_active_paths: Optional[int] = None,
    ) -> List[str]:
        """
        Args:
          waves:
            A list of 1-D torch.float32 tensors containing audio samples.
            waves[i] contains audio samples for the i-th utterance.

            Note:
              Whether it should be in the range [-32768, 32767] or be
              normalized to [-1, 1] depends on which range you used for your
              training data. For instance, if your training data used
              [-32768, 32767], then the given waves have to contain samples
              in this range.

              All models trained in icefall use the normalized range [-1, 1].
          decoding_method:
            The decoding method to use. Currently, only greedy_search and
            modified_beam_search are implemented. If None, the value given
            to the constructor is used.
          num_active_paths:
            Used only when decoding_method is modified_beam_search.
            It specifies number of active paths for each utterance. Due to
            merging paths with identical token sequences, the actual number
            may be less than "num_active_paths". If None, the value given to
            the constructor is used.
        Returns:
          Return a list of decoded results. `ans[i]` contains the decoded
          result for `waves[i]`.
        """
        if decoding_method is None:
            decoding_method = self.decoding_method
        if num_active_paths is None:
            num_active_paths = self.num_active_paths

        assert decoding_method in (
            "greedy_search",
            "modified_beam_search",
        ), decoding_method

        if decoding_method == "greedy_search":
            nn_and_decoding_func = run_model_and_do_greedy_search
        elif decoding_method == "modified_beam_search":
            nn_and_decoding_func = functools.partial(
                run_model_and_do_modified_beam_search,
                num_active_paths=num_active_paths,
            )
        else:
            # Only reachable when assertions are disabled (python -O).
            raise ValueError(
                f"Unsupported decoding_method: {decoding_method} "
                "Please use greedy_search or modified_beam_search"
            )

        waves = [w.to(self.device) for w in waves]
        features = self.feature_extractor(waves)

        tokens = nn_and_decoding_func(self.model, features)

        if hasattr(self, "sp"):
            # A BPE model was given to the constructor.
            results = self.sp.decode(tokens)
        else:
            # Map ids through the token table, then turn the U+2581 word
            # boundary marker into a plain space.
            results = [[self.token_table[i] for i in hyp] for hyp in tokens]
            blank = chr(0x2581)
            results = ["".join(r) for r in results]
            results = [r.replace(blank, " ") for r in results]

        return results
340
-
341
-
342
@torch.no_grad()
def main():
    """Transcribe the sound files given on the command line and log the text.

    Reads the options from ``get_args()``, checks them, builds an
    ``OfflineAsr`` on CUDA device 0 when CUDA is available (CPU otherwise),
    decodes all files in one call and logs one result per file.
    """
    args = get_args()
    logging.info(vars(args))

    nn_model_filename = args.nn_model_filename
    bpe_model_filename = args.bpe_model_filename
    token_filename = args.token_filename
    decoding_method = args.decoding_method
    num_active_paths = args.num_active_paths
    sample_rate = args.sample_rate
    sound_files = args.sound_files

    assert decoding_method in ("greedy_search", "modified_beam_search"), decoding_method

    if decoding_method == "modified_beam_search":
        assert num_active_paths >= 1, num_active_paths

    # The BPE model and the token file are mutually exclusive.
    if bpe_model_filename:
        assert token_filename is None

    if token_filename:
        assert bpe_model_filename is None

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"device: {device}")

    offline_asr = OfflineAsr(
        nn_model_filename=nn_model_filename,
        bpe_model_filename=bpe_model_filename,
        token_filename=token_filename,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
        sample_rate=sample_rate,
        device=device,
    )

    waves = read_sound_files(
        filenames=sound_files,
        expected_sample_rate=sample_rate,
    )

    logging.info("Decoding started.")

    # decode_waves() declares decoding_method and num_active_paths as
    # required parameters, so they must be passed explicitly here.
    hyps = offline_asr.decode_waves(
        waves,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )

    s = "\n"
    for filename, hyp in zip(sound_files, hyps):
        s += f"{filename}:\n{hyp}\n\n"
    logging.info(s)

    logging.info("Decoding done.")
397
-
398
-
399
# Use one intra-op thread and one inter-op thread.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

# See https://github.com/pytorch/pytorch/issues/38342
# and https://github.com/pytorch/pytorch/issues/33354
#
# If we don't do this, the delay increases whenever there is
# a new request that changes the actual batch size.
# If you use `py-spy dump --pid <server-pid> --native`, you will
# see a lot of time is spent in re-compiling the torch script model.
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)
torch._C._set_graph_executor_optimize(False)
# The C++ equivalent of the three calls above:
#   torch::jit::getExecutorMode() = false;
#   torch::jit::getProfilingMode() = false;
#   torch::jit::setGraphExecutorOptimize(false);

if __name__ == "__main__":
    # Seed torch's RNG with a fixed value before running.
    torch.manual_seed(20220609)

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",  # noqa
        level=logging.INFO,
    )

    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,11 +1,9 @@
1
- https://download.pytorch.org/whl/cpu/torch-1.10.0%2Bcpu-cp38-cp38-linux_x86_64.whl
2
- https://k2-fsa.org/nightly/whl/k2-1.17.dev20220711+cpu.torch1.10.0-cp38-cp38-linux_x86_64.whl
3
- https://download.pytorch.org/whl/cpu/torchaudio-0.10.0%2Bcpu-cp38-cp38-linux_x86_64.whl
4
-
5
-
6
- https://huggingface.co/csukuangfj/wheels/resolve/main/kaldifeat-1.17-cp38-cp38-linux_x86_64.whl
7
- https://huggingface.co/csukuangfj/wheels/resolve/main/k2_sherpa-0.6-cp38-cp38-linux_x86_64.whl
8
 
 
 
 
9
 
10
  sentencepiece>=0.1.96
11
  numpy
1
+ https://download.pytorch.org/whl/cpu/torch-1.13.0%2Bcpu-cp38-cp38-linux_x86_64.whl
2
+ https://download.pytorch.org/whl/cpu/torchaudio-0.13.0%2Bcpu-cp38-cp38-linux_x86_64.whl
 
 
 
 
 
3
 
4
+ https://huggingface.co/csukuangfj/wheels/resolve/main/k2-1.23.2.dev20221204%2Bcpu.torch1.13.0-cp38-cp38-linux_x86_64.whl
5
+ https://huggingface.co/csukuangfj/wheels/resolve/main/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
6
+ https://huggingface.co/csukuangfj/wheels/resolve/main/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
7
 
8
  sentencepiece>=0.1.96
9
  numpy
test_wavs/arabic/a.wav ADDED
Binary file (253 kB). View file
test_wavs/arabic/b.wav ADDED
Binary file (243 kB). View file
test_wavs/arabic/c.wav ADDED
Binary file (150 kB). View file
test_wavs/arabic/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281 بعد أن عجز وبدأ يصدر مشكلات شعبه ومشكلات مصر
2
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244 وهؤلاء أولياء الشيطان ها هو ذا أحدهم الآن ضيفا عليكم على قناة الجزيرة ولا يستحي في ذلك
3
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004 عندما استغاث الليبيون بالعالم استغاثوا لرفع الظلم وليس لقهر إرادة الأمة ومصادرة الحياة الدستورية
test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav ADDED
Binary file (381 kB). View file
test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav ADDED
Binary file (282 kB). View file
test_wavs/tibetan/a_0_cacm-A70_31116.wav ADDED
Binary file (97.4 kB). View file
test_wavs/tibetan/a_0_cacm-A70_31117.wav ADDED
Binary file (128 kB). View file
test_wavs/tibetan/a_0_cacm-A70_31118.wav ADDED
Binary file (87.1 kB). View file
test_wavs/tibetan/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ a_0_cacm-A70_31116.wav ལོ བཅུ ཙམ མ འདང བའི དུས སྐབས ནང
2
+ a_0_cacm-A70_31117.wav དྲག པོའི ངོ ལོག ཟིང འཁྲུག སྒྲིག འཛུགས དང ངན བཀོད བྱས ཡོད
3
+ a_0_cacm-A70_31118.wav གནས བབ འདིའི རིགས གང མགྱོགས འགྱུར བ གཏོང དགོས