csukuangfj committed
Commit ee0d936
Parent: 08d2e6b

small fixes

Files changed (4)
  1. app.py +12 -19
  2. giga-tokens.txt +500 -0
  3. model.py +205 -88
  4. offline_asr.py +0 -427
app.py CHANGED
@@ -37,7 +37,7 @@ def convert_to_wav(in_filename: str) -> str:
     """Convert the input audio file to a wave file"""
     out_filename = in_filename + ".wav"
     logging.info(f"Converting '{in_filename}' to '{out_filename}'")
-    _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' '{out_filename}'")
+    _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
     return out_filename
 
 
@@ -128,31 +128,24 @@ def process(
     logging.info(f"Started at {date_time}")
 
     start = time.time()
-    wave, wave_sample_rate = torchaudio.load(filename)
 
-    if wave_sample_rate != sample_rate:
-        logging.info(
-            f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
-            f"Resampling to {sample_rate}."
-        )
-
-        wave = torchaudio.functional.resample(
-            wave,
-            orig_freq=wave_sample_rate,
-            new_freq=sample_rate,
-        )
-    wave = wave[0]  # use only the first channel.
-
-    hyp = get_pretrained_model(repo_id).decode_waves(
-        [wave],
+    recognizer = get_pretrained_model(
+        repo_id,
         decoding_method=decoding_method,
         num_active_paths=num_active_paths,
-    )[0]
+    )
+    s = recognizer.create_stream()
+
+    s.accept_wave_file(filename)
+    recognizer.decode_stream(s)
+
+    logging.info(s.text)
 
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
 
-    duration = wave.shape[0] / sample_rate
+    metadata = torchaudio.info(filename)
+    duration = metadata.num_frames / sample_rate
     rtf = (end - start) / duration
 
     logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
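The rewritten process() above drops the manual torchaudio load/resample path: convert_to_wav() now forces 16 kHz output via "-ar 16000", and sherpa handles reading the wave file and extracting features through its stream API; torchaudio is only used to read the duration for the RTF computation. A minimal standalone sketch of the same decode flow (the repo id and wav path are placeholders; it assumes model.py from this commit is importable):

# Sketch of the decode flow introduced in app.py above.
# Assumptions: model.py from this commit is on the import path,
# and test.wav is a 16 kHz mono wav file.
from model import get_pretrained_model

recognizer = get_pretrained_model(
    "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
    decoding_method="greedy_search",
    num_active_paths=4,
)

s = recognizer.create_stream()  # one stream per utterance
s.accept_wave_file("test.wav")  # sherpa reads the file and computes fbank features
recognizer.decode_stream(s)     # runs the network and the search
print(s.text)                   # final transcript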
giga-tokens.txt ADDED
@@ -0,0 +1,500 @@
+<blk> 0
+<sos/eos> 1
+<unk> 2
+S 3
+T 4
+▁THE 5
+▁A 6
+E 7
+▁AND 8
+▁TO 9
+N 10
+D 11
+▁OF 12
+' 13
+ING 14
+▁I 15
+Y 16
+▁IN 17
+ED 18
+▁THAT 19
+▁ 20
+P 21
+R 22
+▁YOU 23
+M 24
+RE 25
+ER 26
+C 27
+O 28
+▁IT 29
+L 30
+A 31
+U 32
+G 33
+▁WE 34
+▁IS 35
+▁SO 36
+AL 37
+I 38
+▁S 39
+▁RE 40
+AR 41
+B 42
+▁FOR 43
+▁C 44
+▁BE 45
+LE 46
+F 47
+W 48
+▁E 49
+▁HE 50
+LL 51
+▁WAS 52
+LY 53
+OR 54
+IN 55
+▁F 56
+VE 57
+▁THIS 58
+TH 59
+K 60
+▁ON 61
+IT 62
+▁B 63
+▁WITH 64
+▁BUT 65
+EN 66
+CE 67
+RI 68
+▁DO 69
+UR 70
+▁HAVE 71
+▁DE 72
+▁ME 73
+▁T 74
+ENT 75
+CH 76
+▁THEY 77
+▁NOT 78
+ES 79
+V 80
+▁AS 81
+RA 82
+▁P 83
+ON 84
+TER 85
+▁ARE 86
+▁WHAT 87
+IC 88
+▁ST 89
+▁LIKE 90
+ATION 91
+▁OR 92
+▁CA 93
+▁AT 94
+H 95
+▁KNOW 96
+▁G 97
+AN 98
+▁CON 99
+IL 100
+ND 101
+RO 102
+▁HIS 103
+▁CAN 104
+▁ALL 105
+TE 106
+▁THERE 107
+▁SU 108
+▁MO 109
+▁MA 110
+LI 111
+▁ONE 112
+▁ABOUT 113
+LA 114
+▁CO 115
+- 116
+▁MY 117
+▁HAD 118
+CK 119
+NG 120
+▁NO 121
+MENT 122
+AD 123
+LO 124
+ME 125
+▁AN 126
+▁FROM 127
+NE 128
+▁IF 129
+VER 130
+▁JUST 131
+▁PRO 132
+ION 133
+▁PA 134
+▁WHO 135
+▁SE 136
+EL 137
+IR 138
+▁US 139
+▁UP 140
+▁YOUR 141
+CI 142
+RY 143
+▁GO 144
+▁SHE 145
+▁LE 146
+▁OUT 147
+▁PO 148
+▁HO 149
+ATE 150
+▁BO 151
+▁BY 152
+▁FA 153
+▁MI 154
+AS 155
+MP 156
+▁HER 157
+VI 158
+▁THINK 159
+▁SOME 160
+▁WHEN 161
+▁AH 162
+▁PEOPLE 163
+IG 164
+▁WA 165
+▁TE 166
+▁LA 167
+▁WERE 168
+▁LI 169
+▁WOULD 170
+▁SEE 171
+▁WHICH 172
+DE 173
+GE 174
+▁K 175
+IGHT 176
+▁HA 177
+▁OUR 178
+UN 179
+▁HOW 180
+▁GET 181
+IS 182
+UT 183
+Z 184
+CO 185
+ET 186
+UL 187
+IES 188
+IVE 189
+AT 190
+▁O 191
+▁DON 192
+LU 193
+▁TIME 194
+▁WILL 195
+▁MORE 196
+▁SP 197
+▁NOW 198
+RU 199
+▁THEIR 200
+▁UN 201
+ITY 202
+OL 203
+X 204
+TI 205
+US 206
+▁VERY 207
+TION 208
+▁FI 209
+▁SAY 210
+▁BECAUSE 211
+▁EX 212
+▁RO 213
+ERS 214
+IST 215
+▁DA 216
+TING 217
+▁EN 218
+OM 219
+▁BA 220
+▁BEEN 221
+▁LO 222
+▁UM 223
+AGE 224
+ABLE 225
+▁WO 226
+▁RA 227
+▁OTHER 228
+▁REALLY 229
+ENCE 230
+▁GOING 231
+▁HIM 232
+▁HAS 233
+▁THEM 234
+▁DIS 235
+▁WANT 236
+ID 237
+TA 238
+▁LOOK 239
+KE 240
+▁DID 241
+▁SA 242
+▁VI 243
+▁SAID 244
+▁RIGHT 245
+▁THESE 246
+▁WORK 247
+▁COM 248
+ALLY 249
+FF 250
+QU 251
+AC 252
+▁DR 253
+▁WAY 254
+▁INTO 255
+MO 256
+TED 257
+EST 258
+▁HERE 259
+OK 260
+▁COULD 261
+▁WELL 262
+MA 263
+▁PRE 264
+▁DI 265
+MAN 266
+▁COMP 267
+▁THEN 268
+IM 269
+▁PER 270
+▁NA 271
+▁WHERE 272
+▁TWO 273
+▁WI 274
+▁FE 275
+INE 276
+▁ANY 277
+TURE 278
+▁OVER 279
+BO 280
+ACH 281
+OW 282
+▁MAKE 283
+▁TRA 284
+HE 285
+UND 286
+▁EVEN 287
+ANCE 288
+▁YEAR 289
+HO 290
+AM 291
+▁CHA 292
+▁BACK 293
+VO 294
+ANT 295
+DI 296
+▁ALSO 297
+▁THOSE 298
+▁MAN 299
+CTION 300
+ICAL 301
+▁JO 302
+▁OP 303
+▁NEW 304
+▁MU 305
+▁HU 306
+▁KIND 307
+▁NE 308
+CA 309
+END 310
+TIC 311
+FUL 312
+▁YEAH 313
+SH 314
+▁APP 315
+▁THINGS 316
+SIDE 317
+▁GOOD 318
+ONE 319
+▁TAKE 320
+CU 321
+▁EVERY 322
+▁MEAN 323
+▁FIRST 324
+OP 325
+▁TH 326
+▁MUCH 327
+▁PART 328
+UGH 329
+▁COME 330
+J 331
+▁THAN 332
+▁EXP 333
+▁AGAIN 334
+▁LITTLE 335
+MB 336
+▁NEED 337
+▁TALK 338
+IF 339
+FOR 340
+▁SH 341
+ISH 342
+▁STA 343
+ATED 344
+▁GU 345
+▁LET 346
+IA 347
+▁MAR 348
+▁DOWN 349
+▁DAY 350
+▁GA 351
+▁SOMETHING 352
+▁BU 353
+DUC 354
+HA 355
+▁LOT 356
+▁RU 357
+▁THOUGH 358
+▁GREAT 359
+AIN 360
+▁THROUGH 361
+▁THING 362
+OUS 363
+▁PRI 364
+▁GOT 365
+▁SHOULD 366
+▁AFTER 367
+▁HEAR 368
+▁TA 369
+▁ONLY 370
+▁CHI 371
+IOUS 372
+▁SHA 373
+▁MOST 374
+▁ACTUALLY 375
+▁START 376
+LIC 377
+▁VA 378
+▁RI 379
+DAY 380
+IAN 381
+▁DOES 382
+ROW 383
+▁GRA 384
+ITION 385
+▁MANY 386
+▁BEFORE 387
+▁GIVE 388
+PORT 389
+QUI 390
+▁LIFE 391
+▁WORLD 392
+▁PI 393
+▁LONG 394
+▁THREE 395
+IZE 396
+NESS 397
+▁SHOW 398
+PH 399
+▁WHY 400
+▁QUESTION 401
+WARD 402
+▁THANK 403
+▁PH 404
+▁DIFFERENT 405
+▁OWN 406
+▁FEEL 407
+▁MIGHT 408
+▁HAPPEN 409
+▁MADE 410
+▁BRO 411
+IBLE 412
+▁HI 413
+▁STATE 414
+▁HAND 415
+▁NEVER 416
+▁PLACE 417
+▁LOVE 418
+▁DU 419
+▁POINT 420
+▁HELP 421
+▁COUNT 422
+▁STILL 423
+▁MR 424
+▁FIND 425
+▁PERSON 426
+▁CAME 427
+▁SAME 428
+▁LAST 429
+▁HIGH 430
+▁OLD 431
+▁UNDER 432
+▁FOUR 433
+▁AROUND 434
+▁SORT 435
+▁CHANGE 436
+▁YES 437
+SHIP 438
+▁ANOTHER 439
+ATIVE 440
+▁FOUND 441
+▁JA 442
+▁ALWAYS 443
+▁NEXT 444
+▁TURN 445
+▁JU 446
+▁SIX 447
+▁FACT 448
+▁INTEREST 449
+▁WORD 450
+▁THOUSAND 451
+▁HUNDRED 452
+▁NUMBER 453
+▁IDEA 454
+▁PLAN 455
+▁COURSE 456
+▁SCHOOL 457
+▁HOUSE 458
+▁TWENTY 459
+▁JE 460
+▁PLAY 461
+▁AWAY 462
+▁LEARN 463
+▁HARD 464
+▁WEEK 465
+▁BETTER 466
+▁WHILE 467
+▁FRIEND 468
+▁OKAY 469
+▁NINE 470
+▁UNDERSTAND 471
+▁KEEP 472
+▁GONNA 473
+▁SYSTEM 474
+▁AMERICA 475
+▁POWER 476
+▁IMPORTANT 477
+▁WITHOUT 478
+▁MAYBE 479
+▁SEVEN 480
+▁BETWEEN 481
+▁BUILD 482
+▁CERTAIN 483
+▁PROBLEM 484
+▁MONEY 485
+▁BELIEVE 486
+▁SECOND 487
+▁REASON 488
+▁TOGETHER 489
+▁PUBLIC 490
+▁ANYTHING 491
+▁SPEAK 492
+▁BUSINESS 493
+▁EVERYTHING 494
+▁CLOSE 495
+▁QUITE 496
+▁ANSWER 497
+▁ENOUGH 498
+Q 499
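giga-tokens.txt uses the usual icefall tokens.txt layout: one "SYMBOL ID" pair per line, with "▁" marking a word boundary in the BPE symbols; the GigaSpeech recognizer in model.py below points its tokens field at this file. A small illustrative reader (load_token_table is a hypothetical helper, not part of this commit):

# Illustrative reader for the "SYMBOL ID" layout of giga-tokens.txt.
# load_token_table is a hypothetical helper, not part of this commit.
def load_token_table(filename: str = "giga-tokens.txt") -> dict:
    id2sym = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            sym, idx = line.rsplit(maxsplit=1)  # symbol first, integer id last
            id2sym[int(idx)] = sym
    return id2sym

# Example: ids [5, 163] map to "▁THE" + "▁PEOPLE"; joining and replacing
# "▁" with a space yields "THE PEOPLE" after stripping.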
model.py CHANGED
@@ -16,23 +16,35 @@
 
 from huggingface_hub import hf_hub_download
 from functools import lru_cache
+import sherpa
 
 
-from offline_asr import OfflineAsr
-
 sample_rate = 16000
 
 
 @lru_cache(maxsize=30)
-def get_pretrained_model(repo_id: str) -> OfflineAsr:
+def get_pretrained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
     if repo_id in chinese_models:
-        return chinese_models[repo_id](repo_id)
+        return chinese_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
     elif repo_id in english_models:
-        return english_models[repo_id](repo_id)
+        return english_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
     elif repo_id in chinese_english_mixed_models:
-        return chinese_english_mixed_models[repo_id](repo_id)
+        return chinese_english_mixed_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
     elif repo_id in tibetan_models:
-        return tibetan_models[repo_id](repo_id)
+        return tibetan_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
     else:
         raise ValueError(f"Unsupported repo_id: {repo_id}")
 
@@ -77,7 +89,11 @@ def _get_token_filename(
 
 
 @lru_cache(maxsize=10)
-def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
+def _get_aishell2_pretrained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
     assert repo_id in [
         # context-size 1
         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",  # noqa
@@ -85,44 +101,68 @@ def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",  # noqa
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename="cpu_jit.pt",
     )
-    token_filename = _get_token_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=None,
-        token_filename=token_filename,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_gigaspeech_pre_trained_model(repo_id: str) -> OfflineAsr:
+def _get_gigaspeech_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
     assert repo_id in [
         "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit-iter-3488000-avg-20.pt",
     )
-    bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=bpe_model_filename,
-        token_filename=None,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = "./giga-tokens.txt"
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
+def _get_librispeech_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+) -> sherpa.OfflineRecognizer:
     assert repo_id in [
         "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
@@ -143,107 +183,172 @@ def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
     ):
         filename = "cpu_jit-torch-1.10.pt"
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename=filename,
     )
-    bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=bpe_model_filename,
-        token_filename=None,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_wenetspeech_pre_trained_model(repo_id: str):
+def _get_wenetspeech_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
     assert repo_id in [
         "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
     )
-    token_filename = _get_token_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=None,
-        token_filename=token_filename,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_tal_csasr_pre_trained_model(repo_id: str):
+def _get_tal_csasr_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
     assert repo_id in [
         "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename="cpu_jit.pt",
     )
-    token_filename = _get_token_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=None,
-        token_filename=token_filename,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_alimeeting_pre_trained_model(repo_id: str):
+def _get_alimeeting_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
     assert repo_id in [
         "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename="cpu_jit_torch_1.7.1.pt",
     )
-    token_filename = _get_token_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=None,
-        token_filename=token_filename,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_aidatatang_200zh_pretrained_mode(repo_id: str):
+def _get_aidatatang_200zh_pretrained_mode(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
     assert repo_id in [
         "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
     ], repo_id
 
-    nn_model_filename = _get_nn_model_filename(
+    nn_model = _get_nn_model_filename(
         repo_id=repo_id,
         filename="cpu_jit_torch.1.7.1.pt",
     )
-    token_filename = _get_token_filename(repo_id=repo_id)
-
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=None,
-        token_filename=token_filename,
-        sample_rate=sample_rate,
-        device="cpu",
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
    )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 @lru_cache(maxsize=10)
-def _get_tibetan_pre_trained_model(repo_id: str):
+def _get_tibetan_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
     assert repo_id in [
         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
@@ -254,21 +359,33 @@ def _get_tibetan_pre_trained_model(repo_id: str):
         repo_id
         == "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
     ):
-        nn_model_filename = _get_nn_model_filename(
-            repo_id=repo_id,
-            filename="cpu_jit-epoch-28-avg-23-torch-1.10.0.pt",
-        )
+        filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
+
+    nn_model = _get_nn_model_filename(
+        repo_id=repo_id,
+        filename=filename,
+    )
+
+    tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
 
-    bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
 
-    return OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=bpe_model_filename,
-        token_filename=None,
-        sample_rate=sample_rate,
-        device="cpu",
+    config = sherpa.OfflineRecognizerConfig(
+        nn_model=nn_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
     )
 
+    recognizer = sherpa.OfflineRecognizer(config)
+
+    return recognizer
+
 
 chinese_models = {
     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
offline_asr.py DELETED
@@ -1,427 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
-#
-# Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/offline_asr.py
-#
-# See LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-A standalone script for offline ASR recognition.
-
-It loads a torchscript model, decodes the given wav files, and exits.
-
-Usage:
-    ./offline_asr.py --help
-
-For BPE based models (e.g., LibriSpeech):
-
-    ./offline_asr.py \
-        --nn-model-filename /path/to/cpu_jit.pt \
-        --bpe-model-filename /path/to/bpe.model \
-        --decoding-method greedy_search \
-        ./foo.wav \
-        ./bar.wav \
-        ./foobar.wav
-
-For character based models (e.g., aishell):
-
-    ./offline.py \
-        --nn-model-filename /path/to/cpu_jit.pt \
-        --token-filename /path/to/lang_char/tokens.txt \
-        --decoding-method greedy_search \
-        ./foo.wav \
-        ./bar.wav \
-        ./foobar.wav
-
-Note: We provide pre-trained models for testing.
-
-(1) Pre-trained model with the LibriSpeech dataset
-
-    sudo apt-get install git-lfs
-    git lfs install
-    git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
-
-    nn_model_filename=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-1.6.0.pt
-    bpe_model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model
-
-    wav1=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav
-    wav2=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav
-    wav3=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav
-
-    sherpa/bin/conformer_rnnt/offline_asr.py \
-        --nn-model-filename $nn_model_filename \
-        --bpe-model $bpe_model \
-        $wav1 \
-        $wav2 \
-        $wav3
-
-(2) Pre-trained model with the aishell dataset
-
-    sudo apt-get install git-lfs
-    git lfs install
-    git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
-
-    nn_model_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt
-    token_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt
-
-    wav1=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav
-    wav2=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav
-    wav3=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav
-
-    sherpa/bin/conformer_rnnt/offline_asr.py \
-        --nn-model-filename $nn_model_filename \
-        --token-filename $token_filename \
-        $wav1 \
-        $wav2 \
-        $wav3
-"""
-import argparse
-import functools
-import logging
-from typing import List, Optional, Union
-
-import k2
-import kaldifeat
-import sentencepiece as spm
-import torch
-import torchaudio
-from sherpa import RnntConformerModel
-
-from decode import run_model_and_do_greedy_search, run_model_and_do_modified_beam_search
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument(
-        "--nn-model-filename",
-        type=str,
-        help="""The torchscript model. You can use
-        icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \
-                --jit=1
-        to generate this model.
-        """,
-    )
-
-    parser.add_argument(
-        "--bpe-model-filename",
-        type=str,
-        help="""The BPE model
-        You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx
-        from icefall,
-        where xxx is the number of BPE tokens you used to train the model.
-        Note: Use it only when your model is using BPE. You don't need to
-        provide it if you provide `--token-filename`
-        """,
-    )
-
-    parser.add_argument(
-        "--token-filename",
-        type=str,
-        help="""Filename for tokens.txt
-        You can find it in the directory
-        egs/aishell/ASR/data/lang_char/tokens.txt from icefall.
-        Note: You don't need to provide it if you provide `--bpe-model`
-        """,
-    )
-
-    parser.add_argument(
-        "--decoding-method",
-        type=str,
-        default="greedy_search",
-        help="""Decoding method to use. Currently, only greedy_search and
-        modified_beam_search are implemented.
-        """,
-    )
-
-    parser.add_argument(
-        "--num-active-paths",
-        type=int,
-        default=4,
-        help="""Used only when decoding_method is modified_beam_search.
-        It specifies number of active paths for each utterance. Due to
-        merging paths with identical token sequences, the actual number
-        may be less than "num_active_paths".
-        """,
-    )
-
-    parser.add_argument(
-        "--sample-rate",
-        type=int,
-        default=16000,
-        help="The expected sample rate of the input sound files",
-    )
-
-    parser.add_argument(
-        "sound_files",
-        type=str,
-        nargs="+",
-        help="The input sound file(s) to transcribe. "
-        "Supported formats are those supported by torchaudio.load(). "
-        "For example, wav and flac are supported. "
-        "The sample rate has to equal to `--sample-rate`.",
-    )
-
-    return parser.parse_args()
-
-
-def read_sound_files(
-    filenames: List[str],
-    expected_sample_rate: int,
-) -> List[torch.Tensor]:
-    """Read a list of sound files into a list 1-D float32 torch tensors.
-    Args:
-      filenames:
-        A list of sound filenames.
-      expected_sample_rate:
-        The expected sample rate of the sound files.
-    Returns:
-      Return a list of 1-D float32 torch tensors.
-    """
-    ans = []
-    for f in filenames:
-        wave, sample_rate = torchaudio.load(f)
-        assert sample_rate == expected_sample_rate, (
-            f"expected sample rate: {expected_sample_rate}. " f"Given: {sample_rate}"
-        )
-        # We use only the first channel
-        ans.append(wave[0])
-    return ans
-
-
-class OfflineAsr(object):
-    def __init__(
-        self,
-        nn_model_filename: str,
-        bpe_model_filename: Optional[str] = None,
-        token_filename: Optional[str] = None,
-        decoding_method: str = "greedy_search",
-        num_active_paths: int = 4,
-        sample_rate: int = 16000,
-        device: Union[str, torch.device] = "cpu",
-    ):
-        """
-        Args:
-          nn_model_filename:
-            Path to the torch script model.
-          bpe_model_filename:
-            Path to the BPE model. If it is None, you have to provide
-            `token_filename`.
-          token_filename:
-            Path to tokens.txt. If it is None, you have to provide
-            `bpe_model_filename`.
-          sample_rate:
-            Expected sample rate of the feature extractor.
-          device:
-            The device to use for computation.
-        """
-        self.model = RnntConformerModel(
-            filename=nn_model_filename,
-            device=device,
-            optimize_for_inference=False,
-        )
-
-        if bpe_model_filename:
-            self.sp = spm.SentencePieceProcessor()
-            self.sp.load(bpe_model_filename)
-        else:
-            assert token_filename is not None, token_filename
-            self.token_table = k2.SymbolTable.from_file(token_filename)
-
-        self.feature_extractor = self._build_feature_extractor(
-            sample_rate=sample_rate,
-            device=device,
-        )
-
-        self.device = device
-
-    def _build_feature_extractor(
-        self,
-        sample_rate: int = 16000,
-        device: Union[str, torch.device] = "cpu",
-    ) -> kaldifeat.OfflineFeature:
-        """Build a fbank feature extractor for extracting features.
-
-        Args:
-          sample_rate:
-            Expected sample rate of the feature extractor.
-          device:
-            The device to use for computation.
-        Returns:
-          Return a fbank feature extractor.
-        """
-        opts = kaldifeat.FbankOptions()
-        opts.device = device
-        opts.frame_opts.dither = 0
-        opts.frame_opts.snip_edges = False
-        opts.frame_opts.samp_freq = sample_rate
-        opts.mel_opts.num_bins = 80
-
-        fbank = kaldifeat.Fbank(opts)
-
-        return fbank
-
-    def decode_waves(
-        self,
-        waves: List[torch.Tensor],
-        decoding_method: str,
-        num_active_paths: int,
-    ) -> List[List[str]]:
-        """
-        Args:
-          waves:
-            A list of 1-D torch.float32 tensors containing audio samples.
-            wavs[i] contains audio samples for the i-th utterance.
-
-            Note:
-              Whether it should be in the range [-32768, 32767] or be normalized
-              to [-1, 1] depends on which range you used for your training data.
-              For instance, if your training data used [-32768, 32767],
-              then the given waves have to contain samples in this range.
-
-              All models trained in icefall use the normalized range [-1, 1].
-          decoding_method:
-            The decoding method to use. Currently, only greedy_search and
-            modified_beam_search are implemented.
-          num_active_paths:
-            Used only when decoding_method is modified_beam_search.
-            It specifies number of active paths for each utterance. Due to
-            merging paths with identical token sequences, the actual number
-            may be less than "num_active_paths".
-        Returns:
-          Return a list of decoded results. `ans[i]` contains the decoded
-          results for `wavs[i]`.
-        """
-        assert decoding_method in (
-            "greedy_search",
-            "modified_beam_search",
-        ), decoding_method
-
-        if decoding_method == "greedy_search":
-            nn_and_decoding_func = run_model_and_do_greedy_search
-        elif decoding_method == "modified_beam_search":
-            nn_and_decoding_func = functools.partial(
-                run_model_and_do_modified_beam_search,
-                num_active_paths=num_active_paths,
-            )
-        else:
-            raise ValueError(
-                f"Unsupported decoding_method: {decoding_method} "
-                "Please use greedy_search or modified_beam_search"
-            )
-
-        waves = [w.to(self.device) for w in waves]
-        features = self.feature_extractor(waves)
-
-        tokens = nn_and_decoding_func(self.model, features)
-
-        if hasattr(self, "sp"):
-            results = self.sp.decode(tokens)
-        else:
-            results = [[self.token_table[i] for i in hyp] for hyp in tokens]
-            blank = chr(0x2581)
-            results = ["".join(r) for r in results]
-            results = [r.replace(blank, " ") for r in results]
-
-        return results
-
-
-@torch.no_grad()
-def main():
-    args = get_args()
-    logging.info(vars(args))
-
-    nn_model_filename = args.nn_model_filename
-    bpe_model_filename = args.bpe_model_filename
-    token_filename = args.token_filename
-    decoding_method = args.decoding_method
-    num_active_paths = args.num_active_paths
-    sample_rate = args.sample_rate
-    sound_files = args.sound_files
-
-    assert decoding_method in ("greedy_search", "modified_beam_search"), decoding_method
-
-    if decoding_method == "modified_beam_search":
-        assert num_active_paths >= 1, num_active_paths
-
-    if bpe_model_filename:
-        assert token_filename is None
-
-    if token_filename:
-        assert bpe_model_filename is None
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-
-    logging.info(f"device: {device}")
-
-    offline_asr = OfflineAsr(
-        nn_model_filename=nn_model_filename,
-        bpe_model_filename=bpe_model_filename,
-        token_filename=token_filename,
-        decoding_method=decoding_method,
-        num_active_paths=num_active_paths,
-        sample_rate=sample_rate,
-        device=device,
-    )
-
-    waves = read_sound_files(
-        filenames=sound_files,
-        expected_sample_rate=sample_rate,
-    )
-
-    logging.info("Decoding started.")
-
-    hyps = offline_asr.decode_waves(waves)
-
-    s = "\n"
-    for filename, hyp in zip(sound_files, hyps):
-        s += f"{filename}:\n{hyp}\n\n"
-    logging.info(s)
-
-    logging.info("Decoding done.")
-
-
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-# See https://github.com/pytorch/pytorch/issues/38342
-# and https://github.com/pytorch/pytorch/issues/33354
-#
-# If we don't do this, the delay increases whenever there is
-# a new request that changes the actual batch size.
-# If you use `py-spy dump --pid <server-pid> --native`, you will
-# see a lot of time is spent in re-compiling the torch script model.
-torch._C._jit_set_profiling_executor(False)
-torch._C._jit_set_profiling_mode(False)
-torch._C._set_graph_executor_optimize(False)
-"""
-// Use the following in C++
-torch::jit::getExecutorMode() = false;
-torch::jit::getProfilingMode() = false;
-torch::jit::setGraphExecutorOptimize(false);
-"""
-
-if __name__ == "__main__":
-    torch.manual_seed(20220609)
-
-    formatter = (
-        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"  # noqa
-    )
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    main()