simpleoier committed on
Commit
4256c70
1 Parent(s): 14e2586

Update model

Browse files
Files changed (20) hide show
  1. .gitattributes +1 -0
  2. README.md +727 -0
  3. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/config.yaml +657 -0
  4. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/acc_m.png +0 -0
  5. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/acc_u.png +0 -0
  6. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/backward_time.png +0 -0
  7. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/correct_m.png +0 -0
  8. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/correct_u.png +0 -0
  9. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/count_m.png +0 -0
  10. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/count_u.png +0 -0
  11. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/forward_time.png +0 -0
  12. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/gpu_max_cached_mem_GB.png +0 -0
  13. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/iter_time.png +0 -0
  14. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/loss.png +0 -0
  15. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/optim0_lr0.png +0 -0
  16. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/optim_step_time.png +0 -0
  17. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/train_time.png +0 -0
  18. exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/valid.loss.ave.pth +3 -0
  19. exp/kmeans_iter1_hubert_train_960_portion0.1/km_500.mdl +3 -0
  20. meta.yaml +8 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ exp/kmeans_iter1_hubert_train_960_portion0.1/km_500.mdl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,730 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - self-supervised-learning
6
+ language: en
7
+ datasets:
8
+ - librispeech
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SSL model
13
+
14
+ ### `simpleoier/simpleoier_librispeech_hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw`
15
+
16
+ This model was trained by simpleoier using the librispeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 753f40d61813436d4e76660904d02eaed7a6649e
26
+ pip install -e .
27
+ cd egs2/librispeech/ssl1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model simpleoier/simpleoier_librispeech_hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw
29
+ ```
30
+
31
+
32
+
33
+ ## SSL config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_ssl_torchaudiohubert_base_960h_pretrain_it1.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 64
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 8
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 49251
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 250
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 10
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 5.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 2
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: true
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param: []
100
+ ignore_init_mismatch: false
101
+ freeze_param: []
102
+ num_iters_per_epoch: null
103
+ batch_size: 20
104
+ valid_batch_size: null
105
+ batch_bins: 45000000
106
+ valid_batch_bins: null
107
+ train_shape_file:
108
+ - exp/hubert_iter1_stats_raw/train/speech_shape
109
+ - exp/hubert_iter1_stats_raw/train/text_shape.word
110
+ valid_shape_file:
111
+ - exp/hubert_iter1_stats_raw/valid/speech_shape
112
+ - exp/hubert_iter1_stats_raw/valid/text_shape.word
113
+ batch_type: numel
114
+ valid_batch_type: null
115
+ fold_length:
116
+ - 80000
117
+ - 400
118
+ sort_in_batch: descending
119
+ sort_batch: descending
120
+ multiple_iterator: false
121
+ chunk_length: 500
122
+ chunk_shift_ratio: 0.5
123
+ num_cache_chunks: 1024
124
+ train_data_path_and_name_and_type:
125
+ - - dump/raw/train_960/wav.scp
126
+ - speech
127
+ - sound
128
+ - - dump/raw/train_960/text.km.kmeans_iter1_hubert_train_960_portion0.1
129
+ - text
130
+ - text
131
+ valid_data_path_and_name_and_type:
132
+ - - dump/raw/dev/wav.scp
133
+ - speech
134
+ - sound
135
+ - - dump/raw/dev/text.km.kmeans_iter1_hubert_train_960_portion0.1
136
+ - text
137
+ - text
138
+ allow_variable_data_keys: false
139
+ max_cache_size: 0.0
140
+ max_cache_fd: 32
141
+ valid_max_cache_size: null
142
+ optim: adam
143
+ optim_conf:
144
+ lr: 0.0005
145
+ scheduler: warmuplr
146
+ scheduler_conf:
147
+ warmup_steps: 32000
148
+ token_list:
149
+ - '386'
150
+ - '160'
151
+ - '89'
152
+ - '3'
153
+ - '448'
154
+ - '431'
155
+ - '319'
156
+ - '247'
157
+ - '256'
158
+ - '23'
159
+ - '267'
160
+ - '274'
161
+ - '479'
162
+ - '227'
163
+ - '197'
164
+ - '74'
165
+ - '362'
166
+ - '159'
167
+ - '190'
168
+ - '275'
169
+ - '241'
170
+ - '147'
171
+ - '242'
172
+ - '105'
173
+ - '7'
174
+ - '320'
175
+ - '311'
176
+ - '327'
177
+ - '130'
178
+ - '485'
179
+ - '427'
180
+ - '22'
181
+ - '493'
182
+ - '254'
183
+ - '451'
184
+ - '399'
185
+ - '342'
186
+ - '443'
187
+ - '38'
188
+ - '33'
189
+ - '53'
190
+ - '238'
191
+ - '86'
192
+ - '61'
193
+ - '263'
194
+ - '218'
195
+ - '316'
196
+ - '350'
197
+ - '96'
198
+ - '492'
199
+ - '341'
200
+ - '496'
201
+ - '325'
202
+ - '462'
203
+ - '24'
204
+ - '328'
205
+ - '133'
206
+ - '407'
207
+ - '41'
208
+ - '304'
209
+ - '373'
210
+ - '167'
211
+ - '352'
212
+ - '456'
213
+ - '149'
214
+ - '279'
215
+ - '84'
216
+ - '217'
217
+ - '494'
218
+ - '139'
219
+ - '381'
220
+ - '416'
221
+ - '305'
222
+ - '446'
223
+ - '337'
224
+ - '228'
225
+ - '35'
226
+ - '372'
227
+ - '55'
228
+ - '237'
229
+ - '66'
230
+ - '13'
231
+ - '188'
232
+ - '291'
233
+ - '43'
234
+ - '132'
235
+ - '232'
236
+ - '144'
237
+ - '497'
238
+ - '318'
239
+ - '0'
240
+ - '31'
241
+ - '49'
242
+ - '400'
243
+ - '10'
244
+ - '406'
245
+ - '398'
246
+ - '154'
247
+ - '300'
248
+ - '226'
249
+ - '93'
250
+ - '348'
251
+ - '82'
252
+ - '2'
253
+ - '423'
254
+ - '113'
255
+ - '395'
256
+ - '92'
257
+ - '394'
258
+ - '293'
259
+ - '62'
260
+ - '137'
261
+ - '476'
262
+ - '216'
263
+ - '432'
264
+ - '155'
265
+ - '29'
266
+ - '369'
267
+ - '64'
268
+ - '163'
269
+ - '389'
270
+ - '278'
271
+ - '25'
272
+ - '164'
273
+ - '310'
274
+ - '213'
275
+ - '126'
276
+ - '331'
277
+ - '414'
278
+ - '11'
279
+ - '404'
280
+ - '185'
281
+ - '365'
282
+ - '484'
283
+ - '409'
284
+ - '17'
285
+ - '193'
286
+ - '178'
287
+ - '273'
288
+ - '37'
289
+ - '390'
290
+ - '128'
291
+ - '170'
292
+ - '203'
293
+ - '298'
294
+ - '229'
295
+ - '383'
296
+ - '67'
297
+ - '27'
298
+ - '118'
299
+ - '72'
300
+ - '142'
301
+ - '73'
302
+ - '65'
303
+ - '231'
304
+ - '104'
305
+ - '124'
306
+ - '428'
307
+ - '345'
308
+ - '230'
309
+ - '287'
310
+ - '175'
311
+ - '294'
312
+ - '184'
313
+ - '97'
314
+ - '48'
315
+ - '457'
316
+ - '288'
317
+ - '204'
318
+ - '379'
319
+ - '107'
320
+ - '200'
321
+ - '99'
322
+ - '269'
323
+ - '442'
324
+ - '353'
325
+ - '129'
326
+ - '445'
327
+ - '51'
328
+ - '360'
329
+ - '80'
330
+ - '83'
331
+ - '201'
332
+ - '223'
333
+ - '312'
334
+ - '69'
335
+ - '30'
336
+ - '202'
337
+ - '70'
338
+ - '286'
339
+ - '236'
340
+ - '50'
341
+ - '123'
342
+ - '88'
343
+ - '205'
344
+ - '151'
345
+ - '127'
346
+ - '186'
347
+ - '367'
348
+ - '299'
349
+ - '313'
350
+ - '220'
351
+ - '206'
352
+ - '297'
353
+ - '422'
354
+ - '71'
355
+ - '44'
356
+ - '281'
357
+ - '91'
358
+ - '57'
359
+ - '408'
360
+ - '112'
361
+ - '26'
362
+ - '145'
363
+ - '16'
364
+ - '75'
365
+ - '235'
366
+ - '183'
367
+ - '222'
368
+ - '171'
369
+ - '121'
370
+ - '250'
371
+ - '472'
372
+ - '195'
373
+ - '94'
374
+ - '357'
375
+ - '393'
376
+ - '380'
377
+ - '370'
378
+ - '363'
379
+ - '103'
380
+ - '396'
381
+ - '468'
382
+ - '346'
383
+ - '40'
384
+ - '180'
385
+ - '42'
386
+ - '351'
387
+ - '450'
388
+ - '477'
389
+ - '239'
390
+ - '143'
391
+ - '361'
392
+ - '314'
393
+ - '392'
394
+ - '161'
395
+ - '473'
396
+ - '198'
397
+ - '194'
398
+ - '371'
399
+ - '433'
400
+ - '56'
401
+ - '444'
402
+ - '138'
403
+ - '157'
404
+ - '245'
405
+ - '140'
406
+ - '165'
407
+ - '412'
408
+ - '354'
409
+ - '9'
410
+ - '333'
411
+ - '85'
412
+ - '176'
413
+ - '323'
414
+ - '301'
415
+ - '215'
416
+ - '264'
417
+ - '434'
418
+ - '489'
419
+ - '355'
420
+ - '488'
421
+ - '382'
422
+ - '177'
423
+ - '268'
424
+ - '290'
425
+ - '114'
426
+ - '266'
427
+ - '334'
428
+ - '356'
429
+ - '90'
430
+ - '244'
431
+ - '259'
432
+ - '368'
433
+ - '6'
434
+ - '303'
435
+ - '478'
436
+ - '199'
437
+ - '376'
438
+ - '480'
439
+ - '401'
440
+ - '1'
441
+ - '168'
442
+ - '453'
443
+ - '19'
444
+ - '54'
445
+ - '221'
446
+ - '100'
447
+ - '4'
448
+ - '495'
449
+ - '77'
450
+ - '240'
451
+ - '45'
452
+ - '481'
453
+ - '224'
454
+ - '20'
455
+ - '120'
456
+ - '58'
457
+ - '162'
458
+ - '12'
459
+ - '109'
460
+ - '491'
461
+ - '115'
462
+ - '397'
463
+ - '340'
464
+ - '196'
465
+ - '68'
466
+ - '34'
467
+ - '415'
468
+ - '429'
469
+ - '421'
470
+ - '475'
471
+ - '335'
472
+ - '338'
473
+ - '172'
474
+ - '39'
475
+ - '258'
476
+ - '330'
477
+ - '246'
478
+ - '425'
479
+ - '296'
480
+ - '125'
481
+ - '60'
482
+ - '52'
483
+ - '271'
484
+ - '173'
485
+ - '469'
486
+ - '289'
487
+ - '439'
488
+ - '207'
489
+ - '487'
490
+ - '272'
491
+ - '332'
492
+ - '284'
493
+ - '308'
494
+ - '388'
495
+ - '95'
496
+ - '248'
497
+ - '101'
498
+ - '36'
499
+ - '14'
500
+ - '315'
501
+ - '262'
502
+ - '146'
503
+ - '343'
504
+ - '79'
505
+ - '426'
506
+ - '21'
507
+ - '253'
508
+ - '63'
509
+ - '292'
510
+ - '81'
511
+ - '385'
512
+ - '309'
513
+ - '366'
514
+ - '116'
515
+ - '131'
516
+ - '87'
517
+ - '449'
518
+ - '283'
519
+ - '214'
520
+ - '474'
521
+ - '329'
522
+ - '471'
523
+ - '225'
524
+ - '108'
525
+ - '136'
526
+ - '148'
527
+ - '306'
528
+ - '150'
529
+ - '378'
530
+ - '460'
531
+ - '307'
532
+ - '141'
533
+ - '98'
534
+ - '436'
535
+ - '402'
536
+ - '192'
537
+ - '8'
538
+ - '483'
539
+ - '440'
540
+ - '47'
541
+ - '466'
542
+ - '486'
543
+ - '5'
544
+ - '257'
545
+ - '447'
546
+ - '377'
547
+ - '111'
548
+ - '251'
549
+ - '490'
550
+ - '265'
551
+ - '438'
552
+ - '158'
553
+ - '384'
554
+ - '135'
555
+ - '102'
556
+ - '276'
557
+ - '211'
558
+ - '219'
559
+ - '187'
560
+ - '347'
561
+ - '32'
562
+ - '182'
563
+ - '169'
564
+ - '410'
565
+ - '455'
566
+ - '461'
567
+ - '482'
568
+ - '374'
569
+ - '463'
570
+ - '452'
571
+ - '59'
572
+ - '152'
573
+ - '174'
574
+ - '418'
575
+ - '166'
576
+ - '470'
577
+ - '459'
578
+ - '153'
579
+ - '179'
580
+ - '498'
581
+ - '430'
582
+ - '419'
583
+ - '467'
584
+ - '208'
585
+ - '326'
586
+ - '210'
587
+ - '270'
588
+ - '243'
589
+ - '255'
590
+ - '233'
591
+ - '261'
592
+ - '336'
593
+ - '282'
594
+ - '234'
595
+ - '464'
596
+ - '181'
597
+ - '156'
598
+ - '359'
599
+ - '454'
600
+ - '420'
601
+ - '28'
602
+ - '249'
603
+ - '106'
604
+ - '302'
605
+ - '191'
606
+ - '209'
607
+ - '46'
608
+ - '117'
609
+ - '403'
610
+ - '280'
611
+ - '324'
612
+ - '458'
613
+ - '134'
614
+ - '122'
615
+ - '212'
616
+ - '18'
617
+ - '437'
618
+ - '78'
619
+ - '375'
620
+ - '252'
621
+ - '405'
622
+ - '295'
623
+ - '435'
624
+ - '317'
625
+ - '260'
626
+ - '364'
627
+ - '322'
628
+ - '15'
629
+ - '339'
630
+ - '413'
631
+ - '465'
632
+ - '285'
633
+ - '189'
634
+ - '417'
635
+ - '344'
636
+ - '110'
637
+ - '119'
638
+ - '277'
639
+ - '499'
640
+ - '358'
641
+ - '411'
642
+ - '387'
643
+ - '349'
644
+ - '424'
645
+ - '391'
646
+ - '76'
647
+ - '441'
648
+ - '321'
649
+ - <unk>
650
+ - <sos/eos>
651
+ init: null
652
+ collate_fn_conf:
653
+ label_downsampling: 1
654
+ pad: false
655
+ rand_crop: true
656
+ input_size: 1
657
+ num_classes: 500
658
+ use_preprocessor: true
659
+ token_type: word
660
+ bpemodel: null
661
+ non_linguistic_symbols: null
662
+ cleaner: null
663
+ g2p: null
664
+ speech_volume_normalize: null
665
+ rir_scp: null
666
+ rir_apply_prob: 1.0
667
+ noise_scp: null
668
+ noise_apply_prob: 1.0
669
+ noise_db_range: '13_15'
670
+ pred_masked_weight: 1.0
671
+ pred_nomask_weight: 0.0
672
+ loss_weights: 0.0
673
+ frontend: null
674
+ frontend_conf: {}
675
+ specaug: null
676
+ specaug_conf: {}
677
+ normalize: null
678
+ normalize_conf: {}
679
+ preencoder: null
680
+ preencoder_conf: {}
681
+ encoder: torchaudio_hubert
682
+ encoder_conf:
683
+ encoder_projection_dropout: 0.1
684
+ encoder_attention_dropout: 0.1
685
+ encoder_ff_interm_dropout: 0.0
686
+ encoder_dropout: 0.1
687
+ encoder_layer_drop: 0.05
688
+ model: torchaudio
689
+ model_conf: {}
690
+ required:
691
+ - output_dir
692
+ - token_list
693
+ version: '202209'
694
+ distributed: true
695
+ ```
696
+
697
+ </details>
698
+
699
+
700
+
701
+ ### Citing ESPnet
702
+
703
+ ```bibtex
704
+ @inproceedings{watanabe2018espnet,
705
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
706
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
707
+ year={2018},
708
+ booktitle={Proceedings of Interspeech},
709
+ pages={2207--2211},
710
+ doi={10.21437/Interspeech.2018-1456},
711
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
712
+ }
713
+
714
+
715
+
716
+
717
+ ```
718
+
719
+ or arXiv:
720
+
721
+ ```bibtex
722
+ @misc{watanabe2018espnet,
723
+ title={ESPnet: End-to-End Speech Processing Toolkit},
724
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
725
+ year={2018},
726
+ eprint={1804.00015},
727
+ archivePrefix={arXiv},
728
+ primaryClass={cs.CL}
729
+ }
730
+ ```
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/config.yaml ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_ssl_torchaudiohubert_base_960h_pretrain_it1.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 64
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 8
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 49251
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 250
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: true
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 45000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/hubert_iter1_stats_raw/train/speech_shape
72
+ - exp/hubert_iter1_stats_raw/train/text_shape.word
73
+ valid_shape_file:
74
+ - exp/hubert_iter1_stats_raw/valid/speech_shape
75
+ - exp/hubert_iter1_stats_raw/valid/text_shape.word
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 80000
80
+ - 400
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train_960/wav.scp
89
+ - speech
90
+ - sound
91
+ - - dump/raw/train_960/text.km.kmeans_iter1_hubert_train_960_portion0.1
92
+ - text
93
+ - text
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/wav.scp
96
+ - speech
97
+ - sound
98
+ - - dump/raw/dev/text.km.kmeans_iter1_hubert_train_960_portion0.1
99
+ - text
100
+ - text
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adam
106
+ optim_conf:
107
+ lr: 0.0005
108
+ scheduler: warmuplr
109
+ scheduler_conf:
110
+ warmup_steps: 32000
111
+ token_list:
112
+ - '386'
113
+ - '160'
114
+ - '89'
115
+ - '3'
116
+ - '448'
117
+ - '431'
118
+ - '319'
119
+ - '247'
120
+ - '256'
121
+ - '23'
122
+ - '267'
123
+ - '274'
124
+ - '479'
125
+ - '227'
126
+ - '197'
127
+ - '74'
128
+ - '362'
129
+ - '159'
130
+ - '190'
131
+ - '275'
132
+ - '241'
133
+ - '147'
134
+ - '242'
135
+ - '105'
136
+ - '7'
137
+ - '320'
138
+ - '311'
139
+ - '327'
140
+ - '130'
141
+ - '485'
142
+ - '427'
143
+ - '22'
144
+ - '493'
145
+ - '254'
146
+ - '451'
147
+ - '399'
148
+ - '342'
149
+ - '443'
150
+ - '38'
151
+ - '33'
152
+ - '53'
153
+ - '238'
154
+ - '86'
155
+ - '61'
156
+ - '263'
157
+ - '218'
158
+ - '316'
159
+ - '350'
160
+ - '96'
161
+ - '492'
162
+ - '341'
163
+ - '496'
164
+ - '325'
165
+ - '462'
166
+ - '24'
167
+ - '328'
168
+ - '133'
169
+ - '407'
170
+ - '41'
171
+ - '304'
172
+ - '373'
173
+ - '167'
174
+ - '352'
175
+ - '456'
176
+ - '149'
177
+ - '279'
178
+ - '84'
179
+ - '217'
180
+ - '494'
181
+ - '139'
182
+ - '381'
183
+ - '416'
184
+ - '305'
185
+ - '446'
186
+ - '337'
187
+ - '228'
188
+ - '35'
189
+ - '372'
190
+ - '55'
191
+ - '237'
192
+ - '66'
193
+ - '13'
194
+ - '188'
195
+ - '291'
196
+ - '43'
197
+ - '132'
198
+ - '232'
199
+ - '144'
200
+ - '497'
201
+ - '318'
202
+ - '0'
203
+ - '31'
204
+ - '49'
205
+ - '400'
206
+ - '10'
207
+ - '406'
208
+ - '398'
209
+ - '154'
210
+ - '300'
211
+ - '226'
212
+ - '93'
213
+ - '348'
214
+ - '82'
215
+ - '2'
216
+ - '423'
217
+ - '113'
218
+ - '395'
219
+ - '92'
220
+ - '394'
221
+ - '293'
222
+ - '62'
223
+ - '137'
224
+ - '476'
225
+ - '216'
226
+ - '432'
227
+ - '155'
228
+ - '29'
229
+ - '369'
230
+ - '64'
231
+ - '163'
232
+ - '389'
233
+ - '278'
234
+ - '25'
235
+ - '164'
236
+ - '310'
237
+ - '213'
238
+ - '126'
239
+ - '331'
240
+ - '414'
241
+ - '11'
242
+ - '404'
243
+ - '185'
244
+ - '365'
245
+ - '484'
246
+ - '409'
247
+ - '17'
248
+ - '193'
249
+ - '178'
250
+ - '273'
251
+ - '37'
252
+ - '390'
253
+ - '128'
254
+ - '170'
255
+ - '203'
256
+ - '298'
257
+ - '229'
258
+ - '383'
259
+ - '67'
260
+ - '27'
261
+ - '118'
262
+ - '72'
263
+ - '142'
264
+ - '73'
265
+ - '65'
266
+ - '231'
267
+ - '104'
268
+ - '124'
269
+ - '428'
270
+ - '345'
271
+ - '230'
272
+ - '287'
273
+ - '175'
274
+ - '294'
275
+ - '184'
276
+ - '97'
277
+ - '48'
278
+ - '457'
279
+ - '288'
280
+ - '204'
281
+ - '379'
282
+ - '107'
283
+ - '200'
284
+ - '99'
285
+ - '269'
286
+ - '442'
287
+ - '353'
288
+ - '129'
289
+ - '445'
290
+ - '51'
291
+ - '360'
292
+ - '80'
293
+ - '83'
294
+ - '201'
295
+ - '223'
296
+ - '312'
297
+ - '69'
298
+ - '30'
299
+ - '202'
300
+ - '70'
301
+ - '286'
302
+ - '236'
303
+ - '50'
304
+ - '123'
305
+ - '88'
306
+ - '205'
307
+ - '151'
308
+ - '127'
309
+ - '186'
310
+ - '367'
311
+ - '299'
312
+ - '313'
313
+ - '220'
314
+ - '206'
315
+ - '297'
316
+ - '422'
317
+ - '71'
318
+ - '44'
319
+ - '281'
320
+ - '91'
321
+ - '57'
322
+ - '408'
323
+ - '112'
324
+ - '26'
325
+ - '145'
326
+ - '16'
327
+ - '75'
328
+ - '235'
329
+ - '183'
330
+ - '222'
331
+ - '171'
332
+ - '121'
333
+ - '250'
334
+ - '472'
335
+ - '195'
336
+ - '94'
337
+ - '357'
338
+ - '393'
339
+ - '380'
340
+ - '370'
341
+ - '363'
342
+ - '103'
343
+ - '396'
344
+ - '468'
345
+ - '346'
346
+ - '40'
347
+ - '180'
348
+ - '42'
349
+ - '351'
350
+ - '450'
351
+ - '477'
352
+ - '239'
353
+ - '143'
354
+ - '361'
355
+ - '314'
356
+ - '392'
357
+ - '161'
358
+ - '473'
359
+ - '198'
360
+ - '194'
361
+ - '371'
362
+ - '433'
363
+ - '56'
364
+ - '444'
365
+ - '138'
366
+ - '157'
367
+ - '245'
368
+ - '140'
369
+ - '165'
370
+ - '412'
371
+ - '354'
372
+ - '9'
373
+ - '333'
374
+ - '85'
375
+ - '176'
376
+ - '323'
377
+ - '301'
378
+ - '215'
379
+ - '264'
380
+ - '434'
381
+ - '489'
382
+ - '355'
383
+ - '488'
384
+ - '382'
385
+ - '177'
386
+ - '268'
387
+ - '290'
388
+ - '114'
389
+ - '266'
390
+ - '334'
391
+ - '356'
392
+ - '90'
393
+ - '244'
394
+ - '259'
395
+ - '368'
396
+ - '6'
397
+ - '303'
398
+ - '478'
399
+ - '199'
400
+ - '376'
401
+ - '480'
402
+ - '401'
403
+ - '1'
404
+ - '168'
405
+ - '453'
406
+ - '19'
407
+ - '54'
408
+ - '221'
409
+ - '100'
410
+ - '4'
411
+ - '495'
412
+ - '77'
413
+ - '240'
414
+ - '45'
415
+ - '481'
416
+ - '224'
417
+ - '20'
418
+ - '120'
419
+ - '58'
420
+ - '162'
421
+ - '12'
422
+ - '109'
423
+ - '491'
424
+ - '115'
425
+ - '397'
426
+ - '340'
427
+ - '196'
428
+ - '68'
429
+ - '34'
430
+ - '415'
431
+ - '429'
432
+ - '421'
433
+ - '475'
434
+ - '335'
435
+ - '338'
436
+ - '172'
437
+ - '39'
438
+ - '258'
439
+ - '330'
440
+ - '246'
441
+ - '425'
442
+ - '296'
443
+ - '125'
444
+ - '60'
445
+ - '52'
446
+ - '271'
447
+ - '173'
448
+ - '469'
449
+ - '289'
450
+ - '439'
451
+ - '207'
452
+ - '487'
453
+ - '272'
454
+ - '332'
455
+ - '284'
456
+ - '308'
457
+ - '388'
458
+ - '95'
459
+ - '248'
460
+ - '101'
461
+ - '36'
462
+ - '14'
463
+ - '315'
464
+ - '262'
465
+ - '146'
466
+ - '343'
467
+ - '79'
468
+ - '426'
469
+ - '21'
470
+ - '253'
471
+ - '63'
472
+ - '292'
473
+ - '81'
474
+ - '385'
475
+ - '309'
476
+ - '366'
477
+ - '116'
478
+ - '131'
479
+ - '87'
480
+ - '449'
481
+ - '283'
482
+ - '214'
483
+ - '474'
484
+ - '329'
485
+ - '471'
486
+ - '225'
487
+ - '108'
488
+ - '136'
489
+ - '148'
490
+ - '306'
491
+ - '150'
492
+ - '378'
493
+ - '460'
494
+ - '307'
495
+ - '141'
496
+ - '98'
497
+ - '436'
498
+ - '402'
499
+ - '192'
500
+ - '8'
501
+ - '483'
502
+ - '440'
503
+ - '47'
504
+ - '466'
505
+ - '486'
506
+ - '5'
507
+ - '257'
508
+ - '447'
509
+ - '377'
510
+ - '111'
511
+ - '251'
512
+ - '490'
513
+ - '265'
514
+ - '438'
515
+ - '158'
516
+ - '384'
517
+ - '135'
518
+ - '102'
519
+ - '276'
520
+ - '211'
521
+ - '219'
522
+ - '187'
523
+ - '347'
524
+ - '32'
525
+ - '182'
526
+ - '169'
527
+ - '410'
528
+ - '455'
529
+ - '461'
530
+ - '482'
531
+ - '374'
532
+ - '463'
533
+ - '452'
534
+ - '59'
535
+ - '152'
536
+ - '174'
537
+ - '418'
538
+ - '166'
539
+ - '470'
540
+ - '459'
541
+ - '153'
542
+ - '179'
543
+ - '498'
544
+ - '430'
545
+ - '419'
546
+ - '467'
547
+ - '208'
548
+ - '326'
549
+ - '210'
550
+ - '270'
551
+ - '243'
552
+ - '255'
553
+ - '233'
554
+ - '261'
555
+ - '336'
556
+ - '282'
557
+ - '234'
558
+ - '464'
559
+ - '181'
560
+ - '156'
561
+ - '359'
562
+ - '454'
563
+ - '420'
564
+ - '28'
565
+ - '249'
566
+ - '106'
567
+ - '302'
568
+ - '191'
569
+ - '209'
570
+ - '46'
571
+ - '117'
572
+ - '403'
573
+ - '280'
574
+ - '324'
575
+ - '458'
576
+ - '134'
577
+ - '122'
578
+ - '212'
579
+ - '18'
580
+ - '437'
581
+ - '78'
582
+ - '375'
583
+ - '252'
584
+ - '405'
585
+ - '295'
586
+ - '435'
587
+ - '317'
588
+ - '260'
589
+ - '364'
590
+ - '322'
591
+ - '15'
592
+ - '339'
593
+ - '413'
594
+ - '465'
595
+ - '285'
596
+ - '189'
597
+ - '417'
598
+ - '344'
599
+ - '110'
600
+ - '119'
601
+ - '277'
602
+ - '499'
603
+ - '358'
604
+ - '411'
605
+ - '387'
606
+ - '349'
607
+ - '424'
608
+ - '391'
609
+ - '76'
610
+ - '441'
611
+ - '321'
612
+ - <unk>
613
+ - <sos/eos>
614
+ init: null
615
+ collate_fn_conf:
616
+ label_downsampling: 1
617
+ pad: false
618
+ rand_crop: true
619
+ input_size: 1
620
+ num_classes: 500
621
+ use_preprocessor: true
622
+ token_type: word
623
+ bpemodel: null
624
+ non_linguistic_symbols: null
625
+ cleaner: null
626
+ g2p: null
627
+ speech_volume_normalize: null
628
+ rir_scp: null
629
+ rir_apply_prob: 1.0
630
+ noise_scp: null
631
+ noise_apply_prob: 1.0
632
+ noise_db_range: '13_15'
633
+ pred_masked_weight: 1.0
634
+ pred_nomask_weight: 0.0
635
+ loss_weights: 0.0
636
+ frontend: null
637
+ frontend_conf: {}
638
+ specaug: null
639
+ specaug_conf: {}
640
+ normalize: null
641
+ normalize_conf: {}
642
+ preencoder: null
643
+ preencoder_conf: {}
644
+ encoder: torchaudio_hubert
645
+ encoder_conf:
646
+ encoder_projection_dropout: 0.1
647
+ encoder_attention_dropout: 0.1
648
+ encoder_ff_interm_dropout: 0.0
649
+ encoder_dropout: 0.1
650
+ encoder_layer_drop: 0.05
651
+ model: torchaudio
652
+ model_conf: {}
653
+ required:
654
+ - output_dir
655
+ - token_list
656
+ version: '202209'
657
+ distributed: true
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/acc_m.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/acc_u.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/backward_time.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/correct_m.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/correct_u.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/count_m.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/count_u.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/forward_time.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/iter_time.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/loss.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/optim0_lr0.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/optim_step_time.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/images/train_time.png ADDED
exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/valid.loss.ave.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7e7ff3bfc454ba573eec0d6d8a70863b486c62e0c377a0452d02871caeab087
3
+ size 378890737
exp/kmeans_iter1_hubert_train_960_portion0.1/km_500.mdl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0468a440908f8d60a0c69aa525bae5e4c4d2c2d7eb9fa0a86e84c6e45e3dc42f
3
+ size 1538858
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202209'
2
+ files:
3
+ model_file: exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/valid.loss.ave.pth
4
+ python: "3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]"
5
+ timestamp: 1673330431.175186
6
+ torch: 1.13.0
7
+ yaml_files:
8
+ train_config: exp/hubert_iter1_train_ssl_torchaudiohubert_base_960h_pretrain_it1_raw/config.yaml