fuann committed on
Commit
fe2908b
1 Parent(s): dfe902a

Upload folder using huggingface_hub

Browse files
best/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
best/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db747c73a5a0bcfa3a14bcee18d9fe4f6906d017f56d57c77cd9a17aded446d1
3
+ size 1266295149
best/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8c57214620f9ca91e802bd74ba3796439d92b8900367e6c767a8d141c124fd
3
+ size 3695
config.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bc4625ed5a3bd5eb46b3a62a3a586bd3d984606cf3fdfa1192e0aee8b316e15
3
+ size 4591
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "err"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_phonemize": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
+ "phone_delimiter_token": " ",
9
+ "phonemizer_backend": "espeak",
10
+ "phonemizer_lang": "en-us",
11
+ "processor_class": "Wav2Vec2Processor",
12
+ "tokenizer_class": "Wav2Vec2PhonemeCTCTokenizer",
13
+ "unk_token": "err",
14
+ "word_delimiter_token": null
15
+ }
train.log ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [INFO] Set manual seed 66
2
+ [NOTE] Model args ...
3
+ {
4
+ "model_path": "facebook/wav2vec2-large-lv60",
5
+ "problem_type": "single_label_classification",
6
+ "task_type": "asr",
7
+ "model_type": "baseline",
8
+ "final_dropout": 0.0,
9
+ "ctc_zero_infinity": true,
10
+ "layerdrop": 0.1,
11
+ "activation_dropout": 0.1,
12
+ "mask_time_prob": 0.065,
13
+ "mask_time_length": 10,
14
+ "mask_feature_prob": 0.015,
15
+ "mask_feature_length": 64
16
+ }
17
+ [INFO] Save tokenizer/extractor/processor to exp/timit/train_timit_baseline_wav2vec2_large_lv60_66_noworddel ...
18
+ [INFO] data-json/timit/train_dataset/dataset.arrow exists, using it
19
+ [INFO] data-json/timit/dev_dataset/dataset.arrow exists, using it
20
+ [INFO] Train a baseline model from facebook/wav2vec2-large-lv60 ...
21
+ {'loss': 452.0092, 'learning_rate': 7.166666666666667e-06, 'epoch': 0.86}
22
+ {'loss': 396.7542, 'learning_rate': 1.5333333333333334e-05, 'epoch': 1.72}
23
+ {'eval_loss': 356.89007568359375, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.6259, 'eval_samples_per_second': 34.406, 'eval_steps_per_second': 1.118, 'epoch': 1.72}
24
+ {'loss': 315.949, 'learning_rate': 2.3666666666666668e-05, 'epoch': 2.59}
25
+ {'loss': 168.6427, 'learning_rate': 3.2000000000000005e-05, 'epoch': 3.45}
26
+ {'eval_loss': 133.75314331054688, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.1446, 'eval_samples_per_second': 35.892, 'eval_steps_per_second': 1.166, 'epoch': 3.45}
27
+ {'loss': 129.0611, 'learning_rate': 4.0333333333333336e-05, 'epoch': 4.31}
28
+ {'loss': 126.4254, 'learning_rate': 4.866666666666667e-05, 'epoch': 5.17}
29
+ {'eval_loss': 123.59996032714844, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.1434, 'eval_samples_per_second': 35.896, 'eval_steps_per_second': 1.167, 'epoch': 5.17}
30
+ {'loss': 123.9036, 'learning_rate': 5.6999999999999996e-05, 'epoch': 6.03}
31
+ {'loss': 124.3407, 'learning_rate': 6.533333333333334e-05, 'epoch': 6.9}
32
+ {'eval_loss': 123.45381164550781, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.2845, 'eval_samples_per_second': 35.447, 'eval_steps_per_second': 1.152, 'epoch': 6.9}
33
+ {'loss': 122.7627, 'learning_rate': 7.366666666666668e-05, 'epoch': 7.76}
34
+ {'loss': 124.9659, 'learning_rate': 8.2e-05, 'epoch': 8.62}
35
+ {'eval_loss': 120.54447937011719, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.645, 'eval_samples_per_second': 34.35, 'eval_steps_per_second': 1.116, 'epoch': 8.62}
36
+ {'loss': 117.4764, 'learning_rate': 9.033333333333334e-05, 'epoch': 9.48}
37
+ {'loss': 122.2053, 'learning_rate': 9.866666666666668e-05, 'epoch': 10.34}
38
+ {'eval_loss': 111.49588012695312, 'eval_wer': 0.8151690243740453, 'eval_cer': 0.7344880456135308, 'eval_runtime': 11.1121, 'eval_samples_per_second': 35.997, 'eval_steps_per_second': 1.17, 'epoch': 10.34}
39
+ {'loss': 107.2907, 'learning_rate': 9.7e-05, 'epoch': 11.21}
40
+ {'loss': 91.5518, 'learning_rate': 9.342857142857143e-05, 'epoch': 12.07}
41
+ {'eval_loss': 71.25965118408203, 'eval_wer': 0.6252241482367006, 'eval_cer': 0.5373724306453931, 'eval_runtime': 11.7788, 'eval_samples_per_second': 33.959, 'eval_steps_per_second': 1.104, 'epoch': 12.07}
42
+ {'loss': 67.4171, 'learning_rate': 8.985714285714287e-05, 'epoch': 12.93}
43
+ {'loss': 50.1061, 'learning_rate': 8.62857142857143e-05, 'epoch': 13.79}
44
+ {'eval_loss': 27.590818405151367, 'eval_wer': 0.24048615262004383, 'eval_cer': 0.22212639547697763, 'eval_runtime': 11.6657, 'eval_samples_per_second': 34.289, 'eval_steps_per_second': 1.114, 'epoch': 13.79}
45
+ {'loss': 35.4837, 'learning_rate': 8.271428571428572e-05, 'epoch': 14.66}
46
+ {'loss': 26.2278, 'learning_rate': 7.914285714285715e-05, 'epoch': 15.52}
47
+ {'eval_loss': 14.451704025268555, 'eval_wer': 0.10035199574948529, 'eval_cer': 0.07608643572421063, 'eval_runtime': 11.7263, 'eval_samples_per_second': 34.111, 'eval_steps_per_second': 1.109, 'epoch': 15.52}
48
+ {'loss': 21.4687, 'learning_rate': 7.557142857142857e-05, 'epoch': 16.38}
49
+ {'loss': 19.5909, 'learning_rate': 7.2e-05, 'epoch': 17.24}
50
+ {'eval_loss': 11.897302627563477, 'eval_wer': 0.08839742312545659, 'eval_cer': 0.06741411527957453, 'eval_runtime': 11.3848, 'eval_samples_per_second': 35.135, 'eval_steps_per_second': 1.142, 'epoch': 17.24}
51
+ {'loss': 17.8596, 'learning_rate': 6.842857142857143e-05, 'epoch': 18.1}
52
+ {'loss': 16.5052, 'learning_rate': 6.485714285714286e-05, 'epoch': 18.97}
53
+ {'eval_loss': 9.827818870544434, 'eval_wer': 0.0839476655376237, 'eval_cer': 0.06537779694312683, 'eval_runtime': 11.4438, 'eval_samples_per_second': 34.953, 'eval_steps_per_second': 1.136, 'epoch': 18.97}
54
+ {'loss': 15.9378, 'learning_rate': 6.12857142857143e-05, 'epoch': 19.83}
55
+ {'loss': 13.871, 'learning_rate': 5.771428571428572e-05, 'epoch': 20.69}
56
+ {'eval_loss': 9.462098121643066, 'eval_wer': 0.0830178654446437, 'eval_cer': 0.0646830530401035, 'eval_runtime': 11.1352, 'eval_samples_per_second': 35.922, 'eval_steps_per_second': 1.167, 'epoch': 20.69}
57
+ {'loss': 15.1374, 'learning_rate': 5.414285714285715e-05, 'epoch': 21.55}
58
+ {'loss': 13.3166, 'learning_rate': 5.057142857142857e-05, 'epoch': 22.41}
59
+ {'eval_loss': 9.01073932647705, 'eval_wer': 0.08049412233512653, 'eval_cer': 0.06199990417325475, 'eval_runtime': 11.3574, 'eval_samples_per_second': 35.219, 'eval_steps_per_second': 1.145, 'epoch': 22.41}
60
+ {'loss': 13.1342, 'learning_rate': 4.7e-05, 'epoch': 23.28}
61
+ {'loss': 12.651, 'learning_rate': 4.342857142857143e-05, 'epoch': 24.14}
62
+ {'eval_loss': 8.882560729980469, 'eval_wer': 0.07856810785681079, 'eval_cer': 0.06020315269991855, 'eval_runtime': 11.1062, 'eval_samples_per_second': 36.016, 'eval_steps_per_second': 1.171, 'epoch': 24.14}
63
+ {'loss': 12.0675, 'learning_rate': 3.985714285714286e-05, 'epoch': 25.0}
64
+ {'loss': 12.8447, 'learning_rate': 3.628571428571429e-05, 'epoch': 25.86}
65
+ {'eval_loss': 8.821671485900879, 'eval_wer': 0.0781696221026765, 'eval_cer': 0.059101145129605674, 'eval_runtime': 11.5252, 'eval_samples_per_second': 34.707, 'eval_steps_per_second': 1.128, 'epoch': 25.86}
66
+ {'loss': 12.0886, 'learning_rate': 3.271428571428571e-05, 'epoch': 26.72}
67
+ {'loss': 13.3448, 'learning_rate': 2.9142857142857146e-05, 'epoch': 27.59}
68
+ {'eval_loss': 8.660683631896973, 'eval_wer': 0.0773062363020522, 'eval_cer': 0.058094964304537394, 'eval_runtime': 11.1216, 'eval_samples_per_second': 35.966, 'eval_steps_per_second': 1.169, 'epoch': 27.59}
69
+ {'loss': 11.6601, 'learning_rate': 2.5571428571428572e-05, 'epoch': 28.45}
70
+ {'loss': 11.2744, 'learning_rate': 2.2000000000000003e-05, 'epoch': 29.31}
71
+ {'eval_loss': 8.715682029724121, 'eval_wer': 0.07670850767085077, 'eval_cer': 0.057975180872981646, 'eval_runtime': 11.2314, 'eval_samples_per_second': 35.615, 'eval_steps_per_second': 1.157, 'epoch': 29.31}
72
+ {'loss': 10.8255, 'learning_rate': 1.842857142857143e-05, 'epoch': 30.17}
73
+ {'loss': 12.5791, 'learning_rate': 1.4857142857142858e-05, 'epoch': 31.03}
74
+ {'eval_loss': 8.562932014465332, 'eval_wer': 0.07617719333200505, 'eval_cer': 0.05694504336160222, 'eval_runtime': 11.5283, 'eval_samples_per_second': 34.697, 'eval_steps_per_second': 1.128, 'epoch': 31.03}
75
+ {'loss': 11.7516, 'learning_rate': 1.1285714285714285e-05, 'epoch': 31.9}
76
+ {'loss': 11.1128, 'learning_rate': 7.714285714285714e-06, 'epoch': 32.76}
77
+ {'eval_loss': 8.644214630126953, 'eval_wer': 0.0763764362090722, 'eval_cer': 0.05732835034258062, 'eval_runtime': 11.2033, 'eval_samples_per_second': 35.704, 'eval_steps_per_second': 1.16, 'epoch': 32.76}
78
+ {'loss': 11.2963, 'learning_rate': 4.142857142857143e-06, 'epoch': 33.62}
79
+ {'loss': 11.052, 'learning_rate': 5.714285714285715e-07, 'epoch': 34.48}
80
+ {'eval_loss': 8.613777160644531, 'eval_wer': 0.07571229328551504, 'eval_cer': 0.05687317330266878, 'eval_runtime': 11.142, 'eval_samples_per_second': 35.9, 'eval_steps_per_second': 1.167, 'epoch': 34.48}
81
+ {'train_runtime': 5691.097, 'train_samples_per_second': 22.491, 'train_steps_per_second': 0.351, 'train_loss': 75.09857940673828, 'epoch': 34.48}
train_conf.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "per_device_train_batch_size": 64,
4
+ "gradient_accumulation_steps": 1,
5
+ "per_device_eval_batch_size": 32,
6
+ "evaluation_strategy": "steps",
7
+ "max_steps": 2000,
8
+ "save_steps": 100,
9
+ "eval_steps": 100,
10
+ "logging_steps": 50,
11
+ "learning_rate": 0.0001,
12
+ "weight_decay": 0,
13
+ "warmup_steps": 600,
14
+ "save_total_limit": 1,
15
+ "metric_for_best_model": "wer",
16
+ "greater_is_better": false
17
+ },
18
+ {
19
+ "model_path": "facebook/wav2vec2-large-lv60",
20
+ "problem_type": "single_label_classification",
21
+ "task_type": "asr",
22
+ "model_type": "baseline",
23
+ "final_dropout": 0.0,
24
+ "ctc_zero_infinity": true,
25
+ "layerdrop": 0.1,
26
+ "activation_dropout": 0.1,
27
+ "mask_time_prob": 0.065,
28
+ "mask_time_length": 10,
29
+ "mask_feature_prob": 0.015,
30
+ "mask_feature_length": 64
31
+ }
32
+ ]
vocab.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "aa": 1,
4
+ "ae": 2,
5
+ "ah": 3,
6
+ "ao": 4,
7
+ "aw": 5,
8
+ "ay": 6,
9
+ "b": 7,
10
+ "ch": 8,
11
+ "d": 9,
12
+ "dh": 10,
13
+ "eh": 11,
14
+ "er": 12,
15
+ "err": 41,
16
+ "ey": 13,
17
+ "f": 14,
18
+ "g": 15,
19
+ "hh": 16,
20
+ "ih": 17,
21
+ "iy": 18,
22
+ "jh": 19,
23
+ "k": 20,
24
+ "l": 21,
25
+ "m": 22,
26
+ "n": 23,
27
+ "ng": 24,
28
+ "ow": 25,
29
+ "oy": 26,
30
+ "p": 27,
31
+ "r": 28,
32
+ "s": 29,
33
+ "sh": 30,
34
+ "sil": 31,
35
+ "t": 32,
36
+ "th": 33,
37
+ "uh": 34,
38
+ "uw": 35,
39
+ "v": 36,
40
+ "w": 37,
41
+ "y": 38,
42
+ "z": 39,
43
+ "zh": 40
44
+ }