tz579 committed
Commit 6e42c7f
1 Parent(s): fde5f53

Training in progress, step 12776

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitignore +12 -0
  2. added_tokens.json +4 -0
  3. config.json +109 -0
  4. demo.4gram.py +22 -0
  5. demo.nolm.py +22 -0
  6. hub/version.txt +1 -0
  7. modules/__init__.py +0 -0
  8. preprocessor_config.json +10 -0
  9. run.ami.log +0 -0
  10. run.ami.sh +39 -0
  11. run.timit.log +0 -0
  12. run.timit.log. +1049 -0
  13. run.timit.sh +30 -0
  14. run_speech_recognition_ctc.py +840 -0
  15. run_speech_recognition_ctc.py. +835 -0
  16. runs/May24_15-21-50_tz579-raptorlake/events.out.tfevents.1716583096.tz579-raptorlake.20455.0 +3 -0
  17. runs/May24_15-39-25_tz579-raptorlake/events.out.tfevents.1716583898.tz579-raptorlake.21170.0 +3 -0
  18. runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 +3 -0
  19. runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 +3 -0
  20. runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 +3 -0
  21. runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 +3 -0
  22. runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 +3 -0
  23. runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 +3 -0
  24. runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 +3 -0
  25. runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 +3 -0
  26. runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 +3 -0
  27. runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 +3 -0
  28. runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 +3 -0
  29. special_tokens_map.json +30 -0
  30. tokenizer_config.json +48 -0
  31. training_args.bin +3 -0
  32. vocab.json +32 -0
  33. wav2vec2-base-timit-fine-tuned./README.md +101 -0
  34. wav2vec2-base-timit-fine-tuned./added_tokens.json +4 -0
  35. wav2vec2-base-timit-fine-tuned./all_results.json +15 -0
  36. wav2vec2-base-timit-fine-tuned./config.json +119 -0
  37. wav2vec2-base-timit-fine-tuned./eval_results.json +9 -0
  38. wav2vec2-base-timit-fine-tuned./preprocessor_config.json +10 -0
  39. wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 +3 -0
  40. wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 +3 -0
  41. wav2vec2-base-timit-fine-tuned./special_tokens_map.json +30 -0
  42. wav2vec2-base-timit-fine-tuned./tokenizer_config.json +48 -0
  43. wav2vec2-base-timit-fine-tuned./train_results.json +9 -0
  44. wav2vec2-base-timit-fine-tuned./trainer_state.json +1873 -0
  45. wav2vec2-base-timit-fine-tuned./training_args.bin +3 -0
  46. wav2vec2-base-timit-fine-tuned./vocab.json +31 -0
  47. wav2vec2-base-timit-fine-tuned/README.md +101 -0
  48. wav2vec2-base-timit-fine-tuned/added_tokens.json +4 -0
  49. wav2vec2-base-timit-fine-tuned/all_results.json +15 -0
  50. wav2vec2-base-timit-fine-tuned/config.json +119 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
+ */*__pycache__*
+ */checkpoint*/
+ */data*/
+ */mdls*/
+ */model*
+ *__pycache__*
+ checkpoint*/
+ data*/
+ mdls*/
+ input*/
+ output*/
+ model*
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "</s>": 31,
+ "<s>": 30
+ }
config.json ADDED
@@ -0,0 +1,109 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-large-lv60",
+ "activation_dropout": 0.0,
+ "adapter_attn_dim": null,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 768,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.0,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 24,
+ "num_negatives": 100,
+ "output_hidden_size": 1024,
+ "pad_token_id": 29,
+ "proj_codevector_dim": 768,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.42.0.dev0",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 32,
+ "xvector_output_dim": 512
+ }
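For orientation, the conv_kernel and conv_stride values above fix the frame rate of the encoder: each convolution shrinks the sequence as out = floor((in - kernel) / stride) + 1, so one second of 16 kHz audio ends up as roughly 49 frames (the strides multiply to 320 samples, about 20 ms per frame). A minimal sketch of that arithmetic (illustrative only, not part of the committed files):

def num_frames(num_samples, kernels=(10, 3, 3, 3, 3, 2, 2), strides=(5, 2, 2, 2, 2, 2, 2)):
    # apply the no-padding strided-conv length formula once per feature-extractor layer
    n = num_samples
    for k, s in zip(kernels, strides):
        n = (n - k) // s + 1
    return n

print(num_frames(16000))  # -> 49 frames for 1 s of 16 kHz audio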
demo.4gram.py ADDED
@@ -0,0 +1,22 @@
+ # import
+ import librosa
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+
+ # load the processor
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # load the audio data (use your own wav file here!)
+ input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)
+
+ # tokenize
+ input_values = processor(input_audio, return_tensors="pt", padding="longest").input_values
+
+ # retrieve logits
+ logits = model(input_values).logits
+
+ # decode using n-gram
+ transcription = processor.batch_decode(logits.detach().numpy()).text
+
+ # print the output
+ print(transcription)
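The same flow can be wrapped into a small helper that resamples the input and runs the forward pass under torch.no_grad(); the function below is a hypothetical sketch (the helper name and structure are not part of the committed demo, and the model/processor ids are simply the ones used above):

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

def transcribe_with_lm(path,
                       processor_id="patrickvonplaten/wav2vec2-base-100h-with-lm",
                       model_id="facebook/wav2vec2-large-960h"):
    # load processor (feature extractor + tokenizer + n-gram decoder) and acoustic model
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(processor_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id).eval()
    audio, _ = librosa.load(path, sr=16000)          # resample to the 16 kHz the model expects
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():                            # inference only, no autograd graph
        logits = model(inputs.input_values).logits
    return processor.batch_decode(logits.numpy()).text[0]

print(transcribe_with_lm("my_wav_file.wav"))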
demo.nolm.py ADDED
@@ -0,0 +1,22 @@
+ # import
+ import librosa, torch
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+
+ # load the tokenizer and model
+ tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # load the audio data (use your own wav file here!)
+ input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)
+
+ # tokenize
+ input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values
+
+ # retrieve logits
+ logits = model(input_values).logits
+
+ # take argmax and decode
+ transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))
+
+ # print the output
+ print(transcription)
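Both demos print raw transcriptions; the training logs further down track eval_wer, and the same word error rate can be spot-checked locally for a single utterance, for example with the jiwer package (an illustrative sketch, not part of the commit):

import jiwer  # pip install jiwer

reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown box jumps over the lazy dog"
print(jiwer.wer(reference, hypothesis))  # 1 substitution / 9 reference words ~= 0.111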
hub/version.txt ADDED
@@ -0,0 +1 @@
+ 1
modules/__init__.py ADDED
File without changes
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "Wav2Vec2Processor",
+ "return_attention_mask": true,
+ "sampling_rate": 16000
+ }
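This preprocessor_config.json is what Wav2Vec2FeatureExtractor reads at load time; a minimal usage sketch, assuming the repository files sit in the current directory (illustrative only):

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")   # parses preprocessor_config.json
waveform = np.zeros(16000, dtype=np.float32)                        # 1 s of silence at 16 kHz
batch = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(batch.input_values.shape)      # (1, 16000): feature_size is 1, values are just normalized samples
print("attention_mask" in batch)     # True, because return_attention_mask is true in this config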
run.ami.log ADDED
The diff for this file is too large to render. See raw diff
 
run.ami.sh ADDED
@@ -0,0 +1,39 @@
+ export HF_TOKEN=`cat /home/huggingface.token`
+ export HF_HOME="/home/Work/common_huggingface"
+
+ ## IMPORTANT: This script was stopped after 1.5 epochs (2400 steps)
+ ## because the training loss was exploding => the best checkpoint (2000 steps)
+ ## was then taken.
+ ## MAKE SURE TO DO HYPER-PARAMETER TUNING TO GET BETTER RESULTS
+ python run_speech_recognition_ctc.py \
+ --token="${HF_TOKEN}" \
+ --dataset_name="edinburghcstr/ami" \
+ --model_name_or_path="facebook/wav2vec2-large-lv60" \
+ --dataset_config_name="ihm" \
+ --train_split_name="train" \
+ --eval_split_name="validation" \
+ --output_dir="./" \
+ --preprocessing_num_workers="16" \
+ --overwrite_output_dir \
+ --num_train_epochs="2" \
+ --per_device_train_batch_size="16" \
+ --per_device_eval_batch_size="16" \
+ --gradient_accumulation_steps="1" \
+ --learning_rate="3e-4" \
+ --warmup_steps="500" \
+ --evaluation_strategy="steps" \
+ --text_column_name="text" \
+ --min_duration_in_seconds="0.25" \
+ --save_steps="400" \
+ --eval_steps="1000" \
+ --logging_steps="1" \
+ --layerdrop="0.0" \
+ --save_total_limit="3" \
+ --freeze_feature_encoder \
+ --gradient_checkpointing \
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \
+ --fp16 \
+ --group_by_length \
+ --push_to_hub \
+ --do_eval \
+ --do_train --do_eval
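With --learning_rate=3e-4 and --warmup_steps=500, the Trainer's default linear schedule ramps the learning rate up over the first 500 steps and then decays it linearly to zero, which is worth keeping in mind when tuning away the loss explosion mentioned in the comment above. A small sketch of that schedule (the total step count below is hypothetical; the real value depends on dataset size and batch size):

def linear_warmup_decay_lr(step, peak_lr=3e-4, warmup_steps=500, total_steps=16000):
    # linear warmup to peak_lr, then linear decay to zero (shape of the Trainer's default "linear" scheduler)
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * max(0.0, (total_steps - step) / (total_steps - warmup_steps))

for step in (100, 500, 2000, 2400):
    print(step, linear_warmup_decay_lr(step))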
run.timit.log ADDED
The diff for this file is too large to render. See raw diff
 
run.timit.log. ADDED
@@ -0,0 +1,1049 @@
1
+ /opt/conda/lib/python3.12/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
2
+ warnings.warn(
3
+ 05/19/2024 22:08:09 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True
4
+ 05/19/2024 22:08:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
5
+ _n_gpu=1,
6
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
7
+ adafactor=False,
8
+ adam_beta1=0.9,
9
+ adam_beta2=0.999,
10
+ adam_epsilon=1e-08,
11
+ auto_find_batch_size=False,
12
+ batch_eval_metrics=False,
13
+ bf16=False,
14
+ bf16_full_eval=False,
15
+ data_seed=None,
16
+ dataloader_drop_last=False,
17
+ dataloader_num_workers=0,
18
+ dataloader_persistent_workers=False,
19
+ dataloader_pin_memory=True,
20
+ dataloader_prefetch_factor=None,
21
+ ddp_backend=None,
22
+ ddp_broadcast_buffers=None,
23
+ ddp_bucket_cap_mb=None,
24
+ ddp_find_unused_parameters=None,
25
+ ddp_timeout=1800,
26
+ debug=[],
27
+ deepspeed=None,
28
+ disable_tqdm=False,
29
+ dispatch_batches=None,
30
+ do_eval=True,
31
+ do_predict=False,
32
+ do_train=True,
33
+ eval_accumulation_steps=None,
34
+ eval_delay=0,
35
+ eval_do_concat_batches=True,
36
+ eval_steps=100,
37
+ eval_strategy=IntervalStrategy.STEPS,
38
+ evaluation_strategy=steps,
39
+ fp16=True,
40
+ fp16_backend=auto,
41
+ fp16_full_eval=False,
42
+ fp16_opt_level=O1,
43
+ fsdp=[],
44
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
45
+ fsdp_min_num_params=0,
46
+ fsdp_transformer_layer_cls_to_wrap=None,
47
+ full_determinism=False,
48
+ gradient_accumulation_steps=1,
49
+ gradient_checkpointing=False,
50
+ gradient_checkpointing_kwargs=None,
51
+ greater_is_better=None,
52
+ group_by_length=True,
53
+ half_precision_backend=auto,
54
+ hub_always_push=False,
55
+ hub_model_id=None,
56
+ hub_private_repo=False,
57
+ hub_strategy=HubStrategy.EVERY_SAVE,
58
+ hub_token=<HUB_TOKEN>,
59
+ ignore_data_skip=False,
60
+ include_inputs_for_metrics=False,
61
+ include_num_input_tokens_seen=False,
62
+ include_tokens_per_second=False,
63
+ jit_mode_eval=False,
64
+ label_names=None,
65
+ label_smoothing_factor=0.0,
66
+ learning_rate=0.0001,
67
+ length_column_name=length,
68
+ load_best_model_at_end=False,
69
+ local_rank=0,
70
+ log_level=passive,
71
+ log_level_replica=warning,
72
+ log_on_each_node=True,
73
+ logging_dir=./wav2vec2-base-timit-fine-tuned/runs/May19_22-08-09_tz579-raptorlake,
74
+ logging_first_step=False,
75
+ logging_nan_inf_filter=True,
76
+ logging_steps=10,
77
+ logging_strategy=IntervalStrategy.STEPS,
78
+ lr_scheduler_kwargs={},
79
+ lr_scheduler_type=SchedulerType.LINEAR,
80
+ max_grad_norm=1.0,
81
+ max_steps=-1,
82
+ metric_for_best_model=None,
83
+ mp_parameters=,
84
+ neftune_noise_alpha=None,
85
+ no_cuda=False,
86
+ num_train_epochs=20.0,
87
+ optim=OptimizerNames.ADAMW_TORCH,
88
+ optim_args=None,
89
+ optim_target_modules=None,
90
+ output_dir=./wav2vec2-base-timit-fine-tuned,
91
+ overwrite_output_dir=True,
92
+ past_index=-1,
93
+ per_device_eval_batch_size=1,
94
+ per_device_train_batch_size=32,
95
+ prediction_loss_only=False,
96
+ push_to_hub=True,
97
+ push_to_hub_model_id=None,
98
+ push_to_hub_organization=None,
99
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
100
+ ray_scope=last,
101
+ remove_unused_columns=True,
102
+ report_to=['tensorboard'],
103
+ restore_callback_states_from_checkpoint=False,
104
+ resume_from_checkpoint=None,
105
+ run_name=./wav2vec2-base-timit-fine-tuned,
106
+ save_on_each_node=False,
107
+ save_only_model=False,
108
+ save_safetensors=True,
109
+ save_steps=400,
110
+ save_strategy=IntervalStrategy.STEPS,
111
+ save_total_limit=3,
112
+ seed=42,
113
+ skip_memory_metrics=True,
114
+ split_batches=None,
115
+ tf32=None,
116
+ torch_compile=False,
117
+ torch_compile_backend=None,
118
+ torch_compile_mode=None,
119
+ torchdynamo=None,
120
+ tpu_metrics_debug=False,
121
+ tpu_num_cores=None,
122
+ use_cpu=False,
123
+ use_ipex=False,
124
+ use_legacy_prediction_loop=False,
125
+ use_mps_device=False,
126
+ warmup_ratio=0.0,
127
+ warmup_steps=1000,
128
+ weight_decay=0.005,
129
+ )
130
+ /opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for timit_asr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/timit_asr
131
+ You can avoid this message in future by passing the argument `trust_remote_code=True`.
132
+ Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
133
+ warnings.warn(
134
+ /opt/conda/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
135
+ warnings.warn(
136
+ loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
137
+ /opt/conda/lib/python3.12/site-packages/transformers/configuration_utils.py:364: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.
138
+ warnings.warn(
139
+ Model config Wav2Vec2Config {
140
+ "_name_or_path": "facebook/wav2vec2-base",
141
+ "activation_dropout": 0.0,
142
+ "adapter_attn_dim": null,
143
+ "adapter_kernel_size": 3,
144
+ "adapter_stride": 2,
145
+ "add_adapter": false,
146
+ "apply_spec_augment": true,
147
+ "architectures": [
148
+ "Wav2Vec2ForPreTraining"
149
+ ],
150
+ "attention_dropout": 0.1,
151
+ "bos_token_id": 1,
152
+ "classifier_proj_size": 256,
153
+ "codevector_dim": 256,
154
+ "contrastive_logits_temperature": 0.1,
155
+ "conv_bias": false,
156
+ "conv_dim": [
157
+ 512,
158
+ 512,
159
+ 512,
160
+ 512,
161
+ 512,
162
+ 512,
163
+ 512
164
+ ],
165
+ "conv_kernel": [
166
+ 10,
167
+ 3,
168
+ 3,
169
+ 3,
170
+ 3,
171
+ 2,
172
+ 2
173
+ ],
174
+ "conv_stride": [
175
+ 5,
176
+ 2,
177
+ 2,
178
+ 2,
179
+ 2,
180
+ 2,
181
+ 2
182
+ ],
183
+ "ctc_loss_reduction": "sum",
184
+ "ctc_zero_infinity": false,
185
+ "diversity_loss_weight": 0.1,
186
+ "do_stable_layer_norm": false,
187
+ "eos_token_id": 2,
188
+ "feat_extract_activation": "gelu",
189
+ "feat_extract_norm": "group",
190
+ "feat_proj_dropout": 0.1,
191
+ "feat_quantizer_dropout": 0.0,
192
+ "final_dropout": 0.0,
193
+ "freeze_feat_extract_train": true,
194
+ "gradient_checkpointing": true,
195
+ "hidden_act": "gelu",
196
+ "hidden_dropout": 0.1,
197
+ "hidden_size": 768,
198
+ "initializer_range": 0.02,
199
+ "intermediate_size": 3072,
200
+ "layer_norm_eps": 1e-05,
201
+ "layerdrop": 0.0,
202
+ "mask_channel_length": 10,
203
+ "mask_channel_min_space": 1,
204
+ "mask_channel_other": 0.0,
205
+ "mask_channel_prob": 0.0,
206
+ "mask_channel_selection": "static",
207
+ "mask_feature_length": 10,
208
+ "mask_feature_min_masks": 0,
209
+ "mask_feature_prob": 0.0,
210
+ "mask_time_length": 10,
211
+ "mask_time_min_masks": 2,
212
+ "mask_time_min_space": 1,
213
+ "mask_time_other": 0.0,
214
+ "mask_time_prob": 0.05,
215
+ "mask_time_selection": "static",
216
+ "model_type": "wav2vec2",
217
+ "no_mask_channel_overlap": false,
218
+ "no_mask_time_overlap": false,
219
+ "num_adapter_layers": 3,
220
+ "num_attention_heads": 12,
221
+ "num_codevector_groups": 2,
222
+ "num_codevectors_per_group": 320,
223
+ "num_conv_pos_embedding_groups": 16,
224
+ "num_conv_pos_embeddings": 128,
225
+ "num_feat_extract_layers": 7,
226
+ "num_hidden_layers": 12,
227
+ "num_negatives": 100,
228
+ "output_hidden_size": 768,
229
+ "pad_token_id": 0,
230
+ "proj_codevector_dim": 256,
231
+ "tdnn_dilation": [
232
+ 1,
233
+ 2,
234
+ 3,
235
+ 1,
236
+ 1
237
+ ],
238
+ "tdnn_dim": [
239
+ 512,
240
+ 512,
241
+ 512,
242
+ 512,
243
+ 1500
244
+ ],
245
+ "tdnn_kernel": [
246
+ 5,
247
+ 3,
248
+ 3,
249
+ 1,
250
+ 1
251
+ ],
252
+ "transformers_version": "4.42.0.dev0",
253
+ "use_weighted_layer_sum": false,
254
+ "vocab_size": 32,
255
+ "xvector_output_dim": 512
256
+ }
257
+
258
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████| 3696/3696 [00:00<00:00, 258999.36 examples/s]
259
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:00<00:00, 582229.35 examples/s]
260
+ `use_fast` is set to `True` but the tokenizer class does not have a fast version. Falling back to the slow version.
261
+ loading file vocab.json
262
+ loading file tokenizer_config.json
263
+ loading file added_tokens.json
264
+ loading file special_tokens_map.json
265
+ loading file tokenizer.json
266
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
267
+ loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/preprocessor_config.json
268
+ loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
269
+ Model config Wav2Vec2Config {
270
+ "_name_or_path": "facebook/wav2vec2-base",
271
+ "activation_dropout": 0.0,
272
+ "adapter_attn_dim": null,
273
+ "adapter_kernel_size": 3,
274
+ "adapter_stride": 2,
275
+ "add_adapter": false,
276
+ "apply_spec_augment": true,
277
+ "architectures": [
278
+ "Wav2Vec2ForPreTraining"
279
+ ],
280
+ "attention_dropout": 0.1,
281
+ "bos_token_id": 1,
282
+ "classifier_proj_size": 256,
283
+ "codevector_dim": 256,
284
+ "contrastive_logits_temperature": 0.1,
285
+ "conv_bias": false,
286
+ "conv_dim": [
287
+ 512,
288
+ 512,
289
+ 512,
290
+ 512,
291
+ 512,
292
+ 512,
293
+ 512
294
+ ],
295
+ "conv_kernel": [
296
+ 10,
297
+ 3,
298
+ 3,
299
+ 3,
300
+ 3,
301
+ 2,
302
+ 2
303
+ ],
304
+ "conv_stride": [
305
+ 5,
306
+ 2,
307
+ 2,
308
+ 2,
309
+ 2,
310
+ 2,
311
+ 2
312
+ ],
313
+ "ctc_loss_reduction": "sum",
314
+ "ctc_zero_infinity": false,
315
+ "diversity_loss_weight": 0.1,
316
+ "do_stable_layer_norm": false,
317
+ "eos_token_id": 2,
318
+ "feat_extract_activation": "gelu",
319
+ "feat_extract_norm": "group",
320
+ "feat_proj_dropout": 0.1,
321
+ "feat_quantizer_dropout": 0.0,
322
+ "final_dropout": 0.0,
323
+ "freeze_feat_extract_train": true,
324
+ "gradient_checkpointing": true,
325
+ "hidden_act": "gelu",
326
+ "hidden_dropout": 0.1,
327
+ "hidden_size": 768,
328
+ "initializer_range": 0.02,
329
+ "intermediate_size": 3072,
330
+ "layer_norm_eps": 1e-05,
331
+ "layerdrop": 0.0,
332
+ "mask_channel_length": 10,
333
+ "mask_channel_min_space": 1,
334
+ "mask_channel_other": 0.0,
335
+ "mask_channel_prob": 0.0,
336
+ "mask_channel_selection": "static",
337
+ "mask_feature_length": 10,
338
+ "mask_feature_min_masks": 0,
339
+ "mask_feature_prob": 0.0,
340
+ "mask_time_length": 10,
341
+ "mask_time_min_masks": 2,
342
+ "mask_time_min_space": 1,
343
+ "mask_time_other": 0.0,
344
+ "mask_time_prob": 0.05,
345
+ "mask_time_selection": "static",
346
+ "model_type": "wav2vec2",
347
+ "no_mask_channel_overlap": false,
348
+ "no_mask_time_overlap": false,
349
+ "num_adapter_layers": 3,
350
+ "num_attention_heads": 12,
351
+ "num_codevector_groups": 2,
352
+ "num_codevectors_per_group": 320,
353
+ "num_conv_pos_embedding_groups": 16,
354
+ "num_conv_pos_embeddings": 128,
355
+ "num_feat_extract_layers": 7,
356
+ "num_hidden_layers": 12,
357
+ "num_negatives": 100,
358
+ "output_hidden_size": 768,
359
+ "pad_token_id": 0,
360
+ "proj_codevector_dim": 256,
361
+ "tdnn_dilation": [
362
+ 1,
363
+ 2,
364
+ 3,
365
+ 1,
366
+ 1
367
+ ],
368
+ "tdnn_dim": [
369
+ 512,
370
+ 512,
371
+ 512,
372
+ 512,
373
+ 1500
374
+ ],
375
+ "tdnn_kernel": [
376
+ 5,
377
+ 3,
378
+ 3,
379
+ 1,
380
+ 1
381
+ ],
382
+ "transformers_version": "4.42.0.dev0",
383
+ "use_weighted_layer_sum": false,
384
+ "vocab_size": 32,
385
+ "xvector_output_dim": 512
386
+ }
387
+
388
+ Feature extractor Wav2Vec2FeatureExtractor {
389
+ "do_normalize": true,
390
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
391
+ "feature_size": 1,
392
+ "padding_side": "right",
393
+ "padding_value": 0.0,
394
+ "return_attention_mask": false,
395
+ "sampling_rate": 16000
396
+ }
397
+
398
+ loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/pytorch_model.bin
399
+ Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
400
+ - This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
401
+ - This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
402
+ Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
403
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
404
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
405
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
406
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
407
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
408
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
409
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
410
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
411
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/config.json
412
+ Model config Wav2Vec2Config {
413
+ "_name_or_path": "./wav2vec2-base-timit-fine-tuned",
414
+ "activation_dropout": 0.0,
415
+ "adapter_attn_dim": null,
416
+ "adapter_kernel_size": 3,
417
+ "adapter_stride": 2,
418
+ "add_adapter": false,
419
+ "apply_spec_augment": true,
420
+ "architectures": [
421
+ "Wav2Vec2ForPreTraining"
422
+ ],
423
+ "attention_dropout": 0.0,
424
+ "bos_token_id": 1,
425
+ "classifier_proj_size": 256,
426
+ "codevector_dim": 256,
427
+ "contrastive_logits_temperature": 0.1,
428
+ "conv_bias": false,
429
+ "conv_dim": [
430
+ 512,
431
+ 512,
432
+ 512,
433
+ 512,
434
+ 512,
435
+ 512,
436
+ 512
437
+ ],
438
+ "conv_kernel": [
439
+ 10,
440
+ 3,
441
+ 3,
442
+ 3,
443
+ 3,
444
+ 2,
445
+ 2
446
+ ],
447
+ "conv_stride": [
448
+ 5,
449
+ 2,
450
+ 2,
451
+ 2,
452
+ 2,
453
+ 2,
454
+ 2
455
+ ],
456
+ "ctc_loss_reduction": "mean",
457
+ "ctc_zero_infinity": false,
458
+ "diversity_loss_weight": 0.1,
459
+ "do_stable_layer_norm": false,
460
+ "eos_token_id": 2,
461
+ "feat_extract_activation": "gelu",
462
+ "feat_extract_norm": "group",
463
+ "feat_proj_dropout": 0.0,
464
+ "feat_quantizer_dropout": 0.0,
465
+ "final_dropout": 0.0,
466
+ "freeze_feat_extract_train": true,
467
+ "gradient_checkpointing": false,
468
+ "hidden_act": "gelu",
469
+ "hidden_dropout": 0.0,
470
+ "hidden_size": 768,
471
+ "initializer_range": 0.02,
472
+ "intermediate_size": 3072,
473
+ "layer_norm_eps": 1e-05,
474
+ "layerdrop": 0.0,
475
+ "mask_channel_length": 10,
476
+ "mask_channel_min_space": 1,
477
+ "mask_channel_other": 0.0,
478
+ "mask_channel_prob": 0.0,
479
+ "mask_channel_selection": "static",
480
+ "mask_feature_length": 10,
481
+ "mask_feature_min_masks": 0,
482
+ "mask_feature_prob": 0.0,
483
+ "mask_time_length": 10,
484
+ "mask_time_min_masks": 2,
485
+ "mask_time_min_space": 1,
486
+ "mask_time_other": 0.0,
487
+ "mask_time_prob": 0.05,
488
+ "mask_time_selection": "static",
489
+ "model_type": "wav2vec2",
490
+ "no_mask_channel_overlap": false,
491
+ "no_mask_time_overlap": false,
492
+ "num_adapter_layers": 3,
493
+ "num_attention_heads": 12,
494
+ "num_codevector_groups": 2,
495
+ "num_codevectors_per_group": 320,
496
+ "num_conv_pos_embedding_groups": 16,
497
+ "num_conv_pos_embeddings": 128,
498
+ "num_feat_extract_layers": 7,
499
+ "num_hidden_layers": 12,
500
+ "num_negatives": 100,
501
+ "output_hidden_size": 768,
502
+ "pad_token_id": 28,
503
+ "proj_codevector_dim": 256,
504
+ "tdnn_dilation": [
505
+ 1,
506
+ 2,
507
+ 3,
508
+ 1,
509
+ 1
510
+ ],
511
+ "tdnn_dim": [
512
+ 512,
513
+ 512,
514
+ 512,
515
+ 512,
516
+ 1500
517
+ ],
518
+ "tdnn_kernel": [
519
+ 5,
520
+ 3,
521
+ 3,
522
+ 1,
523
+ 1
524
+ ],
525
+ "transformers_version": "4.42.0.dev0",
526
+ "use_weighted_layer_sum": false,
527
+ "vocab_size": 31,
528
+ "xvector_output_dim": 512
529
+ }
530
+
531
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
532
+ Feature extractor Wav2Vec2FeatureExtractor {
533
+ "do_normalize": true,
534
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
535
+ "feature_size": 1,
536
+ "padding_side": "right",
537
+ "padding_value": 0.0,
538
+ "return_attention_mask": false,
539
+ "sampling_rate": 16000
540
+ }
541
+
542
+ loading file vocab.json
543
+ loading file tokenizer_config.json
544
+ loading file added_tokens.json
545
+ loading file special_tokens_map.json
546
+ loading file tokenizer.json
547
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
548
+ Processor Wav2Vec2Processor:
549
+ - feature_extractor: Wav2Vec2FeatureExtractor {
550
+ "do_normalize": true,
551
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
552
+ "feature_size": 1,
553
+ "padding_side": "right",
554
+ "padding_value": 0.0,
555
+ "return_attention_mask": false,
556
+ "sampling_rate": 16000
557
+ }
558
+
559
+ - tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./wav2vec2-base-timit-fine-tuned', vocab_size=29, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
560
+ 27: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
561
+ 28: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
562
+ 29: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
563
+ 30: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
564
+ }
565
+
566
+ {
567
+ "processor_class": "Wav2Vec2Processor"
568
+ }
569
+
570
+ Using auto half precision backend
571
+ The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
572
+ ***** Running training *****
573
+ Num examples = 3,696
574
+ Num Epochs = 20
575
+ Instantaneous batch size per device = 32
576
+ Total train batch size (w. parallel, distributed & accumulation) = 32
577
+ Gradient Accumulation steps = 1
578
+ Total optimization steps = 2,320
579
+ Number of trainable parameters = 90,195,103
580
+ 0%|▎ | 7/2320 [00:10<48:36, 1.26s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
581
+ return F.conv1d(input, weight, bias, self.stride,
582
+ {'loss': 9.1142, 'grad_norm': 9.595185279846191, 'learning_rate': 9e-07, 'epoch': 0.09}
583
+ {'loss': 8.3446, 'grad_norm': 9.732986450195312, 'learning_rate': 1.9e-06, 'epoch': 0.17}
584
+ {'loss': 8.6592, 'grad_norm': 14.272214889526367, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.26}
585
+ {'loss': 7.6985, 'grad_norm': 15.0160493850708, 'learning_rate': 3.8e-06, 'epoch': 0.34}
586
+ {'loss': 6.9688, 'grad_norm': 16.610979080200195, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.43}
587
+ {'loss': 6.232, 'grad_norm': 17.26924705505371, 'learning_rate': 5.8e-06, 'epoch': 0.52}
588
+ {'loss': 4.7271, 'grad_norm': 11.347734451293945, 'learning_rate': 6.800000000000001e-06, 'epoch': 0.6}
589
+ {'loss': 3.7919, 'grad_norm': 4.237112045288086, 'learning_rate': 7.8e-06, 'epoch': 0.69}
590
+ {'loss': 3.3967, 'grad_norm': 1.8833028078079224, 'learning_rate': 8.8e-06, 'epoch': 0.78}
591
+ {'loss': 3.1618, 'grad_norm': 1.3788093328475952, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.86}
592
+ 4%|████▏ | 100/2320 [01:39<33:07, 1.12it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
593
+ ***** Running Evaluation *****
594
+ Num examples = 1344
595
+ Batch size = 1
596
+ {'eval_loss': 3.1117007732391357, 'eval_wer': 1.0, 'eval_runtime': 40.0512, 'eval_samples_per_second': 33.557, 'eval_steps_per_second': 33.557, 'epoch': 0.86}
597
+ {'loss': 3.0865, 'grad_norm': 1.729278802871704, 'learning_rate': 1.08e-05, 'epoch': 0.95}
598
+ {'loss': 3.0809, 'grad_norm': 1.905969500541687, 'learning_rate': 1.18e-05, 'epoch': 1.03}
599
+ {'loss': 3.0346, 'grad_norm': 0.8360918760299683, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.12}
600
+ {'loss': 3.0106, 'grad_norm': 0.7653716206550598, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.21}
601
+ {'loss': 3.0165, 'grad_norm': 0.94779372215271, 'learning_rate': 1.48e-05, 'epoch': 1.29}
602
+ {'loss': 3.0, 'grad_norm': 0.8457741737365723, 'learning_rate': 1.58e-05, 'epoch': 1.38}
603
+ {'loss': 2.9903, 'grad_norm': 1.4369837045669556, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.47}
604
+ {'loss': 2.9852, 'grad_norm': 1.8290436267852783, 'learning_rate': 1.78e-05, 'epoch': 1.55}
605
+ {'loss': 2.99, 'grad_norm': 1.1530190706253052, 'learning_rate': 1.88e-05, 'epoch': 1.64}
606
+ {'loss': 2.9798, 'grad_norm': 1.1261711120605469, 'learning_rate': 1.9800000000000004e-05, 'epoch': 1.72}
607
+ 9%|████████▎ | 200/2320 [03:52<24:28, 1.44it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
608
+ ***** Running Evaluation *****
609
+ Num examples = 1344
610
+ Batch size = 1
611
+ {'eval_loss': 2.9736363887786865, 'eval_wer': 1.0, 'eval_runtime': 39.6236, 'eval_samples_per_second': 33.919, 'eval_steps_per_second': 33.919, 'epoch': 1.72}
612
+ {'loss': 2.9718, 'grad_norm': 0.903380811214447, 'learning_rate': 2.08e-05, 'epoch': 1.81}
613
+ {'loss': 2.9766, 'grad_norm': 0.4889620244503021, 'learning_rate': 2.18e-05, 'epoch': 1.9}
614
+ {'loss': 2.9658, 'grad_norm': 1.3861790895462036, 'learning_rate': 2.2800000000000002e-05, 'epoch': 1.98}
615
+ {'loss': 2.9588, 'grad_norm': 0.7976490259170532, 'learning_rate': 2.38e-05, 'epoch': 2.07}
616
+ {'loss': 2.9523, 'grad_norm': 0.698798418045044, 'learning_rate': 2.48e-05, 'epoch': 2.16}
617
+ {'loss': 2.9496, 'grad_norm': 1.0858148336410522, 'learning_rate': 2.58e-05, 'epoch': 2.24}
618
+ {'loss': 2.9421, 'grad_norm': 0.5658290386199951, 'learning_rate': 2.6800000000000004e-05, 'epoch': 2.33}
619
+ {'loss': 2.9427, 'grad_norm': 0.5713534355163574, 'learning_rate': 2.7800000000000005e-05, 'epoch': 2.41}
620
+ {'loss': 2.9228, 'grad_norm': 0.7386118769645691, 'learning_rate': 2.88e-05, 'epoch': 2.5}
621
+ {'loss': 2.9144, 'grad_norm': 0.767816960811615, 'learning_rate': 2.98e-05, 'epoch': 2.59}
622
+ 13%|████████████▍ | 300/2320 [06:10<33:46, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
623
+ ***** Running Evaluation *****
624
+ Num examples = 1344
625
+ Batch size = 1
626
+ {'eval_loss': 2.9074809551239014, 'eval_wer': 1.0, 'eval_runtime': 39.8997, 'eval_samples_per_second': 33.684, 'eval_steps_per_second': 33.684, 'epoch': 2.59}
627
+ {'loss': 2.8965, 'grad_norm': 0.8676608204841614, 'learning_rate': 3.08e-05, 'epoch': 2.67}
628
+ {'loss': 2.8815, 'grad_norm': 1.6954621076583862, 'learning_rate': 3.18e-05, 'epoch': 2.76}
629
+ {'loss': 2.855, 'grad_norm': 1.1631884574890137, 'learning_rate': 3.2800000000000004e-05, 'epoch': 2.84}
630
+ {'loss': 2.781, 'grad_norm': 1.625454306602478, 'learning_rate': 3.38e-05, 'epoch': 2.93}
631
+ {'loss': 2.7756, 'grad_norm': 2.0763564109802246, 'learning_rate': 3.48e-05, 'epoch': 3.02}
632
+ {'loss': 2.6458, 'grad_norm': 2.036031723022461, 'learning_rate': 3.58e-05, 'epoch': 3.1}
633
+ {'loss': 2.5189, 'grad_norm': 1.366801142692566, 'learning_rate': 3.68e-05, 'epoch': 3.19}
634
+ {'loss': 2.433, 'grad_norm': 2.034527540206909, 'learning_rate': 3.7800000000000004e-05, 'epoch': 3.28}
635
+ {'loss': 2.2885, 'grad_norm': 3.8338165283203125, 'learning_rate': 3.88e-05, 'epoch': 3.36}
636
+ {'loss': 2.1714, 'grad_norm': 2.3443217277526855, 'learning_rate': 3.9800000000000005e-05, 'epoch': 3.45}
637
+ 17%|████████████████▌ | 400/2320 [08:24<23:08, 1.38it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
638
+ ***** Running Evaluation *****
639
+ Num examples = 1344
640
+ Batch size = 1
641
+ {'eval_loss': 2.0944502353668213, 'eval_wer': 1.0325047801147227, 'eval_runtime': 39.7668, 'eval_samples_per_second': 33.797, 'eval_steps_per_second': 33.797, 'epoch': 3.45}
642
+ 17%|████████████████▌ | 400/2320 [09:04<23:08, 1.38it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-400
643
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/config.json
644
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/model.safetensors
645
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/preprocessor_config.json
646
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/tokenizer_config.json
647
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/special_tokens_map.json
648
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/added_tokens.json
649
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
650
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
651
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
652
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
653
+ 17%|████████████████▏ | 401/2320 [09:06<6:52:25, 12.90s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
654
+ return F.conv1d(input, weight, bias, self.stride,
655
+ {'loss': 2.0881, 'grad_norm': 4.349735260009766, 'learning_rate': 4.08e-05, 'epoch': 3.53}
656
+ {'loss': 1.9522, 'grad_norm': 2.450747489929199, 'learning_rate': 4.18e-05, 'epoch': 3.62}
657
+ {'loss': 1.8395, 'grad_norm': 2.2519729137420654, 'learning_rate': 4.2800000000000004e-05, 'epoch': 3.71}
658
+ {'loss': 1.7525, 'grad_norm': 2.693664789199829, 'learning_rate': 4.38e-05, 'epoch': 3.79}
659
+ {'loss': 1.6222, 'grad_norm': 1.9744929075241089, 'learning_rate': 4.4800000000000005e-05, 'epoch': 3.88}
660
+ {'loss': 1.5397, 'grad_norm': 3.802494764328003, 'learning_rate': 4.58e-05, 'epoch': 3.97}
661
+ {'loss': 1.4376, 'grad_norm': 2.301044225692749, 'learning_rate': 4.6800000000000006e-05, 'epoch': 4.05}
662
+ {'loss': 1.2829, 'grad_norm': 2.279372215270996, 'learning_rate': 4.78e-05, 'epoch': 4.14}
663
+ {'loss': 1.1976, 'grad_norm': 3.314736843109131, 'learning_rate': 4.88e-05, 'epoch': 4.22}
664
+ {'loss': 1.1579, 'grad_norm': 2.434694290161133, 'learning_rate': 4.9800000000000004e-05, 'epoch': 4.31}
665
+ 22%|████████████████████▋ | 500/2320 [10:43<34:53, 1.15s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
666
+ ***** Running Evaluation *****
667
+ Num examples = 1344
668
+ Batch size = 1
669
+ {'eval_loss': 1.045101284980774, 'eval_wer': 0.8299189656742239, 'eval_runtime': 39.7455, 'eval_samples_per_second': 33.815, 'eval_steps_per_second': 33.815, 'epoch': 4.31}
670
+ {'loss': 1.0684, 'grad_norm': 1.8384031057357788, 'learning_rate': 5.08e-05, 'epoch': 4.4}
671
+ {'loss': 1.0319, 'grad_norm': 3.599148988723755, 'learning_rate': 5.1800000000000005e-05, 'epoch': 4.48}
672
+ {'loss': 0.9179, 'grad_norm': 2.066476583480835, 'learning_rate': 5.28e-05, 'epoch': 4.57}
673
+ {'loss': 0.8838, 'grad_norm': 2.2173750400543213, 'learning_rate': 5.380000000000001e-05, 'epoch': 4.66}
674
+ {'loss': 0.8991, 'grad_norm': 2.427091121673584, 'learning_rate': 5.4800000000000004e-05, 'epoch': 4.74}
675
+ {'loss': 0.8, 'grad_norm': 2.7432241439819336, 'learning_rate': 5.580000000000001e-05, 'epoch': 4.83}
676
+ {'loss': 0.7803, 'grad_norm': 3.254221200942993, 'learning_rate': 5.68e-05, 'epoch': 4.91}
677
+ {'loss': 0.8205, 'grad_norm': 4.457448482513428, 'learning_rate': 5.7799999999999995e-05, 'epoch': 5.0}
678
+ {'loss': 0.6703, 'grad_norm': 3.1023166179656982, 'learning_rate': 5.88e-05, 'epoch': 5.09}
679
+ {'loss': 0.6087, 'grad_norm': 2.5916504859924316, 'learning_rate': 5.9800000000000003e-05, 'epoch': 5.17}
680
+ 26%|████████████████████████▊ | 600/2320 [12:58<23:53, 1.20it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
681
+ ***** Running Evaluation *****
682
+ Num examples = 1344
683
+ Batch size = 1
684
+ {'eval_loss': 0.6753795146942139, 'eval_wer': 0.6440863152144223, 'eval_runtime': 39.7485, 'eval_samples_per_second': 33.813, 'eval_steps_per_second': 33.813, 'epoch': 5.17}
685
+ {'loss': 0.6569, 'grad_norm': 2.1707613468170166, 'learning_rate': 6.08e-05, 'epoch': 5.26}
686
+ {'loss': 0.5627, 'grad_norm': 2.4291555881500244, 'learning_rate': 6.18e-05, 'epoch': 5.34}
687
+ {'loss': 0.5381, 'grad_norm': 2.249617338180542, 'learning_rate': 6.280000000000001e-05, 'epoch': 5.43}
688
+ {'loss': 0.6338, 'grad_norm': 1.6661946773529053, 'learning_rate': 6.38e-05, 'epoch': 5.52}
689
+ {'loss': 0.5181, 'grad_norm': 2.60294771194458, 'learning_rate': 6.48e-05, 'epoch': 5.6}
690
+ {'loss': 0.5189, 'grad_norm': 3.3003089427948, 'learning_rate': 6.58e-05, 'epoch': 5.69}
691
+ {'loss': 0.564, 'grad_norm': 1.880764126777649, 'learning_rate': 6.680000000000001e-05, 'epoch': 5.78}
692
+ {'loss': 0.4729, 'grad_norm': 2.0575127601623535, 'learning_rate': 6.780000000000001e-05, 'epoch': 5.86}
693
+ {'loss': 0.4899, 'grad_norm': 2.5159761905670166, 'learning_rate': 6.879999999999999e-05, 'epoch': 5.95}
694
+ {'loss': 0.481, 'grad_norm': 1.4463504552841187, 'learning_rate': 6.98e-05, 'epoch': 6.03}
695
+ 30%|████████████████████████████▉ | 700/2320 [15:14<36:18, 1.34s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
696
+ ***** Running Evaluation *****
697
+ Num examples = 1344
698
+ Batch size = 1
699
+ {'eval_loss': 0.5275412201881409, 'eval_wer': 0.5760721114449604, 'eval_runtime': 39.9601, 'eval_samples_per_second': 33.634, 'eval_steps_per_second': 33.634, 'epoch': 6.03}
700
+ {'loss': 0.3865, 'grad_norm': 1.788765549659729, 'learning_rate': 7.08e-05, 'epoch': 6.12}
701
+ {'loss': 0.3726, 'grad_norm': 1.862762212753296, 'learning_rate': 7.18e-05, 'epoch': 6.21}
702
+ {'loss': 0.4116, 'grad_norm': 1.6512093544006348, 'learning_rate': 7.280000000000001e-05, 'epoch': 6.29}
703
+ {'loss': 0.3779, 'grad_norm': 2.098067045211792, 'learning_rate': 7.38e-05, 'epoch': 6.38}
704
+ {'loss': 0.3728, 'grad_norm': 3.3030078411102295, 'learning_rate': 7.48e-05, 'epoch': 6.47}
705
+ {'loss': 0.4047, 'grad_norm': 2.1799120903015137, 'learning_rate': 7.58e-05, 'epoch': 6.55}
706
+ {'loss': 0.313, 'grad_norm': 1.862434983253479, 'learning_rate': 7.680000000000001e-05, 'epoch': 6.64}
707
+ {'loss': 0.4052, 'grad_norm': 6.29113245010376, 'learning_rate': 7.780000000000001e-05, 'epoch': 6.72}
708
+ {'loss': 0.3218, 'grad_norm': 1.4220325946807861, 'learning_rate': 7.88e-05, 'epoch': 6.81}
709
+ {'loss': 0.3072, 'grad_norm': 2.586819648742676, 'learning_rate': 7.98e-05, 'epoch': 6.9}
710
+ 34%|█████████████████████████████████ | 800/2320 [17:30<20:39, 1.23it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
711
+ ***** Running Evaluation *****
712
+ Num examples = 1344
713
+ Batch size = 1
714
+ {'eval_loss': 0.4836220443248749, 'eval_wer': 0.5264499681325685, 'eval_runtime': 39.8762, 'eval_samples_per_second': 33.704, 'eval_steps_per_second': 33.704, 'epoch': 6.9}
715
+ 34%|█████████████████████████████████ | 800/2320 [18:10<20:39, 1.23it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-800
716
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/config.json
717
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/model.safetensors
718
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/preprocessor_config.json
719
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/tokenizer_config.json
720
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/special_tokens_map.json
721
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/added_tokens.json
722
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
723
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
724
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
725
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
726
+ {'loss': 0.3862, 'grad_norm': 1.6589460372924805, 'learning_rate': 8.080000000000001e-05, 'epoch': 6.98}
727
+ {'loss': 0.2938, 'grad_norm': 1.7299175262451172, 'learning_rate': 8.18e-05, 'epoch': 7.07}
728
+ {'loss': 0.249, 'grad_norm': 2.0545098781585693, 'learning_rate': 8.28e-05, 'epoch': 7.16}
729
+ 36%|██████████████████████████████████▋ | 837/2320 [18:46<17:32, 1.41it/s]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
730
+ return F.conv1d(input, weight, bias, self.stride,
731
+ {'loss': 0.3202, 'grad_norm': 24.935670852661133, 'learning_rate': 8.38e-05, 'epoch': 7.24}
732
+ {'loss': 0.2803, 'grad_norm': 2.497840642929077, 'learning_rate': 8.48e-05, 'epoch': 7.33}
733
+ {'loss': 0.2473, 'grad_norm': 2.698636531829834, 'learning_rate': 8.58e-05, 'epoch': 7.41}
734
+ {'loss': 0.3223, 'grad_norm': 1.4561227560043335, 'learning_rate': 8.680000000000001e-05, 'epoch': 7.5}
735
+ {'loss': 0.2481, 'grad_norm': 1.7760556936264038, 'learning_rate': 8.78e-05, 'epoch': 7.59}
736
+ {'loss': 0.2545, 'grad_norm': 2.308103084564209, 'learning_rate': 8.88e-05, 'epoch': 7.67}
737
+ {'loss': 0.332, 'grad_norm': 1.4128385782241821, 'learning_rate': 8.98e-05, 'epoch': 7.76}
738
+ 39%|█████████████████████████████████████▏ | 900/2320 [19:48<29:47, 1.26s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
739
+ ***** Running Evaluation *****
740
+ Num examples = 1344
741
+ Batch size = 1
742
+ {'eval_loss': 0.44030094146728516, 'eval_wer': 0.5233542747883092, 'eval_runtime': 39.9401, 'eval_samples_per_second': 33.65, 'eval_steps_per_second': 33.65, 'epoch': 7.76}
743
+ {'loss': 0.2411, 'grad_norm': 1.7903906106948853, 'learning_rate': 9.080000000000001e-05, 'epoch': 7.84}
744
+ {'loss': 0.2707, 'grad_norm': 2.0804216861724854, 'learning_rate': 9.180000000000001e-05, 'epoch': 7.93}
745
+ {'loss': 0.3186, 'grad_norm': 1.4420605897903442, 'learning_rate': 9.28e-05, 'epoch': 8.02}
746
+ {'loss': 0.1937, 'grad_norm': 2.2910854816436768, 'learning_rate': 9.38e-05, 'epoch': 8.1}
747
+ {'loss': 0.2321, 'grad_norm': 3.5892796516418457, 'learning_rate': 9.48e-05, 'epoch': 8.19}
748
+ {'loss': 0.2868, 'grad_norm': 1.6509956121444702, 'learning_rate': 9.58e-05, 'epoch': 8.28}
749
+ {'loss': 0.2004, 'grad_norm': 1.6983604431152344, 'learning_rate': 9.680000000000001e-05, 'epoch': 8.36}
750
+ {'loss': 0.2025, 'grad_norm': 2.061176061630249, 'learning_rate': 9.78e-05, 'epoch': 8.45}
751
+ {'loss': 0.2598, 'grad_norm': 1.7732270956039429, 'learning_rate': 9.88e-05, 'epoch': 8.53}
752
+ {'loss': 0.1876, 'grad_norm': 1.8335466384887695, 'learning_rate': 9.98e-05, 'epoch': 8.62}
753
+ 43%|████████████████████████████████████████▉ | 1000/2320 [22:05<20:18, 1.08it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
754
+ ***** Running Evaluation *****
755
+ Num examples = 1344
756
+ Batch size = 1
757
+ {'eval_loss': 0.4757933020591736, 'eval_wer': 0.5221706273331512, 'eval_runtime': 39.8291, 'eval_samples_per_second': 33.744, 'eval_steps_per_second': 33.744, 'epoch': 8.62}
758
+ {'loss': 0.2456, 'grad_norm': 2.52902889251709, 'learning_rate': 9.939393939393939e-05, 'epoch': 8.71}
759
+ {'loss': 0.2499, 'grad_norm': 1.7294162511825562, 'learning_rate': 9.863636363636364e-05, 'epoch': 8.79}
760
+ {'loss': 0.1854, 'grad_norm': 21.9121150970459, 'learning_rate': 9.787878787878789e-05, 'epoch': 8.88}
761
+ {'loss': 0.2576, 'grad_norm': 3.9164559841156006, 'learning_rate': 9.712121212121212e-05, 'epoch': 8.97}
762
+ {'loss': 0.2118, 'grad_norm': 1.239221215248108, 'learning_rate': 9.636363636363637e-05, 'epoch': 9.05}
763
+ {'loss': 0.1577, 'grad_norm': 3.1416544914245605, 'learning_rate': 9.560606060606061e-05, 'epoch': 9.14}
764
+ {'loss': 0.2092, 'grad_norm': 2.4253621101379395, 'learning_rate': 9.484848484848486e-05, 'epoch': 9.22}
765
+ {'loss': 0.1876, 'grad_norm': 1.194345474243164, 'learning_rate': 9.40909090909091e-05, 'epoch': 9.31}
766
+ {'loss': 0.1546, 'grad_norm': 2.411029100418091, 'learning_rate': 9.333333333333334e-05, 'epoch': 9.4}
767
+ {'loss': 0.2232, 'grad_norm': 3.246082067489624, 'learning_rate': 9.257575757575758e-05, 'epoch': 9.48}
768
+ 47%|█████████████████████████████████████████████ | 1100/2320 [24:18<14:01, 1.45it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
769
+ ***** Running Evaluation *****
770
+ Num examples = 1344
771
+ Batch size = 1
772
+ {'eval_loss': 0.45077577233314514, 'eval_wer': 0.48921059819721385, 'eval_runtime': 39.9221, 'eval_samples_per_second': 33.666, 'eval_steps_per_second': 33.666, 'epoch': 9.48}
773
+ {'loss': 0.1777, 'grad_norm': 1.3427454233169556, 'learning_rate': 9.181818181818183e-05, 'epoch': 9.57}
774
+ {'loss': 0.1646, 'grad_norm': 1.5090447664260864, 'learning_rate': 9.106060606060606e-05, 'epoch': 9.66}
775
+ {'loss': 0.225, 'grad_norm': 1.3060975074768066, 'learning_rate': 9.030303030303031e-05, 'epoch': 9.74}
776
+ {'loss': 0.1552, 'grad_norm': 1.3011540174484253, 'learning_rate': 8.954545454545455e-05, 'epoch': 9.83}
777
+ {'loss': 0.1715, 'grad_norm': 1.9938538074493408, 'learning_rate': 8.87878787878788e-05, 'epoch': 9.91}
778
+ {'loss': 0.2092, 'grad_norm': 3.334385395050049, 'learning_rate': 8.803030303030304e-05, 'epoch': 10.0}
779
+ {'loss': 0.14, 'grad_norm': 1.011092185974121, 'learning_rate': 8.727272727272727e-05, 'epoch': 10.09}
780
+ {'loss': 0.1512, 'grad_norm': 2.517902135848999, 'learning_rate': 8.651515151515152e-05, 'epoch': 10.17}
781
+ {'loss': 0.1846, 'grad_norm': 1.2418378591537476, 'learning_rate': 8.575757575757576e-05, 'epoch': 10.26}
782
+ {'loss': 0.1332, 'grad_norm': 1.5885329246520996, 'learning_rate': 8.5e-05, 'epoch': 10.34}
783
+ 52%|█████████████████████████████████████████████████▏ | 1200/2320 [26:37<18:40, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
784
+ ***** Running Evaluation *****
785
+ Num examples = 1344
786
+ Batch size = 1
787
+ {'eval_loss': 0.4394075274467468, 'eval_wer': 0.4740052808886461, 'eval_runtime': 39.9367, 'eval_samples_per_second': 33.653, 'eval_steps_per_second': 33.653, 'epoch': 10.34}
788
+ 52%|█████████████████████████████████████████████████▏ | 1200/2320 [27:17<18:40, 1.00s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1200
789
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/config.json
790
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/model.safetensors
791
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/preprocessor_config.json
792
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/tokenizer_config.json
793
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/special_tokens_map.json
794
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/added_tokens.json
795
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
796
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
797
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
798
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
799
+ {'loss': 0.1485, 'grad_norm': 1.2539469003677368, 'learning_rate': 8.424242424242424e-05, 'epoch': 10.43}
800
+ {'loss': 0.1988, 'grad_norm': 1.357601284980774, 'learning_rate': 8.348484848484849e-05, 'epoch': 10.52}
801
+ 53%|██████████████████████████████████████████████████▏ | 1227/2320 [27:45<19:01, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
802
+ return F.conv1d(input, weight, bias, self.stride,
803
+ {'loss': 0.137, 'grad_norm': 2.0564587116241455, 'learning_rate': 8.272727272727273e-05, 'epoch': 10.6}
804
+ {'loss': 0.1245, 'grad_norm': 2.48364520072937, 'learning_rate': 8.196969696969698e-05, 'epoch': 10.69}
805
+ {'loss': 0.1602, 'grad_norm': 1.015891671180725, 'learning_rate': 8.121212121212121e-05, 'epoch': 10.78}
806
+ {'loss': 0.1215, 'grad_norm': 1.1023950576782227, 'learning_rate': 8.045454545454546e-05, 'epoch': 10.86}
807
+ {'loss': 0.1621, 'grad_norm': 2.703427791595459, 'learning_rate': 7.96969696969697e-05, 'epoch': 10.95}
808
+ {'loss': 0.1651, 'grad_norm': 1.1821691989898682, 'learning_rate': 7.893939393939395e-05, 'epoch': 11.03}
809
+ {'loss': 0.1066, 'grad_norm': 0.930283784866333, 'learning_rate': 7.818181818181818e-05, 'epoch': 11.12}
810
+ {'loss': 0.1085, 'grad_norm': 1.6548758745193481, 'learning_rate': 7.742424242424243e-05, 'epoch': 11.21}
811
+ 56%|█████████████████████████████████████████████████████▏ | 1300/2320 [28:53<12:42, 1.34it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
812
+ ***** Running Evaluation *****
813
+ Num examples = 1344
814
+ Batch size = 1
815
+ {'eval_loss': 0.4466467499732971, 'eval_wer': 0.46207775653282346, 'eval_runtime': 39.8633, 'eval_samples_per_second': 33.715, 'eval_steps_per_second': 33.715, 'epoch': 11.21}
816
+ {'loss': 0.1418, 'grad_norm': 1.1760716438293457, 'learning_rate': 7.666666666666667e-05, 'epoch': 11.29}
817
+ {'loss': 0.1133, 'grad_norm': 2.1062755584716797, 'learning_rate': 7.59090909090909e-05, 'epoch': 11.38}
818
+ {'loss': 0.1318, 'grad_norm': 2.67399001121521, 'learning_rate': 7.515151515151515e-05, 'epoch': 11.47}
819
+ {'loss': 0.1474, 'grad_norm': 1.0049142837524414, 'learning_rate': 7.439393939393939e-05, 'epoch': 11.55}
820
+ {'loss': 0.0908, 'grad_norm': 1.586559772491455, 'learning_rate': 7.363636363636364e-05, 'epoch': 11.64}
821
+ {'loss': 0.1521, 'grad_norm': 3.784040927886963, 'learning_rate': 7.287878787878788e-05, 'epoch': 11.72}
822
+ {'loss': 0.1163, 'grad_norm': 1.125501275062561, 'learning_rate': 7.212121212121213e-05, 'epoch': 11.81}
823
+ {'loss': 0.1109, 'grad_norm': 2.1989808082580566, 'learning_rate': 7.136363636363636e-05, 'epoch': 11.9}
824
+ {'loss': 0.152, 'grad_norm': 1.1287301778793335, 'learning_rate': 7.060606060606061e-05, 'epoch': 11.98}
825
+ {'loss': 0.098, 'grad_norm': 1.538678765296936, 'learning_rate': 6.984848484848485e-05, 'epoch': 12.07}
826
+ 60%|█████████████████████████████████████████████████████████▎ | 1400/2320 [31:12<18:06, 1.18s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
827
+ ***** Running Evaluation *****
828
+ Num examples = 1344
829
+ Batch size = 1
830
+ {'eval_loss': 0.42302384972572327, 'eval_wer': 0.44933078393881454, 'eval_runtime': 40.1773, 'eval_samples_per_second': 33.452, 'eval_steps_per_second': 33.452, 'epoch': 12.07}
831
+ {'loss': 0.092, 'grad_norm': 1.400772213935852, 'learning_rate': 6.90909090909091e-05, 'epoch': 12.16}
832
+ {'loss': 0.1649, 'grad_norm': 3.6780846118927, 'learning_rate': 6.833333333333333e-05, 'epoch': 12.24}
833
+ {'loss': 0.091, 'grad_norm': 1.5424057245254517, 'learning_rate': 6.757575757575758e-05, 'epoch': 12.33}
834
+ {'loss': 0.0869, 'grad_norm': 1.4868180751800537, 'learning_rate': 6.681818181818183e-05, 'epoch': 12.41}
835
+ {'loss': 0.1499, 'grad_norm': 1.1947145462036133, 'learning_rate': 6.606060606060607e-05, 'epoch': 12.5}
836
+ {'loss': 0.0954, 'grad_norm': 1.0430784225463867, 'learning_rate': 6.530303030303032e-05, 'epoch': 12.59}
837
+ {'loss': 0.1032, 'grad_norm': 2.4261584281921387, 'learning_rate': 6.454545454545455e-05, 'epoch': 12.67}
838
+ {'loss': 0.1158, 'grad_norm': 1.033467411994934, 'learning_rate': 6.37878787878788e-05, 'epoch': 12.76}
839
+ {'loss': 0.0864, 'grad_norm': 1.1535651683807373, 'learning_rate': 6.303030303030302e-05, 'epoch': 12.84}
840
+ {'loss': 0.1219, 'grad_norm': 1.28826105594635, 'learning_rate': 6.227272727272727e-05, 'epoch': 12.93}
841
+ 65%|█████████████████████████████████████████████████████████████▍ | 1500/2320 [33:26<10:01, 1.36it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
842
+ ***** Running Evaluation *****
843
+ Num examples = 1344
844
+ Batch size = 1
845
+ {'eval_loss': 0.418023020029068, 'eval_wer': 0.44596194118182647, 'eval_runtime': 40.2192, 'eval_samples_per_second': 33.417, 'eval_steps_per_second': 33.417, 'epoch': 12.93}
846
+ {'loss': 0.1289, 'grad_norm': 1.055411458015442, 'learning_rate': 6.151515151515151e-05, 'epoch': 13.02}
847
+ {'loss': 0.0776, 'grad_norm': 1.1269094944000244, 'learning_rate': 6.075757575757576e-05, 'epoch': 13.1}
848
+ {'loss': 0.0871, 'grad_norm': 1.7149118185043335, 'learning_rate': 6e-05, 'epoch': 13.19}
849
+ {'loss': 0.1087, 'grad_norm': 1.7456856966018677, 'learning_rate': 5.9242424242424244e-05, 'epoch': 13.28}
850
+ {'loss': 0.0821, 'grad_norm': 1.3434715270996094, 'learning_rate': 5.848484848484849e-05, 'epoch': 13.36}
851
+ {'loss': 0.0878, 'grad_norm': 2.103512763977051, 'learning_rate': 5.772727272727273e-05, 'epoch': 13.45}
852
+ {'loss': 0.1044, 'grad_norm': 1.240224838256836, 'learning_rate': 5.696969696969697e-05, 'epoch': 13.53}
853
+ {'loss': 0.0753, 'grad_norm': 0.7336703538894653, 'learning_rate': 5.6212121212121215e-05, 'epoch': 13.62}
854
+ {'loss': 0.1059, 'grad_norm': 2.293342351913452, 'learning_rate': 5.545454545454546e-05, 'epoch': 13.71}
855
+ {'loss': 0.1021, 'grad_norm': 1.1853971481323242, 'learning_rate': 5.46969696969697e-05, 'epoch': 13.79}
856
+ 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [35:45<13:55, 1.16s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
857
+ ***** Running Evaluation *****
858
+ Num examples = 1344
859
+ Batch size = 1
860
+ {'eval_loss': 0.41785839200019836, 'eval_wer': 0.4405900027314941, 'eval_runtime': 40.2906, 'eval_samples_per_second': 33.358, 'eval_steps_per_second': 33.358, 'epoch': 13.79}
861
+ 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [36:25<13:55, 1.16s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1600
862
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/config.json
863
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/model.safetensors
864
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/preprocessor_config.json
865
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/tokenizer_config.json
866
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/special_tokens_map.json
867
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/added_tokens.json
868
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
869
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
870
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
871
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
872
+ Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-400] due to args.save_total_limit
873
+ {'loss': 0.0648, 'grad_norm': 1.331200361251831, 'learning_rate': 5.393939393939394e-05, 'epoch': 13.88}
874
+ {'loss': 0.1121, 'grad_norm': 2.28397536277771, 'learning_rate': 5.3181818181818186e-05, 'epoch': 13.97}
875
+ {'loss': 0.0725, 'grad_norm': 0.9436893463134766, 'learning_rate': 5.242424242424243e-05, 'epoch': 14.05}
876
+ {'loss': 0.0691, 'grad_norm': 1.6113288402557373, 'learning_rate': 5.166666666666667e-05, 'epoch': 14.14}
877
+ {'loss': 0.0979, 'grad_norm': 2.479888439178467, 'learning_rate': 5.090909090909091e-05, 'epoch': 14.22}
878
+ {'loss': 0.0909, 'grad_norm': 1.006616473197937, 'learning_rate': 5.015151515151515e-05, 'epoch': 14.31}
879
+ 72%|████████████████████████████████████████████████████████████████████ | 1663/2320 [37:27<11:20, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
880
+ return F.conv1d(input, weight, bias, self.stride,
881
+ {'loss': 0.0761, 'grad_norm': 1.4571704864501953, 'learning_rate': 4.93939393939394e-05, 'epoch': 14.4}
882
+ {'loss': 0.0862, 'grad_norm': 1.5729875564575195, 'learning_rate': 4.863636363636364e-05, 'epoch': 14.48}
883
+ {'loss': 0.0646, 'grad_norm': 1.2180376052856445, 'learning_rate': 4.787878787878788e-05, 'epoch': 14.57}
884
+ {'loss': 0.0741, 'grad_norm': 1.7464072704315186, 'learning_rate': 4.712121212121212e-05, 'epoch': 14.66}
885
+ 73%|█████████████████████████████████████████████████████████████████████▌ | 1700/2320 [38:02<08:27, 1.22it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
886
+ ***** Running Evaluation *****
887
+ Num examples = 1344
888
+ Batch size = 1
889
+ {'eval_loss': 0.4113341271877289, 'eval_wer': 0.4309387234817445, 'eval_runtime': 40.2841, 'eval_samples_per_second': 33.363, 'eval_steps_per_second': 33.363, 'epoch': 14.66}
890
+ {'loss': 0.1315, 'grad_norm': 0.8571386337280273, 'learning_rate': 4.6439393939393944e-05, 'epoch': 14.74}
891
+ {'loss': 0.0603, 'grad_norm': 1.331377387046814, 'learning_rate': 4.5681818181818186e-05, 'epoch': 14.83}
892
+ {'loss': 0.0796, 'grad_norm': 1.5398732423782349, 'learning_rate': 4.492424242424242e-05, 'epoch': 14.91}
893
+ {'loss': 0.085, 'grad_norm': 3.689671754837036, 'learning_rate': 4.4166666666666665e-05, 'epoch': 15.0}
894
+ {'loss': 0.0544, 'grad_norm': 1.132613182067871, 'learning_rate': 4.340909090909091e-05, 'epoch': 15.09}
895
+ {'loss': 0.0601, 'grad_norm': 1.5951859951019287, 'learning_rate': 4.265151515151515e-05, 'epoch': 15.17}
896
+ {'loss': 0.097, 'grad_norm': 0.5179944634437561, 'learning_rate': 4.189393939393939e-05, 'epoch': 15.26}
897
+ {'loss': 0.0596, 'grad_norm': 0.9744370579719543, 'learning_rate': 4.113636363636364e-05, 'epoch': 15.34}
898
+ {'loss': 0.0677, 'grad_norm': 1.8794275522232056, 'learning_rate': 4.0378787878787885e-05, 'epoch': 15.43}
899
+ {'loss': 0.0896, 'grad_norm': 0.748386025428772, 'learning_rate': 3.962121212121213e-05, 'epoch': 15.52}
900
+ 78%|█████████████████████████████████████████████████████████████████████████▋ | 1800/2320 [40:18<11:05, 1.28s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
901
+ ***** Running Evaluation *****
902
+ Num examples = 1344
903
+ Batch size = 1
904
+ {'eval_loss': 0.43920788168907166, 'eval_wer': 0.4307566238732587, 'eval_runtime': 40.1997, 'eval_samples_per_second': 33.433, 'eval_steps_per_second': 33.433, 'epoch': 15.52}
905
+ {'loss': 0.0604, 'grad_norm': 0.9639837145805359, 'learning_rate': 3.8863636363636364e-05, 'epoch': 15.6}
906
+ {'loss': 0.0711, 'grad_norm': 1.9640839099884033, 'learning_rate': 3.810606060606061e-05, 'epoch': 15.69}
907
+ {'loss': 0.0867, 'grad_norm': 1.4438735246658325, 'learning_rate': 3.734848484848485e-05, 'epoch': 15.78}
908
+ {'loss': 0.0605, 'grad_norm': 1.0062426328659058, 'learning_rate': 3.659090909090909e-05, 'epoch': 15.86}
909
+ {'loss': 0.0662, 'grad_norm': 1.6331523656845093, 'learning_rate': 3.5833333333333335e-05, 'epoch': 15.95}
910
+ {'loss': 0.0765, 'grad_norm': 0.8070217370986938, 'learning_rate': 3.507575757575758e-05, 'epoch': 16.03}
911
+ {'loss': 0.0537, 'grad_norm': 1.4137670993804932, 'learning_rate': 3.431818181818182e-05, 'epoch': 16.12}
912
+ {'loss': 0.0684, 'grad_norm': 1.5437769889831543, 'learning_rate': 3.356060606060606e-05, 'epoch': 16.21}
913
+ {'loss': 0.0744, 'grad_norm': 0.90281081199646, 'learning_rate': 3.2803030303030305e-05, 'epoch': 16.29}
914
+ {'loss': 0.0492, 'grad_norm': 1.139837622642517, 'learning_rate': 3.204545454545455e-05, 'epoch': 16.38}
915
+ 82%|█████████████████████████████████████████████████████████████████████████████▊ | 1900/2320 [42:36<06:26, 1.09it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
916
+ ***** Running Evaluation *****
917
+ Num examples = 1344
918
+ Batch size = 1
919
+ {'eval_loss': 0.4201890528202057, 'eval_wer': 0.4313029226987162, 'eval_runtime': 40.1502, 'eval_samples_per_second': 33.474, 'eval_steps_per_second': 33.474, 'epoch': 16.38}
920
+ {'loss': 0.0652, 'grad_norm': 1.679457426071167, 'learning_rate': 3.128787878787879e-05, 'epoch': 16.47}
921
+ {'loss': 0.0649, 'grad_norm': 0.6661111116409302, 'learning_rate': 3.0530303030303034e-05, 'epoch': 16.55}
922
+ {'loss': 0.0469, 'grad_norm': 1.1774355173110962, 'learning_rate': 2.9772727272727273e-05, 'epoch': 16.64}
923
+ {'loss': 0.0752, 'grad_norm': 1.783923864364624, 'learning_rate': 2.901515151515152e-05, 'epoch': 16.72}
924
+ {'loss': 0.0519, 'grad_norm': 1.176321268081665, 'learning_rate': 2.825757575757576e-05, 'epoch': 16.81}
925
+ {'loss': 0.0547, 'grad_norm': 1.3150608539581299, 'learning_rate': 2.7500000000000004e-05, 'epoch': 16.9}
926
+ {'loss': 0.0799, 'grad_norm': 0.983769953250885, 'learning_rate': 2.674242424242424e-05, 'epoch': 16.98}
927
+ {'loss': 0.0577, 'grad_norm': 0.996890127658844, 'learning_rate': 2.5984848484848483e-05, 'epoch': 17.07}
928
+ {'loss': 0.0515, 'grad_norm': 2.3034253120422363, 'learning_rate': 2.5227272727272726e-05, 'epoch': 17.16}
929
+ {'loss': 0.0759, 'grad_norm': 3.7528610229492188, 'learning_rate': 2.4469696969696972e-05, 'epoch': 17.24}
930
+ 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [44:50<03:48, 1.40it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
931
+ ***** Running Evaluation *****
932
+ Num examples = 1344
933
+ Batch size = 1
934
+ {'eval_loss': 0.43480169773101807, 'eval_wer': 0.4207411454065374, 'eval_runtime': 40.017, 'eval_samples_per_second': 33.586, 'eval_steps_per_second': 33.586, 'epoch': 17.24}
935
+ 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [45:30<03:48, 1.40it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-2000
936
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/config.json
937
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/model.safetensors
938
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/preprocessor_config.json
939
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/tokenizer_config.json
940
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/special_tokens_map.json
941
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/added_tokens.json
942
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
943
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
944
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
945
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
946
+ Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-800] due to args.save_total_limit
947
+ {'loss': 0.0419, 'grad_norm': 0.6646668314933777, 'learning_rate': 2.3712121212121214e-05, 'epoch': 17.33}
948
+ {'loss': 0.0595, 'grad_norm': 1.3250740766525269, 'learning_rate': 2.2954545454545457e-05, 'epoch': 17.41}
949
+ {'loss': 0.0691, 'grad_norm': 0.8094995021820068, 'learning_rate': 2.21969696969697e-05, 'epoch': 17.5}
950
+ {'loss': 0.052, 'grad_norm': 0.846946120262146, 'learning_rate': 2.143939393939394e-05, 'epoch': 17.59}
951
+ {'loss': 0.0565, 'grad_norm': 1.652417540550232, 'learning_rate': 2.0681818181818182e-05, 'epoch': 17.67}
952
+ {'loss': 0.0745, 'grad_norm': 1.0080279111862183, 'learning_rate': 1.9924242424242425e-05, 'epoch': 17.76}
953
+ 89%|████████████████████████████████████████████████████████████████████████████████████▌ | 2064/2320 [46:36<04:53, 1.15s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
954
+ return F.conv1d(input, weight, bias, self.stride,
955
+ {'loss': 0.0513, 'grad_norm': 0.7252691388130188, 'learning_rate': 1.9166666666666667e-05, 'epoch': 17.84}
956
+ {'loss': 0.055, 'grad_norm': 1.58548903465271, 'learning_rate': 1.840909090909091e-05, 'epoch': 17.93}
957
+ {'loss': 0.0658, 'grad_norm': 0.6634634733200073, 'learning_rate': 1.7651515151515153e-05, 'epoch': 18.02}
958
+ {'loss': 0.0406, 'grad_norm': 1.1495524644851685, 'learning_rate': 1.6893939393939395e-05, 'epoch': 18.1}
959
+ 91%|█████████████████████████████████████████████████████████████████████████████████████▉ | 2100/2320 [47:11<03:46, 1.03s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
960
+ ***** Running Evaluation *****
961
+ Num examples = 1344
962
+ Batch size = 1
963
+ {'eval_loss': 0.44191813468933105, 'eval_wer': 0.42046799599380863, 'eval_runtime': 40.0967, 'eval_samples_per_second': 33.519, 'eval_steps_per_second': 33.519, 'epoch': 18.1}
964
+ {'loss': 0.0381, 'grad_norm': 0.9788354635238647, 'learning_rate': 1.6136363636363638e-05, 'epoch': 18.19}
965
+ {'loss': 0.071, 'grad_norm': 1.093633770942688, 'learning_rate': 1.5378787878787877e-05, 'epoch': 18.28}
966
+ {'loss': 0.0439, 'grad_norm': 0.7164376974105835, 'learning_rate': 1.4621212121212122e-05, 'epoch': 18.36}
967
+ {'loss': 0.0481, 'grad_norm': 0.9887032508850098, 'learning_rate': 1.3863636363636364e-05, 'epoch': 18.45}
968
+ {'loss': 0.0571, 'grad_norm': 0.45052286982536316, 'learning_rate': 1.3106060606060607e-05, 'epoch': 18.53}
969
+ {'loss': 0.0452, 'grad_norm': 1.167181134223938, 'learning_rate': 1.234848484848485e-05, 'epoch': 18.62}
970
+ {'loss': 0.0643, 'grad_norm': 1.378661870956421, 'learning_rate': 1.159090909090909e-05, 'epoch': 18.71}
971
+ {'loss': 0.0587, 'grad_norm': 0.854932963848114, 'learning_rate': 1.0833333333333334e-05, 'epoch': 18.79}
972
+ {'loss': 0.0395, 'grad_norm': 0.8007526397705078, 'learning_rate': 1.0075757575757576e-05, 'epoch': 18.88}
973
+ {'loss': 0.074, 'grad_norm': 3.317830801010132, 'learning_rate': 9.318181818181819e-06, 'epoch': 18.97}
974
+ 95%|██████████████████████████████████████████████████████████████████████████████████████████ | 2200/2320 [49:24<01:19, 1.51it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
975
+ ***** Running Evaluation *****
976
+ Num examples = 1344
977
+ Batch size = 1
978
+ {'eval_loss': 0.43061742186546326, 'eval_wer': 0.420012746972594, 'eval_runtime': 40.0034, 'eval_samples_per_second': 33.597, 'eval_steps_per_second': 33.597, 'epoch': 18.97}
979
+ {'loss': 0.046, 'grad_norm': 0.7710875272750854, 'learning_rate': 8.56060606060606e-06, 'epoch': 19.05}
980
+ {'loss': 0.0394, 'grad_norm': 0.5200530886650085, 'learning_rate': 7.803030303030304e-06, 'epoch': 19.14}
981
+ {'loss': 0.0582, 'grad_norm': 1.3544327020645142, 'learning_rate': 7.045454545454545e-06, 'epoch': 19.22}
982
+ {'loss': 0.0606, 'grad_norm': 0.8653574585914612, 'learning_rate': 6.287878787878789e-06, 'epoch': 19.31}
983
+ {'loss': 0.0367, 'grad_norm': 1.5852700471878052, 'learning_rate': 5.530303030303031e-06, 'epoch': 19.4}
984
+ {'loss': 0.0782, 'grad_norm': 2.2167246341705322, 'learning_rate': 4.772727272727273e-06, 'epoch': 19.48}
985
+ {'loss': 0.0416, 'grad_norm': 0.5891330242156982, 'learning_rate': 4.015151515151515e-06, 'epoch': 19.57}
986
+ {'loss': 0.0515, 'grad_norm': 1.1137330532073975, 'learning_rate': 3.257575757575758e-06, 'epoch': 19.66}
987
+ {'loss': 0.0512, 'grad_norm': 0.8132285475730896, 'learning_rate': 2.5e-06, 'epoch': 19.74}
988
+ {'loss': 0.0378, 'grad_norm': 0.7994781136512756, 'learning_rate': 1.7424242424242427e-06, 'epoch': 19.83}
989
+ 99%|██████████████████████████████████████████████████████████████████████████████████████████████▏| 2300/2320 [51:43<00:20, 1.02s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
990
+ ***** Running Evaluation *****
991
+ Num examples = 1344
992
+ Batch size = 1
993
+ {'eval_loss': 0.4273350238800049, 'eval_wer': 0.41728125284530637, 'eval_runtime': 40.0934, 'eval_samples_per_second': 33.522, 'eval_steps_per_second': 33.522, 'epoch': 19.83}
994
+ {'loss': 0.0489, 'grad_norm': 0.9775754809379578, 'learning_rate': 9.848484848484847e-07, 'epoch': 19.91}
995
+ {'loss': 0.0554, 'grad_norm': 0.8857516050338745, 'learning_rate': 2.2727272727272726e-07, 'epoch': 20.0}
996
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.41it/s]
997
+
998
+ Training completed. Do not forget to share your model on huggingface.co/models =)
999
+
1000
+
1001
+ {'train_runtime': 3159.4128, 'train_samples_per_second': 23.397, 'train_steps_per_second': 0.734, 'train_loss': 0.8618391515622879, 'epoch': 20.0}
1002
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.36s/it]
1003
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1004
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1005
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1006
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1007
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1008
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1009
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1010
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1011
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1012
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1013
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1014
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1015
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1016
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1017
+ events.out.tfevents.1716174523.tz579-raptorlake.65634.0: 100%|██████████████████████████████████████| 63.2k/63.2k [00:00<00:00, 232kB/s]
1018
+ model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████| 378M/378M [03:30<00:00, 1.79MB/s]
1019
+ Upload 2 LFS files: 100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [03:31<00:00, 105.69s/it]
1020
+ ***** train metrics *****████████████████████████████████████████ | 1/2 [03:31<03:31, 211.39s/it]
1021
+ epoch = 20.0
1022
+ total_flos = 2000175347GF
1023
+ train_loss = 0.8618
1024
+ train_runtime = 0:52:39.41
1025
+ train_samples = 3696
1026
+ train_samples_per_second = 23.397
1027
+ train_steps_per_second = 0.734
1028
+ 05/19/2024 23:04:57 - INFO - __main__ - *** Evaluate ***
1029
+ The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
1030
+ ***** Running Evaluation *****
1031
+ Num examples = 1344
1032
+ Batch size = 1
1033
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:39<00:00, 34.00it/s]
1034
+ ***** eval metrics *****
1035
+ epoch = 20.0
1036
+ eval_loss = 0.4275
1037
+ eval_runtime = 0:00:39.60
1038
+ eval_samples = 1344
1039
+ eval_samples_per_second = 33.935
1040
+ eval_steps_per_second = 33.935
1041
+ eval_wer = 0.4173
1042
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1043
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1044
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1045
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1046
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1047
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1048
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1049
+ events.out.tfevents.1716177937.tz579-raptorlake.65634.1: 100%|███████████████████████████████████████████| 406/406 [00:00<00:00, 884B/s]
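With training finished (final eval_wer ≈ 0.417 in the metrics above), a minimal inference sketch for the saved checkpoint follows. This is an editorial illustration, not part of the commit: the checkpoint directory ./wav2vec2-base-timit-fine-tuned comes from the log, while sample.wav (a 16 kHz mono recording) and the soundfile loader are assumptions.

import torch
import soundfile as sf  # assumed audio loader; any 16 kHz mono reader works
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor (feature extractor + tokenizer) and model saved at the end of training.
processor = Wav2Vec2Processor.from_pretrained("./wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-base-timit-fine-tuned").eval()

speech, sampling_rate = sf.read("sample.wav")  # hypothetical input file
inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits  # (batch, time, vocab)

pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding, no language model
print(processor.batch_decode(pred_ids)[0])

Since the run was launched with --push_to_hub, the local path can be replaced by the Hub repo id when decoding elsewhere.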
run.timit.sh ADDED
@@ -0,0 +1,30 @@
1
+ export HF_TOKEN=`cat /home/huggingface.token`
2
+ export HF_HOME="/home/Work/common_huggingface"
3
+
4
+ python run_speech_recognition_ctc.py \
5
+ --token="${HF_TOKEN}" \
6
+ --dataset_name="timit_asr" \
7
+ --dataset_path="/home/Work_/common_darpa/Timit_data/data" \
8
+ --model_name_or_path="facebook/wav2vec2-base" \
9
+ --overwrite_output_dir \
10
+ --output_dir="./wav2vec2-base-timit-fine-tuned" \
11
+ --train_split_name="train" \
12
+ --num_train_epochs="20" \
13
+ --per_device_train_batch_size="32" \
14
+ --per_device_eval_batch_size="1" \
15
+ --weight_decay="0.005" \
16
+ --learning_rate="1e-4" \
17
+ --warmup_steps="1000" \
18
+ --evaluation_strategy="steps" \
19
+ --text_column_name="text" \
20
+ --save_steps="400" \
21
+ --eval_steps="100" \
22
+ --logging_steps="10" \
23
+ --layerdrop="0.0" \
24
+ --save_total_limit="3" \
25
+ --freeze_feature_encoder \
26
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
27
+ --fp16 \
28
+ --group_by_length \
29
+ --push_to_hub \
30
+ --do_train --do_eval
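The flag values above also explain the step count seen in the training log. A short editorial sketch of the arithmetic (train_samples and the 2320-step total are read from the log; the ceil-per-epoch formula is the usual Trainer behaviour, stated here only for orientation):

import math

train_samples = 3696   # "train_samples = 3696" in the train metrics
batch_size = 32        # --per_device_train_batch_size
epochs = 20            # --num_train_epochs

steps_per_epoch = math.ceil(train_samples / batch_size)  # 116
total_steps = steps_per_epoch * epochs                   # 2320, matching "2320/2320" in the log
print(steps_per_epoch, total_steps)

The logged learning rates are likewise consistent with the default linear schedule: warmup to 1e-4 over the first 1000 steps, then linear decay to zero at step 2320.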
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,840 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
18
+
19
+ import functools
20
+ import json
21
+ import logging
22
+ import os
23
+ import re
24
+ import sys
25
+ import warnings
26
+ from dataclasses import dataclass, field
27
+ from typing import Dict, List, Optional, Union
28
+
29
+ import datasets
30
+ import evaluate
31
+ import torch
32
+ from datasets import DatasetDict, load_dataset
33
+
34
+ import transformers
35
+ from transformers import (
36
+ AutoConfig,
37
+ AutoFeatureExtractor,
38
+ AutoModelForCTC,
39
+ AutoProcessor,
40
+ AutoTokenizer,
41
+ HfArgumentParser,
42
+ Trainer,
43
+ TrainingArguments,
44
+ Wav2Vec2Processor,
45
+ set_seed,
46
+ )
47
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
48
+ from transformers.utils import check_min_version, send_example_telemetry
49
+ from transformers.utils.versions import require_version
50
+
51
+
52
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
53
+ check_min_version("4.41.0.dev0")
54
+
55
+ require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
+ def list_field(default=None, metadata=None):
62
+ return field(default_factory=lambda: default, metadata=metadata)
63
+
64
+
65
+ @dataclass
66
+ class ModelArguments:
67
+ """
68
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
69
+ """
70
+
71
+ model_name_or_path: str = field(
72
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
73
+ )
74
+ tokenizer_name_or_path: Optional[str] = field(
75
+ default=None,
76
+ metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
77
+ )
78
+ cache_dir: Optional[str] = field(
79
+ default=None,
80
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
81
+ )
82
+ freeze_feature_encoder: bool = field(
83
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
84
+ )
85
+ attention_dropout: float = field(
86
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
87
+ )
88
+ activation_dropout: float = field(
89
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
90
+ )
91
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
92
+ hidden_dropout: float = field(
93
+ default=0.0,
94
+ metadata={
95
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
96
+ },
97
+ )
98
+ final_dropout: float = field(
99
+ default=0.0,
100
+ metadata={"help": "The dropout probability for the final projection layer."},
101
+ )
102
+ mask_time_prob: float = field(
103
+ default=0.05,
104
+ metadata={
105
+ "help": (
106
+ "Probability of each feature vector along the time axis to be chosen as the start of the vector "
107
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
108
+ "vectors will be masked along the time axis."
109
+ )
110
+ },
111
+ )
112
+ mask_time_length: int = field(
113
+ default=10,
114
+ metadata={"help": "Length of vector span to mask along the time axis."},
115
+ )
116
+ mask_feature_prob: float = field(
117
+ default=0.0,
118
+ metadata={
119
+ "help": (
120
+ "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
121
+ " to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
122
+ " bins will be masked along the time axis."
123
+ )
124
+ },
125
+ )
126
+ mask_feature_length: int = field(
127
+ default=10,
128
+ metadata={"help": "Length of vector span to mask along the feature axis."},
129
+ )
130
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
131
+ ctc_loss_reduction: Optional[str] = field(
132
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
133
+ )
134
+ ctc_zero_infinity: Optional[bool] = field(
135
+ default=False,
136
+ metadata={
137
+ "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly"
138
+ " occur when the inputs are too short to be aligned to the targets."
139
+ },
140
+ )
141
+ add_adapter: Optional[bool] = field(
142
+ default=False,
143
+ metadata={
144
+ "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very"
145
+ "useful to downsample the output length."
146
+ },
147
+ )
148
+
149
+
150
+ @dataclass
151
+ class DataTrainingArguments:
152
+ """
153
+ Arguments pertaining to what data we are going to input our model for training and eval.
154
+
155
+ Using `HfArgumentParser` we can turn this class
156
+ into argparse arguments to be able to specify them on
157
+ the command line.
158
+ """
159
+
160
+ dataset_name: str = field(
161
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
162
+ )
163
+ dataset_path: str = field(
164
+ default=None, metadata={"help": "The configuration path of the dataset to use (via the datasets library)."}
165
+ )
166
+ dataset_config_name: str = field(
167
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
168
+ )
169
+ train_split_name: str = field(
170
+ default="train+validation",
171
+ metadata={
172
+ "help": (
173
+ "The name of the training data set split to use (via the datasets library). Defaults to "
174
+ "'train+validation'"
175
+ )
176
+ },
177
+ )
178
+ eval_split_name: str = field(
179
+ default="test",
180
+ metadata={
181
+ "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
182
+ },
183
+ )
184
+ audio_column_name: str = field(
185
+ default="audio",
186
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
187
+ )
188
+ text_column_name: str = field(
189
+ default="text",
190
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
191
+ )
192
+ overwrite_cache: bool = field(
193
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
194
+ )
195
+ preprocessing_num_workers: Optional[int] = field(
196
+ default=None,
197
+ metadata={"help": "The number of processes to use for the preprocessing."},
198
+ )
199
+ max_train_samples: Optional[int] = field(
200
+ default=None,
201
+ metadata={
202
+ "help": (
203
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
204
+ "value if set."
205
+ )
206
+ },
207
+ )
208
+ max_eval_samples: Optional[int] = field(
209
+ default=None,
210
+ metadata={
211
+ "help": (
212
+ "For debugging purposes or quicker training, truncate the number of validation examples to this "
213
+ "value if set."
214
+ )
215
+ },
216
+ )
217
+ chars_to_ignore: Optional[List[str]] = list_field(
218
+ default=None,
219
+ metadata={"help": "A list of characters to remove from the transcripts."},
220
+ )
221
+ eval_metrics: List[str] = list_field(
222
+ default=["wer"],
223
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
224
+ )
225
+ max_duration_in_seconds: float = field(
226
+ default=20.0,
227
+ metadata={
228
+ "help": (
229
+ "Filter audio files that are longer than `max_duration_in_seconds` seconds to"
230
+ " 'max_duration_in_seconds`"
231
+ )
232
+ },
233
+ )
234
+ min_duration_in_seconds: float = field(
235
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
236
+ )
237
+ preprocessing_only: bool = field(
238
+ default=False,
239
+ metadata={
240
+ "help": (
241
+ "Whether to only do data preprocessing and skip training. This is especially useful when data"
242
+ " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
243
+ " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
244
+ " can consequently be loaded in distributed training"
245
+ )
246
+ },
247
+ )
248
+ token: str = field(
249
+ default=None,
250
+ metadata={
251
+ "help": (
252
+ "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
253
+ "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
254
+ )
255
+ },
256
+ )
257
+ use_auth_token: bool = field(
258
+ default=None,
259
+ metadata={
260
+ "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
261
+ },
262
+ )
263
+ trust_remote_code: bool = field(
264
+ default=False,
265
+ metadata={
266
+ "help": (
267
+ "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
268
+ "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
269
+ "execute code present on the Hub on your local machine."
270
+ )
271
+ },
272
+ )
273
+ unk_token: str = field(
274
+ default="[UNK]",
275
+ metadata={"help": "The unk token for the tokenizer"},
276
+ )
277
+ pad_token: str = field(
278
+ default="[PAD]",
279
+ metadata={"help": "The padding token for the tokenizer"},
280
+ )
281
+ word_delimiter_token: str = field(
282
+ default="|",
283
+ metadata={"help": "The word delimiter token for the tokenizer"},
284
+ )
285
+ phoneme_language: Optional[str] = field(
286
+ default=None,
287
+ metadata={
288
+ "help": (
289
+ "The target language that should be used be"
290
+ " passed to the tokenizer for tokenization. Note that"
291
+ " this is only relevant if the model classifies the"
292
+ " input audio to a sequence of phoneme sequences."
293
+ )
294
+ },
295
+ )
296
+
297
+
298
+ @dataclass
299
+ class DataCollatorCTCWithPadding:
300
+ """
301
+ Data collator that will dynamically pad the inputs received.
302
+ Args:
303
+ processor (:class:`~transformers.AutoProcessor`)
304
+ The processor used for processing the data.
305
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
306
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
307
+ among:
308
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
309
+ sequence is provided).
310
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
311
+ maximum acceptable input length for the model if that argument is not provided.
312
+ * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
313
+ different lengths).
314
+ max_length (:obj:`int`, `optional`):
315
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
316
+ max_length_labels (:obj:`int`, `optional`):
317
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
318
+ pad_to_multiple_of (:obj:`int`, `optional`):
319
+ If set will pad the sequence to a multiple of the provided value.
320
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
321
+ 7.5 (Volta).
322
+ """
323
+
324
+ processor: AutoProcessor
325
+ padding: Union[bool, str] = "longest"
326
+ pad_to_multiple_of: Optional[int] = None
327
+ pad_to_multiple_of_labels: Optional[int] = None
328
+ feature_extractor_input_name: Optional[str] = "input_values"
329
+
330
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
331
+ # split inputs and labels since they have to be of different lengths and need
332
+ # different padding methods
333
+ input_features = [
334
+ {self.feature_extractor_input_name: feature[self.feature_extractor_input_name]} for feature in features
335
+ ]
336
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
337
+
338
+ batch = self.processor.pad(
339
+ input_features,
340
+ padding=self.padding,
341
+ pad_to_multiple_of=self.pad_to_multiple_of,
342
+ return_tensors="pt",
343
+ )
344
+
345
+ labels_batch = self.processor.pad(
346
+ labels=label_features,
347
+ padding=self.padding,
348
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
349
+ return_tensors="pt",
350
+ )
351
+
352
+ # replace padding with -100 to ignore loss correctly
353
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
354
+
355
+ batch["labels"] = labels
356
+ if "attention_mask" in batch:
357
+ batch["attention_mask"] = batch["attention_mask"].to(torch.long)
358
+
359
+ return batch
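# Editor's note (minimal usage sketch, assuming `processor` is the Wav2Vec2Processor assembled
# later in this script; not part of the original commit):
#     collator = DataCollatorCTCWithPadding(processor=processor)
#     batch = collator([{"input_values": [0.0] * 16000, "labels": [5, 2, 9]},
#                       {"input_values": [0.0] * 8000,  "labels": [7]}])
# "input_values" is padded to the longest waveform in the list, while padded label positions are
# replaced by -100 so that the CTC loss ignores them.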
360
+
361
+
362
+ def create_vocabulary_from_data(
363
+ datasets: DatasetDict,
364
+ word_delimiter_token: Optional[str] = None,
365
+ unk_token: Optional[str] = None,
366
+ pad_token: Optional[str] = None,
367
+ ):
368
+ # Given training and test labels create vocabulary
369
+ def extract_all_chars(batch):
370
+ all_text = " ".join(batch["target_text"])
371
+ vocab = list(set(all_text))
372
+ return {"vocab": [vocab], "all_text": [all_text]}
373
+
374
+ vocabs = datasets.map(
375
+ extract_all_chars,
376
+ batched=True,
377
+ batch_size=-1,
378
+ keep_in_memory=True,
379
+ remove_columns=datasets["train"].column_names,
380
+ )
381
+
382
+ # take union of all unique characters in each dataset
383
+ vocab_set = functools.reduce(
384
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
385
+ )
386
+
387
+ vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}
388
+
389
+ # replace white space with delimiter token
390
+ if word_delimiter_token is not None:
391
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
392
+ del vocab_dict[" "]
393
+
394
+ # add unk and pad token
395
+ if unk_token is not None:
396
+ vocab_dict[unk_token] = len(vocab_dict)
397
+
398
+ if pad_token is not None:
399
+ vocab_dict[pad_token] = len(vocab_dict)
400
+
401
+ return vocab_dict
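# Editor's note (toy illustration, not TIMIT data): for the target texts ["a cat", "a dog"] this
# function would return
#     {"|": 0, "a": 1, "c": 2, "d": 3, "g": 4, "o": 5, "t": 6, "[UNK]": 7, "[PAD]": 8}
# with the defaults used here: the space character keeps its index but is renamed to the word
# delimiter "|", and the unk/pad tokens are appended at the end.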
402
+
403
+
404
+ def main():
405
+ # See all possible arguments in src/transformers/training_args.py
406
+ # or by passing the --help flag to this script.
407
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
408
+
409
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
410
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
411
+ # If we pass only one argument to the script and it's the path to a json file,
412
+ # let's parse it to get our arguments.
413
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
414
+ else:
415
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
416
+
417
+ if data_args.use_auth_token is not None:
418
+ warnings.warn(
419
+ "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
420
+ FutureWarning,
421
+ )
422
+ if data_args.token is not None:
423
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
424
+ data_args.token = data_args.use_auth_token
425
+
426
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
427
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
428
+ send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)
429
+
430
+ # Detecting last checkpoint.
431
+ last_checkpoint = None
432
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
433
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
434
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
435
+ raise ValueError(
436
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
437
+ "Use --overwrite_output_dir to overcome."
438
+ )
439
+ elif last_checkpoint is not None:
440
+ logger.info(
441
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
442
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
443
+ )
444
+
445
+ # Setup logging
446
+ logging.basicConfig(
447
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
448
+ datefmt="%m/%d/%Y %H:%M:%S",
449
+ handlers=[logging.StreamHandler(sys.stdout)],
450
+ )
451
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
452
+
453
+ # Log a short summary on each process:
454
+ logger.warning(
455
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
456
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
457
+ )
458
+ # Set the verbosity to info of the Transformers logger (on main process only):
459
+ if is_main_process(training_args.local_rank):
460
+ transformers.utils.logging.set_verbosity_info()
461
+ logger.info("Training/evaluation parameters %s", training_args)
462
+
463
+ # Set seed before initializing model.
464
+ set_seed(training_args.seed)
465
+
466
+ # 1. First, let's load the dataset
467
+ raw_datasets = DatasetDict()
468
+
469
+ if training_args.do_train:
470
+ raw_datasets["train"] = load_dataset(
471
+ data_args.dataset_name,
472
+ data_args.dataset_config_name,
473
+ data_dir=data_args.dataset_path,
474
+ split=data_args.train_split_name,
475
+ token=data_args.token,
476
+ )
477
+
478
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
479
+ raise ValueError(
480
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
481
+ " Make sure to set `--audio_column_name` to the correct audio column - one of"
482
+ f" {', '.join(raw_datasets['train'].column_names)}."
483
+ )
484
+
485
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
486
+ raise ValueError(
487
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
488
+ "Make sure to set `--text_column_name` to the correct text column - one of "
489
+ f"{', '.join(raw_datasets['train'].column_names)}."
490
+ )
491
+
492
+ if data_args.max_train_samples is not None:
493
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
494
+
495
+ if training_args.do_eval:
496
+ raw_datasets["eval"] = load_dataset(
497
+ data_args.dataset_name,
498
+ data_args.dataset_config_name,
499
+ data_dir=data_args.dataset_path,
500
+ split=data_args.eval_split_name,
501
+ token=data_args.token,
502
+ )
503
+
504
+ if data_args.max_eval_samples is not None:
505
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
506
+
507
+ # 2. We remove some special characters from the datasets
508
+ # that make training complicated and do not help in transcribing the speech
509
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
510
+ # that could be easily picked up by the model
511
+ chars_to_ignore_regex = (
512
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
513
+ )
514
+ text_column_name = data_args.text_column_name
515
+
516
+ def remove_special_characters(batch):
517
+ if chars_to_ignore_regex is not None:
518
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
519
+ else:
520
+ batch["target_text"] = batch[text_column_name].lower() + " "
521
+ return batch
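# Editor's note (worked example): with the chars_to_ignore list passed in run.timit.sh
# (, ? . ! - ; : " “ % ‘ ” �), a TIMIT prompt such as
#     "Don't ask me to carry an oily rag like that."
# becomes "don't ask me to carry an oily rag like that " (punctuation stripped, lower-cased,
# and a trailing space appended).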
522
+
523
+ with training_args.main_process_first(desc="dataset map special characters removal"):
524
+ raw_datasets = raw_datasets.map(
525
+ remove_special_characters,
526
+ remove_columns=[text_column_name],
527
+ desc="remove special characters from datasets",
528
+ )
529
+
530
+ # save special tokens for tokenizer
531
+ word_delimiter_token = data_args.word_delimiter_token
532
+ unk_token = data_args.unk_token
533
+ pad_token = data_args.pad_token
534
+
535
+ # 3. Next, let's load the config as we might need it to create
536
+ # the tokenizer
537
+ # load config
538
+ config = AutoConfig.from_pretrained(
539
+ model_args.model_name_or_path,
540
+ cache_dir=model_args.cache_dir,
541
+ token=data_args.token,
542
+ trust_remote_code=data_args.trust_remote_code,
543
+ )
544
+
545
+ # 4. Next, if no tokenizer file is defined,
546
+ # we create the vocabulary of the model by extracting all unique characters from
547
+ # the training and evaluation datasets
548
+ # We need to make sure that only the first rank saves the vocabulary
549
+ # make sure all processes wait until vocab is created
550
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
551
+ tokenizer_kwargs = {}
552
+ if tokenizer_name_or_path is None:
553
+ # save vocab in training output dir
554
+ tokenizer_name_or_path = training_args.output_dir
555
+
556
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
557
+
558
+ with training_args.main_process_first():
559
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
560
+ try:
561
+ os.remove(vocab_file)
562
+ except OSError:
563
+ # in shared file-systems it might be the case that
564
+ # two processes try to delete the vocab file at the same time
565
+ pass
566
+
567
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
568
+ if not os.path.isfile(vocab_file):
569
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
570
+ vocab_dict = create_vocabulary_from_data(
571
+ raw_datasets,
572
+ word_delimiter_token=word_delimiter_token,
573
+ unk_token=unk_token,
574
+ pad_token=pad_token,
575
+ )
576
+
577
+ # save vocab dict to be loaded into tokenizer
578
+ with open(vocab_file, "w") as file:
579
+ json.dump(vocab_dict, file)
580
+
581
+ # if tokenizer has just been created
582
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
583
+ tokenizer_kwargs = {
584
+ "config": config if config.tokenizer_class is not None else None,
585
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
586
+ "unk_token": unk_token,
587
+ "pad_token": pad_token,
588
+ "word_delimiter_token": word_delimiter_token,
589
+ }
590
+
591
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
592
+ # Note for distributed training, the .from_pretrained methods guarantee that only
593
+ # one local process can concurrently download model & vocab.
594
+
595
+ # load feature_extractor and tokenizer
596
+ tokenizer = AutoTokenizer.from_pretrained(
597
+ tokenizer_name_or_path,
598
+ token=data_args.token,
599
+ trust_remote_code=data_args.trust_remote_code,
600
+ **tokenizer_kwargs,
601
+ )
602
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
603
+ model_args.model_name_or_path,
604
+ cache_dir=model_args.cache_dir,
605
+ token=data_args.token,
606
+ trust_remote_code=data_args.trust_remote_code,
607
+ )
608
+
609
+ # adapt config
610
+ config.update(
611
+ {
612
+ "feat_proj_dropout": model_args.feat_proj_dropout,
613
+ "attention_dropout": model_args.attention_dropout,
614
+ "hidden_dropout": model_args.hidden_dropout,
615
+ "final_dropout": model_args.final_dropout,
616
+ "mask_time_prob": model_args.mask_time_prob,
617
+ "mask_time_length": model_args.mask_time_length,
618
+ "mask_feature_prob": model_args.mask_feature_prob,
619
+ "mask_feature_length": model_args.mask_feature_length,
620
+ "gradient_checkpointing": training_args.gradient_checkpointing,
621
+ "layerdrop": model_args.layerdrop,
622
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
623
+ "ctc_zero_infinity": model_args.ctc_zero_infinity,
624
+ "pad_token_id": tokenizer.pad_token_id,
625
+ "vocab_size": len(tokenizer),
626
+ "activation_dropout": model_args.activation_dropout,
627
+ "add_adapter": model_args.add_adapter,
628
+ }
629
+ )
630
+
631
+ # create model
632
+ model = AutoModelForCTC.from_pretrained(
633
+ model_args.model_name_or_path,
634
+ cache_dir=model_args.cache_dir,
635
+ config=config,
636
+ token=data_args.token,
637
+ trust_remote_code=data_args.trust_remote_code,
638
+ )
639
+
640
+ # freeze encoder
641
+ if model_args.freeze_feature_encoder:
642
+ model.freeze_feature_encoder()
643
+
644
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
645
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
646
+ # so that we just need to set the correct target sampling rate and normalize the input
647
+ # via the `feature_extractor`
648
+
649
+ # make sure that dataset decodes audio with correct sampling rate
650
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
651
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
652
+ raw_datasets = raw_datasets.cast_column(
653
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
654
+ )
655
+
656
+ # derive max & min input length for sample rate & max duration
657
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
658
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
659
+ audio_column_name = data_args.audio_column_name
660
+ num_workers = data_args.preprocessing_num_workers
661
+ feature_extractor_input_name = feature_extractor.model_input_names[0]
662
+
663
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
664
+ phoneme_language = data_args.phoneme_language
665
+
666
+ # Preprocessing the datasets.
667
+ # We need to read the audio files as arrays and tokenize the targets.
668
+ def prepare_dataset(batch):
669
+ # load audio
670
+ sample = batch[audio_column_name]
671
+
672
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
673
+ batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0]
674
+ # take length of raw audio waveform
675
+ batch["input_length"] = len(sample["array"].squeeze())
676
+
677
+ # encode targets
678
+ additional_kwargs = {}
679
+ if phoneme_language is not None:
680
+ additional_kwargs["phonemizer_lang"] = phoneme_language
681
+
682
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
683
+ return batch
684
+
685
+ with training_args.main_process_first(desc="dataset map preprocessing"):
686
+ vectorized_datasets = raw_datasets.map(
687
+ prepare_dataset,
688
+ remove_columns=next(iter(raw_datasets.values())).column_names,
689
+ num_proc=num_workers,
690
+ desc="preprocess datasets",
691
+ )
692
+
693
+ def is_audio_in_length_range(length):
694
+ return length > min_input_length and length < max_input_length
695
+
696
+ # filter data that is shorter than min_input_length
697
+ vectorized_datasets = vectorized_datasets.filter(
698
+ is_audio_in_length_range,
699
+ num_proc=num_workers,
700
+ input_columns=["input_length"],
701
+ )
702
+
703
+ # 7. Next, we can prepare the training.
704
+ # Let's use word error rate (WER) as our evaluation metric,
705
+ # instantiate a data collator and the trainer
706
+
707
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
708
+ eval_metrics = {metric: evaluate.load(metric, cache_dir=model_args.cache_dir) for metric in data_args.eval_metrics}
709
+
710
+ # for large datasets it is advised to run the preprocessing on a
711
+ # single machine first with ``args.preprocessing_only`` since there will most likely
712
+ # be a timeout when running the script in distributed mode.
713
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
714
+ # cached dataset
715
+ if data_args.preprocessing_only:
716
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
717
+ return
718
+
719
+ # For languages like Chinese with large vocabulary size, we need to discard logits
720
+ # and only keep the argmax, otherwise we run out of memory during evaluation.
721
+ def preprocess_logits_for_metrics(logits, labels):
722
+ pred_ids = torch.argmax(logits, dim=-1)
723
+ return pred_ids, labels
724
+
725
+ def compute_metrics(pred):
726
+ pred_ids = pred.predictions[0]
727
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
728
+
729
+ pred_str = tokenizer.batch_decode(pred_ids)
730
+ # we do not want to group tokens when computing the metrics
731
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
732
+
733
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
734
+
735
+ return metrics
736
+
737
+ # Now save everything to be able to create a single processor later
738
+ # make sure all processes wait until data is saved
739
+ with training_args.main_process_first():
740
+ # only the main process saves them
741
+ if is_main_process(training_args.local_rank):
742
+ # save feature extractor, tokenizer and config
743
+ feature_extractor.save_pretrained(training_args.output_dir)
744
+ tokenizer.save_pretrained(training_args.output_dir)
745
+ config.save_pretrained(training_args.output_dir)
746
+
747
+ try:
748
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
749
+ except (OSError, KeyError):
750
+ warnings.warn(
751
+ "Loading a processor from a feature extractor config that does not"
752
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
753
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
754
+ " `'processor_class': 'Wav2Vec2Processor'`",
755
+ FutureWarning,
756
+ )
757
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
758
+
759
+ # Instantiate custom data collator
760
+ data_collator = DataCollatorCTCWithPadding(
761
+ processor=processor, feature_extractor_input_name=feature_extractor_input_name
762
+ )
763
+
764
+ # Initialize Trainer
765
+ trainer = Trainer(
766
+ model=model,
767
+ data_collator=data_collator,
768
+ args=training_args,
769
+ compute_metrics=compute_metrics,
770
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
771
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
772
+ tokenizer=processor,
773
+ preprocess_logits_for_metrics=preprocess_logits_for_metrics,
774
+ )
775
+
776
+ # 8. Finally, we can start training
777
+
778
+ # Training
779
+ if training_args.do_train:
780
+ # use last checkpoint if it exists
781
+ if last_checkpoint is not None:
782
+ checkpoint = last_checkpoint
783
+ elif os.path.isdir(model_args.model_name_or_path):
784
+ checkpoint = model_args.model_name_or_path
785
+ else:
786
+ checkpoint = None
787
+
788
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
789
+ trainer.save_model()
790
+
791
+ metrics = train_result.metrics
792
+ max_train_samples = (
793
+ data_args.max_train_samples
794
+ if data_args.max_train_samples is not None
795
+ else len(vectorized_datasets["train"])
796
+ )
797
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
798
+
799
+ trainer.log_metrics("train", metrics)
800
+ trainer.save_metrics("train", metrics)
801
+ trainer.save_state()
802
+
803
+ # Evaluation
804
+ results = {}
805
+ if training_args.do_eval:
806
+ logger.info("*** Evaluate ***")
807
+ metrics = trainer.evaluate()
808
+ max_eval_samples = (
809
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
810
+ )
811
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
812
+
813
+ trainer.log_metrics("eval", metrics)
814
+ trainer.save_metrics("eval", metrics)
815
+
816
+ # Write model card and (optionally) push to hub
817
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
818
+ kwargs = {
819
+ "finetuned_from": model_args.model_name_or_path,
820
+ "tasks": "automatic-speech-recognition",
821
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
822
+ "dataset_args": (
823
+ f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
824
+ f" {data_args.eval_split_name}"
825
+ ),
826
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
827
+ }
828
+ if "common_voice" in data_args.dataset_name:
829
+ kwargs["language"] = config_name
830
+
831
+ if training_args.push_to_hub:
832
+ trainer.push_to_hub(**kwargs)
833
+ else:
834
+ trainer.create_model_card(**kwargs)
835
+
836
+ return results
837
+
838
+
839
+ if __name__ == "__main__":
840
+ main()
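
For orientation, here is a minimal inference sketch (not part of the commit) showing how the processor and model that this script saves to `--output_dir` can be reloaded for greedy CTC decoding. The local directory name and the `transcribe` helper are illustrative assumptions.

```python
# Minimal sketch (assumption: the script above finished with
# --output_dir ./wav2vec2-base-timit-fine-tuned, as elsewhere in this repo).
import torch
from transformers import AutoModelForCTC, AutoProcessor

model_dir = "./wav2vec2-base-timit-fine-tuned"      # assumed output_dir
processor = AutoProcessor.from_pretrained(model_dir)
model = AutoModelForCTC.from_pretrained(model_dir).eval()

def transcribe(waveform, sampling_rate=16000):
    # waveform: 1-D float array at 16 kHz, matching feature_extractor.sampling_rate
    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits             # (batch, time, vocab_size)
    pred_ids = torch.argmax(logits, dim=-1)         # greedy decoding, as in compute_metrics above
    return processor.batch_decode(pred_ids)[0]      # CTC collapse + [PAD] removal
```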
run_speech_recognition_ctc.py. ADDED
@@ -0,0 +1,835 @@
(835-line backup copy of run_speech_recognition_ctc.py; its content duplicates the script shown above and is omitted here)
runs/May24_15-21-50_tz579-raptorlake/events.out.tfevents.1716583096.tz579-raptorlake.20455.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71563281c4fabcd575cd0a3087d26a44f9ce3cb361c20f297e548acb0eb445c9
3
+ size 6192
runs/May24_15-39-25_tz579-raptorlake/events.out.tfevents.1716583898.tz579-raptorlake.21170.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953c2638529ff4539eb1a8e0ae75d76c54e1cbca06486770b8edf490b9a48786
3
+ size 6192
runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f56de21e92a371394b396346e90360ea59c7a18cf2c51d858605250a065be8d4
3
+ size 6192
runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17633b209ac376da406114e746fca837219aa83d66d90d0aaf0816712bc41868
3
+ size 6192
runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074142b6f3a30c0127fde6336a36a48e016a9bad0bd48ad40f6d74dec816ce41
3
+ size 6192
runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2079110f867071fbc495cd436b0a1360f44dbf8fcc3c41a0217289c6def7d32c
3
+ size 6604
runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca4979180e1ca2d32ca5566488d6178248d8aa0bd1caf46ed9f6285c6d11f7d
3
+ size 6811
runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301dd9f07f5890a3822f9edc73ca7e5b85cd3e4d20c17b4e2068856f1b89aaf2
3
+ size 6604
runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34d45df376f452d62294b65ea3032948330a86c9f276f172cc5c542030498b41
3
+ size 6604
runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c9eba6b7b28381ba8f17287db0093268fa02468f97b2be4a3d643ef2cb185d
3
+ size 158868
runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3f1c7a3afdbaaf70f0ab6eb3465364ec36b297ba72135f39057eeb3a379306d
3
+ size 21715
runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a39bb0aaef6ce9ea8f32d9ffba51fae1a21efd988361cd138b3bad7827ab8bc0
3
+ size 16542
runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dc9f6dad36f586790d3e4754501f93d810f582a8e97a2dd84ae3cc2683a992e
3
+ size 2705590
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": true,
19
+ "normalized": false,
20
+ "rstrip": true,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "[UNK]",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": true,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "28": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "29": {
12
+ "content": "[PAD]",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "30": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "31": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "[PAD]",
42
+ "processor_class": "Wav2Vec2Processor",
43
+ "replace_word_delimiter_char": " ",
44
+ "target_lang": null,
45
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
+ "unk_token": "[UNK]",
47
+ "word_delimiter_token": "|"
48
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ae2085582750eed1574146e140f321d91e803d129c7d445814e499b412abc85
3
+ size 5048
vocab.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "@": 1,
3
+ "[PAD]": 29,
4
+ "[UNK]": 28,
5
+ "a": 2,
6
+ "b": 3,
7
+ "c": 4,
8
+ "d": 5,
9
+ "e": 6,
10
+ "f": 7,
11
+ "g": 8,
12
+ "h": 9,
13
+ "i": 10,
14
+ "j": 11,
15
+ "k": 12,
16
+ "l": 13,
17
+ "m": 14,
18
+ "n": 15,
19
+ "o": 16,
20
+ "p": 17,
21
+ "q": 18,
22
+ "r": 19,
23
+ "s": 20,
24
+ "t": 21,
25
+ "u": 22,
26
+ "v": 23,
27
+ "w": 24,
28
+ "x": 25,
29
+ "y": 26,
30
+ "z": 27,
31
+ "|": 0
32
+ }
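
This is the character vocabulary produced by `create_vocabulary_from_data` in the training script: `|` (the word delimiter) is id 0, `@` is 1, `a` through `z` are 2 to 27, and `[UNK]`/`[PAD]` are appended as 28/29. A small sketch (file path illustrative) of loading such a file into a CTC tokenizer:

```python
# Sketch: turning the vocab.json above into a character-level CTC tokenizer.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",                    # assumed local copy of the file above
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
ids = tokenizer("she had your dark suit").input_ids          # spaces become "|" (id 0)
print(tokenizer.decode(ids, group_tokens=False))             # -> "she had your dark suit"
```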
wav2vec2-base-timit-fine-tuned./README.md ADDED
@@ -0,0 +1,101 @@
1
+ ---
2
+ license: apache-2.0
3
+ base_model: facebook/wav2vec2-base
4
+ tags:
5
+ - automatic-speech-recognition
6
+ - timit_asr
7
+ - generated_from_trainer
8
+ datasets:
9
+ - timit_asr
10
+ metrics:
11
+ - wer
12
+ model-index:
13
+ - name: wav2vec2-base-timit-fine-tuned
14
+ results:
15
+ - task:
16
+ name: Automatic Speech Recognition
17
+ type: automatic-speech-recognition
18
+ dataset:
19
+ name: TIMIT_ASR - NA
20
+ type: timit_asr
21
+ config: clean
22
+ split: test
23
+ args: 'Config: na, Training split: train, Eval split: test'
24
+ metrics:
25
+ - name: Wer
26
+ type: wer
27
+ value: 0.41728125284530637
28
+ ---
29
+
30
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
31
+ should probably proofread and complete it, then remove this comment. -->
32
+
33
+ # wav2vec2-base-timit-fine-tuned
34
+
35
+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset.
36
+ It achieves the following results on the evaluation set:
37
+ - Loss: 0.4275
38
+ - Wer: 0.4173
39
+
40
+ ## Model description
41
+
42
+ More information needed
43
+
44
+ ## Intended uses & limitations
45
+
46
+ More information needed
47
+
48
+ ## Training and evaluation data
49
+
50
+ More information needed
51
+
52
+ ## Training procedure
53
+
54
+ ### Training hyperparameters
55
+
56
+ The following hyperparameters were used during training:
57
+ - learning_rate: 0.0001
58
+ - train_batch_size: 32
59
+ - eval_batch_size: 1
60
+ - seed: 42
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: linear
63
+ - lr_scheduler_warmup_steps: 1000
64
+ - num_epochs: 20.0
65
+ - mixed_precision_training: Native AMP
66
+
67
+ ### Training results
68
+
69
+ | Training Loss | Epoch | Step | Validation Loss | Wer |
70
+ |:-------------:|:-------:|:----:|:---------------:|:------:|
71
+ | 3.1618 | 0.8621 | 100 | 3.1117 | 1.0 |
72
+ | 2.9798 | 1.7241 | 200 | 2.9736 | 1.0 |
73
+ | 2.9144 | 2.5862 | 300 | 2.9075 | 1.0 |
74
+ | 2.1714 | 3.4483 | 400 | 2.0945 | 1.0325 |
75
+ | 1.1579 | 4.3103 | 500 | 1.0451 | 0.8299 |
76
+ | 0.6087 | 5.1724 | 600 | 0.6754 | 0.6441 |
77
+ | 0.481 | 6.0345 | 700 | 0.5275 | 0.5761 |
78
+ | 0.3072 | 6.8966 | 800 | 0.4836 | 0.5264 |
79
+ | 0.332 | 7.7586 | 900 | 0.4403 | 0.5234 |
80
+ | 0.1876 | 8.6207 | 1000 | 0.4758 | 0.5222 |
81
+ | 0.2232 | 9.4828 | 1100 | 0.4508 | 0.4892 |
82
+ | 0.1332 | 10.3448 | 1200 | 0.4394 | 0.4740 |
83
+ | 0.1085 | 11.2069 | 1300 | 0.4466 | 0.4621 |
84
+ | 0.098 | 12.0690 | 1400 | 0.4230 | 0.4493 |
85
+ | 0.1219 | 12.9310 | 1500 | 0.4180 | 0.4460 |
86
+ | 0.1021 | 13.7931 | 1600 | 0.4179 | 0.4406 |
87
+ | 0.0741 | 14.6552 | 1700 | 0.4113 | 0.4309 |
88
+ | 0.0896 | 15.5172 | 1800 | 0.4392 | 0.4308 |
89
+ | 0.0492 | 16.3793 | 1900 | 0.4202 | 0.4313 |
90
+ | 0.0759 | 17.2414 | 2000 | 0.4348 | 0.4207 |
91
+ | 0.0406 | 18.1034 | 2100 | 0.4419 | 0.4205 |
92
+ | 0.074 | 18.9655 | 2200 | 0.4306 | 0.4200 |
93
+ | 0.0378 | 19.8276 | 2300 | 0.4273 | 0.4173 |
94
+
95
+
96
+ ### Framework versions
97
+
98
+ - Transformers 4.42.0.dev0
99
+ - Pytorch 2.3.0.post300
100
+ - Datasets 2.19.1
101
+ - Tokenizers 0.19.1
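
The auto-generated card above stops at the framework versions; as a usage hint (not part of the generated card), a one-liner through the ASR pipeline. The local path and audio file name are placeholders.

```python
from transformers import pipeline

# Placeholder paths: a local clone of this checkpoint and any audio file readable by ffmpeg.
asr = pipeline("automatic-speech-recognition", model="./wav2vec2-base-timit-fine-tuned")
print(asr("sample.wav")["text"])
```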
wav2vec2-base-timit-fine-tuned./added_tokens.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "</s>": 30,
3
+ "<s>": 29
4
+ }
wav2vec2-base-timit-fine-tuned./all_results.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 0.42749759554862976,
4
+ "eval_runtime": 39.6053,
5
+ "eval_samples": 1344,
6
+ "eval_samples_per_second": 33.935,
7
+ "eval_steps_per_second": 33.935,
8
+ "eval_wer": 0.41728125284530637,
9
+ "total_flos": 2.1476719263248095e+18,
10
+ "train_loss": 0.8618391515622879,
11
+ "train_runtime": 3159.4128,
12
+ "train_samples": 3696,
13
+ "train_samples_per_second": 23.397,
14
+ "train_steps_per_second": 0.734
15
+ }
wav2vec2-base-timit-fine-tuned./config.json ADDED
@@ -0,0 +1,119 @@
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "freeze_feat_extract_train": true,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.0,
59
+ "hidden_size": 768,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 3072,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.0,
64
+ "mask_channel_length": 10,
65
+ "mask_channel_min_space": 1,
66
+ "mask_channel_other": 0.0,
67
+ "mask_channel_prob": 0.0,
68
+ "mask_channel_selection": "static",
69
+ "mask_feature_length": 10,
70
+ "mask_feature_min_masks": 0,
71
+ "mask_feature_prob": 0.0,
72
+ "mask_time_length": 10,
73
+ "mask_time_min_masks": 2,
74
+ "mask_time_min_space": 1,
75
+ "mask_time_other": 0.0,
76
+ "mask_time_prob": 0.05,
77
+ "mask_time_selection": "static",
78
+ "model_type": "wav2vec2",
79
+ "no_mask_channel_overlap": false,
80
+ "no_mask_time_overlap": false,
81
+ "num_adapter_layers": 3,
82
+ "num_attention_heads": 12,
83
+ "num_codevector_groups": 2,
84
+ "num_codevectors_per_group": 320,
85
+ "num_conv_pos_embedding_groups": 16,
86
+ "num_conv_pos_embeddings": 128,
87
+ "num_feat_extract_layers": 7,
88
+ "num_hidden_layers": 12,
89
+ "num_negatives": 100,
90
+ "output_hidden_size": 768,
91
+ "pad_token_id": 28,
92
+ "proj_codevector_dim": 256,
93
+ "tdnn_dilation": [
94
+ 1,
95
+ 2,
96
+ 3,
97
+ 1,
98
+ 1
99
+ ],
100
+ "tdnn_dim": [
101
+ 512,
102
+ 512,
103
+ 512,
104
+ 512,
105
+ 1500
106
+ ],
107
+ "tdnn_kernel": [
108
+ 5,
109
+ 3,
110
+ 3,
111
+ 1,
112
+ 1
113
+ ],
114
+ "torch_dtype": "float32",
115
+ "transformers_version": "4.42.0.dev0",
116
+ "use_weighted_layer_sum": false,
117
+ "vocab_size": 31,
118
+ "xvector_output_dim": 512
119
+ }
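
The CTC-specific fields above (`ctc_loss_reduction`, `pad_token_id`, `vocab_size`, the masking probabilities) are the ones the training script writes into the base config via `config.update(...)`. A sketch reproducing that step by hand, with values taken from this file:

```python
# Sketch: adapting the base config the same way the script's config.update(...) does.
from transformers import AutoConfig, AutoModelForCTC

config = AutoConfig.from_pretrained("facebook/wav2vec2-base")
config.update(
    {
        "ctc_loss_reduction": "mean",   # value in this config.json
        "pad_token_id": 28,             # id of [PAD] in the fine-tuned vocab
        "vocab_size": 31,               # size of the CTC output layer
        "mask_time_prob": 0.05,
        "layerdrop": 0.0,
    }
)
# a fresh, randomly initialized CTC head of size 31 is added on top of the pretrained encoder
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base", config=config)
```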
wav2vec2-base-timit-fine-tuned./eval_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 0.42749759554862976,
4
+ "eval_runtime": 39.6053,
5
+ "eval_samples": 1344,
6
+ "eval_samples_per_second": 33.935,
7
+ "eval_steps_per_second": 33.935,
8
+ "eval_wer": 0.41728125284530637
9
+ }
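
`eval_wer` here is the word error rate returned by the script's `compute_metrics`, which wraps the `evaluate` package; a toy sketch of the same metric call (strings are made up):

```python
import evaluate

wer_metric = evaluate.load("wer")
# the script passes tokenizer.batch_decode outputs as predictions and references
print(wer_metric.compute(predictions=["she had your dark suit"],
                         references=["she had your dark suit in"]))   # 1 error / 6 words ~= 0.167
```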
wav2vec2-base-timit-fine-tuned./preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
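
These settings mean the feature extractor expects 16 kHz mono audio and z-normalizes each utterance (`do_normalize: true`) without returning an attention mask. A tiny sketch with dummy audio; the local checkpoint path is an assumption:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("./wav2vec2-base-timit-fine-tuned")  # assumed path
speech = np.random.randn(16000).astype(np.float32)        # 1 second of dummy 16 kHz audio
batch = fe(speech, sampling_rate=16000, return_tensors="pt")
print(batch.input_values.shape)                           # torch.Size([1, 16000])
print(float(batch.input_values.mean()))                   # ~0.0 after per-utterance normalization
```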
wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1499de7f8d44ad8690a4fee9818a4ec46085f303e71f1d916a3979f95334b4f
3
+ size 63169
wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:761f8f6656c0c227f5c72fd2abed63841c5757356b4cb775dfa24da593234fff
+ size 406
wav2vec2-base-timit-fine-tuned./special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false
+ }
+ }
wav2vec2-base-timit-fine-tuned./tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+ "added_tokens_decoder": {
+ "27": {
+ "content": "[UNK]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": false
+ },
+ "28": {
+ "content": "[PAD]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": false
+ },
+ "29": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "30": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "do_lower_case": false,
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "[PAD]",
+ "processor_class": "Wav2Vec2Processor",
+ "replace_word_delimiter_char": " ",
+ "target_lang": null,
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
+ "unk_token": "[UNK]",
+ "word_delimiter_token": "|"
+ }
wav2vec2-base-timit-fine-tuned./train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 20.0,
+ "total_flos": 2.1476719263248095e+18,
+ "train_loss": 0.8618391515622879,
+ "train_runtime": 3159.4128,
+ "train_samples": 3696,
+ "train_samples_per_second": 23.397,
+ "train_steps_per_second": 0.734
+ }
wav2vec2-base-timit-fine-tuned./trainer_state.json ADDED
@@ -0,0 +1,1873 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "eval_steps": 100,
6
+ "global_step": 2320,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08620689655172414,
13
+ "grad_norm": 9.595185279846191,
14
+ "learning_rate": 9e-07,
15
+ "loss": 9.1142,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.1724137931034483,
20
+ "grad_norm": 9.732986450195312,
21
+ "learning_rate": 1.9e-06,
22
+ "loss": 8.3446,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.25862068965517243,
27
+ "grad_norm": 14.272214889526367,
28
+ "learning_rate": 2.8000000000000003e-06,
29
+ "loss": 8.6592,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.3448275862068966,
34
+ "grad_norm": 15.0160493850708,
35
+ "learning_rate": 3.8e-06,
36
+ "loss": 7.6985,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.43103448275862066,
41
+ "grad_norm": 16.610979080200195,
42
+ "learning_rate": 4.800000000000001e-06,
43
+ "loss": 6.9688,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.5172413793103449,
48
+ "grad_norm": 17.26924705505371,
49
+ "learning_rate": 5.8e-06,
50
+ "loss": 6.232,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.603448275862069,
55
+ "grad_norm": 11.347734451293945,
56
+ "learning_rate": 6.800000000000001e-06,
57
+ "loss": 4.7271,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.6896551724137931,
62
+ "grad_norm": 4.237112045288086,
63
+ "learning_rate": 7.8e-06,
64
+ "loss": 3.7919,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.7758620689655172,
69
+ "grad_norm": 1.8833028078079224,
70
+ "learning_rate": 8.8e-06,
71
+ "loss": 3.3967,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.8620689655172413,
76
+ "grad_norm": 1.3788093328475952,
77
+ "learning_rate": 9.800000000000001e-06,
78
+ "loss": 3.1618,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.8620689655172413,
83
+ "eval_loss": 3.1117007732391357,
84
+ "eval_runtime": 40.0512,
85
+ "eval_samples_per_second": 33.557,
86
+ "eval_steps_per_second": 33.557,
87
+ "eval_wer": 1.0,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.9482758620689655,
92
+ "grad_norm": 1.729278802871704,
93
+ "learning_rate": 1.08e-05,
94
+ "loss": 3.0865,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.0344827586206897,
99
+ "grad_norm": 1.905969500541687,
100
+ "learning_rate": 1.18e-05,
101
+ "loss": 3.0809,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 1.1206896551724137,
106
+ "grad_norm": 0.8360918760299683,
107
+ "learning_rate": 1.2800000000000001e-05,
108
+ "loss": 3.0346,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 1.206896551724138,
113
+ "grad_norm": 0.7653716206550598,
114
+ "learning_rate": 1.3800000000000002e-05,
115
+ "loss": 3.0106,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 1.293103448275862,
120
+ "grad_norm": 0.94779372215271,
121
+ "learning_rate": 1.48e-05,
122
+ "loss": 3.0165,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 1.3793103448275863,
127
+ "grad_norm": 0.8457741737365723,
128
+ "learning_rate": 1.58e-05,
129
+ "loss": 3.0,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 1.4655172413793103,
134
+ "grad_norm": 1.4369837045669556,
135
+ "learning_rate": 1.6800000000000002e-05,
136
+ "loss": 2.9903,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 1.5517241379310345,
141
+ "grad_norm": 1.8290436267852783,
142
+ "learning_rate": 1.78e-05,
143
+ "loss": 2.9852,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.6379310344827587,
148
+ "grad_norm": 1.1530190706253052,
149
+ "learning_rate": 1.88e-05,
150
+ "loss": 2.99,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 1.7241379310344827,
155
+ "grad_norm": 1.1261711120605469,
156
+ "learning_rate": 1.9800000000000004e-05,
157
+ "loss": 2.9798,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 1.7241379310344827,
162
+ "eval_loss": 2.9736363887786865,
163
+ "eval_runtime": 39.6236,
164
+ "eval_samples_per_second": 33.919,
165
+ "eval_steps_per_second": 33.919,
166
+ "eval_wer": 1.0,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 1.8103448275862069,
171
+ "grad_norm": 0.903380811214447,
172
+ "learning_rate": 2.08e-05,
173
+ "loss": 2.9718,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 1.896551724137931,
178
+ "grad_norm": 0.4889620244503021,
179
+ "learning_rate": 2.18e-05,
180
+ "loss": 2.9766,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 1.9827586206896552,
185
+ "grad_norm": 1.3861790895462036,
186
+ "learning_rate": 2.2800000000000002e-05,
187
+ "loss": 2.9658,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 2.0689655172413794,
192
+ "grad_norm": 0.7976490259170532,
193
+ "learning_rate": 2.38e-05,
194
+ "loss": 2.9588,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 2.1551724137931036,
199
+ "grad_norm": 0.698798418045044,
200
+ "learning_rate": 2.48e-05,
201
+ "loss": 2.9523,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 2.2413793103448274,
206
+ "grad_norm": 1.0858148336410522,
207
+ "learning_rate": 2.58e-05,
208
+ "loss": 2.9496,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 2.3275862068965516,
213
+ "grad_norm": 0.5658290386199951,
214
+ "learning_rate": 2.6800000000000004e-05,
215
+ "loss": 2.9421,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 2.413793103448276,
220
+ "grad_norm": 0.5713534355163574,
221
+ "learning_rate": 2.7800000000000005e-05,
222
+ "loss": 2.9427,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 2.5,
227
+ "grad_norm": 0.7386118769645691,
228
+ "learning_rate": 2.88e-05,
229
+ "loss": 2.9228,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 2.586206896551724,
234
+ "grad_norm": 0.767816960811615,
235
+ "learning_rate": 2.98e-05,
236
+ "loss": 2.9144,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 2.586206896551724,
241
+ "eval_loss": 2.9074809551239014,
242
+ "eval_runtime": 39.8997,
243
+ "eval_samples_per_second": 33.684,
244
+ "eval_steps_per_second": 33.684,
245
+ "eval_wer": 1.0,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 2.6724137931034484,
250
+ "grad_norm": 0.8676608204841614,
251
+ "learning_rate": 3.08e-05,
252
+ "loss": 2.8965,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 2.7586206896551726,
257
+ "grad_norm": 1.6954621076583862,
258
+ "learning_rate": 3.18e-05,
259
+ "loss": 2.8815,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 2.844827586206897,
264
+ "grad_norm": 1.1631884574890137,
265
+ "learning_rate": 3.2800000000000004e-05,
266
+ "loss": 2.855,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 2.9310344827586206,
271
+ "grad_norm": 1.625454306602478,
272
+ "learning_rate": 3.38e-05,
273
+ "loss": 2.781,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 3.0172413793103448,
278
+ "grad_norm": 2.0763564109802246,
279
+ "learning_rate": 3.48e-05,
280
+ "loss": 2.7756,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 3.103448275862069,
285
+ "grad_norm": 2.036031723022461,
286
+ "learning_rate": 3.58e-05,
287
+ "loss": 2.6458,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 3.189655172413793,
292
+ "grad_norm": 1.366801142692566,
293
+ "learning_rate": 3.68e-05,
294
+ "loss": 2.5189,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 3.2758620689655173,
299
+ "grad_norm": 2.034527540206909,
300
+ "learning_rate": 3.7800000000000004e-05,
301
+ "loss": 2.433,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 3.3620689655172415,
306
+ "grad_norm": 3.8338165283203125,
307
+ "learning_rate": 3.88e-05,
308
+ "loss": 2.2885,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 3.4482758620689653,
313
+ "grad_norm": 2.3443217277526855,
314
+ "learning_rate": 3.9800000000000005e-05,
315
+ "loss": 2.1714,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 3.4482758620689653,
320
+ "eval_loss": 2.0944502353668213,
321
+ "eval_runtime": 39.7668,
322
+ "eval_samples_per_second": 33.797,
323
+ "eval_steps_per_second": 33.797,
324
+ "eval_wer": 1.0325047801147227,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 3.5344827586206895,
329
+ "grad_norm": 4.349735260009766,
330
+ "learning_rate": 4.08e-05,
331
+ "loss": 2.0881,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 3.6206896551724137,
336
+ "grad_norm": 2.450747489929199,
337
+ "learning_rate": 4.18e-05,
338
+ "loss": 1.9522,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 3.706896551724138,
343
+ "grad_norm": 2.2519729137420654,
344
+ "learning_rate": 4.2800000000000004e-05,
345
+ "loss": 1.8395,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 3.793103448275862,
350
+ "grad_norm": 2.693664789199829,
351
+ "learning_rate": 4.38e-05,
352
+ "loss": 1.7525,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 3.8793103448275863,
357
+ "grad_norm": 1.9744929075241089,
358
+ "learning_rate": 4.4800000000000005e-05,
359
+ "loss": 1.6222,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 3.9655172413793105,
364
+ "grad_norm": 3.802494764328003,
365
+ "learning_rate": 4.58e-05,
366
+ "loss": 1.5397,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 4.051724137931035,
371
+ "grad_norm": 2.301044225692749,
372
+ "learning_rate": 4.6800000000000006e-05,
373
+ "loss": 1.4376,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 4.137931034482759,
378
+ "grad_norm": 2.279372215270996,
379
+ "learning_rate": 4.78e-05,
380
+ "loss": 1.2829,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 4.224137931034483,
385
+ "grad_norm": 3.314736843109131,
386
+ "learning_rate": 4.88e-05,
387
+ "loss": 1.1976,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 4.310344827586207,
392
+ "grad_norm": 2.434694290161133,
393
+ "learning_rate": 4.9800000000000004e-05,
394
+ "loss": 1.1579,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 4.310344827586207,
399
+ "eval_loss": 1.045101284980774,
400
+ "eval_runtime": 39.7455,
401
+ "eval_samples_per_second": 33.815,
402
+ "eval_steps_per_second": 33.815,
403
+ "eval_wer": 0.8299189656742239,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 4.396551724137931,
408
+ "grad_norm": 1.8384031057357788,
409
+ "learning_rate": 5.08e-05,
410
+ "loss": 1.0684,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 4.482758620689655,
415
+ "grad_norm": 3.599148988723755,
416
+ "learning_rate": 5.1800000000000005e-05,
417
+ "loss": 1.0319,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 4.568965517241379,
422
+ "grad_norm": 2.066476583480835,
423
+ "learning_rate": 5.28e-05,
424
+ "loss": 0.9179,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 4.655172413793103,
429
+ "grad_norm": 2.2173750400543213,
430
+ "learning_rate": 5.380000000000001e-05,
431
+ "loss": 0.8838,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 4.741379310344827,
436
+ "grad_norm": 2.427091121673584,
437
+ "learning_rate": 5.4800000000000004e-05,
438
+ "loss": 0.8991,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 4.827586206896552,
443
+ "grad_norm": 2.7432241439819336,
444
+ "learning_rate": 5.580000000000001e-05,
445
+ "loss": 0.8,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 4.913793103448276,
450
+ "grad_norm": 3.254221200942993,
451
+ "learning_rate": 5.68e-05,
452
+ "loss": 0.7803,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 5.0,
457
+ "grad_norm": 4.457448482513428,
458
+ "learning_rate": 5.7799999999999995e-05,
459
+ "loss": 0.8205,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 5.086206896551724,
464
+ "grad_norm": 3.1023166179656982,
465
+ "learning_rate": 5.88e-05,
466
+ "loss": 0.6703,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 5.172413793103448,
471
+ "grad_norm": 2.5916504859924316,
472
+ "learning_rate": 5.9800000000000003e-05,
473
+ "loss": 0.6087,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 5.172413793103448,
478
+ "eval_loss": 0.6753795146942139,
479
+ "eval_runtime": 39.7485,
480
+ "eval_samples_per_second": 33.813,
481
+ "eval_steps_per_second": 33.813,
482
+ "eval_wer": 0.6440863152144223,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 5.258620689655173,
487
+ "grad_norm": 2.1707613468170166,
488
+ "learning_rate": 6.08e-05,
489
+ "loss": 0.6569,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 5.344827586206897,
494
+ "grad_norm": 2.4291555881500244,
495
+ "learning_rate": 6.18e-05,
496
+ "loss": 0.5627,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 5.431034482758621,
501
+ "grad_norm": 2.249617338180542,
502
+ "learning_rate": 6.280000000000001e-05,
503
+ "loss": 0.5381,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 5.517241379310345,
508
+ "grad_norm": 1.6661946773529053,
509
+ "learning_rate": 6.38e-05,
510
+ "loss": 0.6338,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 5.603448275862069,
515
+ "grad_norm": 2.60294771194458,
516
+ "learning_rate": 6.48e-05,
517
+ "loss": 0.5181,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 5.689655172413794,
522
+ "grad_norm": 3.3003089427948,
523
+ "learning_rate": 6.58e-05,
524
+ "loss": 0.5189,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 5.775862068965517,
529
+ "grad_norm": 1.880764126777649,
530
+ "learning_rate": 6.680000000000001e-05,
531
+ "loss": 0.564,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 5.862068965517241,
536
+ "grad_norm": 2.0575127601623535,
537
+ "learning_rate": 6.780000000000001e-05,
538
+ "loss": 0.4729,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 5.948275862068965,
543
+ "grad_norm": 2.5159761905670166,
544
+ "learning_rate": 6.879999999999999e-05,
545
+ "loss": 0.4899,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 6.0344827586206895,
550
+ "grad_norm": 1.4463504552841187,
551
+ "learning_rate": 6.98e-05,
552
+ "loss": 0.481,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 6.0344827586206895,
557
+ "eval_loss": 0.5275412201881409,
558
+ "eval_runtime": 39.9601,
559
+ "eval_samples_per_second": 33.634,
560
+ "eval_steps_per_second": 33.634,
561
+ "eval_wer": 0.5760721114449604,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 6.120689655172414,
566
+ "grad_norm": 1.788765549659729,
567
+ "learning_rate": 7.08e-05,
568
+ "loss": 0.3865,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 6.206896551724138,
573
+ "grad_norm": 1.862762212753296,
574
+ "learning_rate": 7.18e-05,
575
+ "loss": 0.3726,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 6.293103448275862,
580
+ "grad_norm": 1.6512093544006348,
581
+ "learning_rate": 7.280000000000001e-05,
582
+ "loss": 0.4116,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 6.379310344827586,
587
+ "grad_norm": 2.098067045211792,
588
+ "learning_rate": 7.38e-05,
589
+ "loss": 0.3779,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 6.4655172413793105,
594
+ "grad_norm": 3.3030078411102295,
595
+ "learning_rate": 7.48e-05,
596
+ "loss": 0.3728,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 6.551724137931035,
601
+ "grad_norm": 2.1799120903015137,
602
+ "learning_rate": 7.58e-05,
603
+ "loss": 0.4047,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 6.637931034482759,
608
+ "grad_norm": 1.862434983253479,
609
+ "learning_rate": 7.680000000000001e-05,
610
+ "loss": 0.313,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 6.724137931034483,
615
+ "grad_norm": 6.29113245010376,
616
+ "learning_rate": 7.780000000000001e-05,
617
+ "loss": 0.4052,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 6.810344827586206,
622
+ "grad_norm": 1.4220325946807861,
623
+ "learning_rate": 7.88e-05,
624
+ "loss": 0.3218,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 6.896551724137931,
629
+ "grad_norm": 2.586819648742676,
630
+ "learning_rate": 7.98e-05,
631
+ "loss": 0.3072,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 6.896551724137931,
636
+ "eval_loss": 0.4836220443248749,
637
+ "eval_runtime": 39.8762,
638
+ "eval_samples_per_second": 33.704,
639
+ "eval_steps_per_second": 33.704,
640
+ "eval_wer": 0.5264499681325685,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 6.982758620689655,
645
+ "grad_norm": 1.6589460372924805,
646
+ "learning_rate": 8.080000000000001e-05,
647
+ "loss": 0.3862,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 7.068965517241379,
652
+ "grad_norm": 1.7299175262451172,
653
+ "learning_rate": 8.18e-05,
654
+ "loss": 0.2938,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 7.155172413793103,
659
+ "grad_norm": 2.0545098781585693,
660
+ "learning_rate": 8.28e-05,
661
+ "loss": 0.249,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 7.241379310344827,
666
+ "grad_norm": 24.935670852661133,
667
+ "learning_rate": 8.38e-05,
668
+ "loss": 0.3202,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 7.327586206896552,
673
+ "grad_norm": 2.497840642929077,
674
+ "learning_rate": 8.48e-05,
675
+ "loss": 0.2803,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 7.413793103448276,
680
+ "grad_norm": 2.698636531829834,
681
+ "learning_rate": 8.58e-05,
682
+ "loss": 0.2473,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 7.5,
687
+ "grad_norm": 1.4561227560043335,
688
+ "learning_rate": 8.680000000000001e-05,
689
+ "loss": 0.3223,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 7.586206896551724,
694
+ "grad_norm": 1.7760556936264038,
695
+ "learning_rate": 8.78e-05,
696
+ "loss": 0.2481,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 7.672413793103448,
701
+ "grad_norm": 2.308103084564209,
702
+ "learning_rate": 8.88e-05,
703
+ "loss": 0.2545,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 7.758620689655173,
708
+ "grad_norm": 1.4128385782241821,
709
+ "learning_rate": 8.98e-05,
710
+ "loss": 0.332,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 7.758620689655173,
715
+ "eval_loss": 0.44030094146728516,
716
+ "eval_runtime": 39.9401,
717
+ "eval_samples_per_second": 33.65,
718
+ "eval_steps_per_second": 33.65,
719
+ "eval_wer": 0.5233542747883092,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 7.844827586206897,
724
+ "grad_norm": 1.7903906106948853,
725
+ "learning_rate": 9.080000000000001e-05,
726
+ "loss": 0.2411,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 7.931034482758621,
731
+ "grad_norm": 2.0804216861724854,
732
+ "learning_rate": 9.180000000000001e-05,
733
+ "loss": 0.2707,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 8.017241379310345,
738
+ "grad_norm": 1.4420605897903442,
739
+ "learning_rate": 9.28e-05,
740
+ "loss": 0.3186,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 8.10344827586207,
745
+ "grad_norm": 2.2910854816436768,
746
+ "learning_rate": 9.38e-05,
747
+ "loss": 0.1937,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 8.189655172413794,
752
+ "grad_norm": 3.5892796516418457,
753
+ "learning_rate": 9.48e-05,
754
+ "loss": 0.2321,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 8.275862068965518,
759
+ "grad_norm": 1.6509956121444702,
760
+ "learning_rate": 9.58e-05,
761
+ "loss": 0.2868,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 8.362068965517242,
766
+ "grad_norm": 1.6983604431152344,
767
+ "learning_rate": 9.680000000000001e-05,
768
+ "loss": 0.2004,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 8.448275862068966,
773
+ "grad_norm": 2.061176061630249,
774
+ "learning_rate": 9.78e-05,
775
+ "loss": 0.2025,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 8.53448275862069,
780
+ "grad_norm": 1.7732270956039429,
781
+ "learning_rate": 9.88e-05,
782
+ "loss": 0.2598,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 8.620689655172415,
787
+ "grad_norm": 1.8335466384887695,
788
+ "learning_rate": 9.98e-05,
789
+ "loss": 0.1876,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 8.620689655172415,
794
+ "eval_loss": 0.4757933020591736,
795
+ "eval_runtime": 39.8291,
796
+ "eval_samples_per_second": 33.744,
797
+ "eval_steps_per_second": 33.744,
798
+ "eval_wer": 0.5221706273331512,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 8.706896551724139,
803
+ "grad_norm": 2.52902889251709,
804
+ "learning_rate": 9.939393939393939e-05,
805
+ "loss": 0.2456,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 8.793103448275861,
810
+ "grad_norm": 1.7294162511825562,
811
+ "learning_rate": 9.863636363636364e-05,
812
+ "loss": 0.2499,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 8.879310344827585,
817
+ "grad_norm": 21.9121150970459,
818
+ "learning_rate": 9.787878787878789e-05,
819
+ "loss": 0.1854,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 8.96551724137931,
824
+ "grad_norm": 3.9164559841156006,
825
+ "learning_rate": 9.712121212121212e-05,
826
+ "loss": 0.2576,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 9.051724137931034,
831
+ "grad_norm": 1.239221215248108,
832
+ "learning_rate": 9.636363636363637e-05,
833
+ "loss": 0.2118,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 9.137931034482758,
838
+ "grad_norm": 3.1416544914245605,
839
+ "learning_rate": 9.560606060606061e-05,
840
+ "loss": 0.1577,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 9.224137931034482,
845
+ "grad_norm": 2.4253621101379395,
846
+ "learning_rate": 9.484848484848486e-05,
847
+ "loss": 0.2092,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 9.310344827586206,
852
+ "grad_norm": 1.194345474243164,
853
+ "learning_rate": 9.40909090909091e-05,
854
+ "loss": 0.1876,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 9.39655172413793,
859
+ "grad_norm": 2.411029100418091,
860
+ "learning_rate": 9.333333333333334e-05,
861
+ "loss": 0.1546,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 9.482758620689655,
866
+ "grad_norm": 3.246082067489624,
867
+ "learning_rate": 9.257575757575758e-05,
868
+ "loss": 0.2232,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 9.482758620689655,
873
+ "eval_loss": 0.45077577233314514,
874
+ "eval_runtime": 39.9221,
875
+ "eval_samples_per_second": 33.666,
876
+ "eval_steps_per_second": 33.666,
877
+ "eval_wer": 0.48921059819721385,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 9.568965517241379,
882
+ "grad_norm": 1.3427454233169556,
883
+ "learning_rate": 9.181818181818183e-05,
884
+ "loss": 0.1777,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 9.655172413793103,
889
+ "grad_norm": 1.5090447664260864,
890
+ "learning_rate": 9.106060606060606e-05,
891
+ "loss": 0.1646,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 9.741379310344827,
896
+ "grad_norm": 1.3060975074768066,
897
+ "learning_rate": 9.030303030303031e-05,
898
+ "loss": 0.225,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 9.827586206896552,
903
+ "grad_norm": 1.3011540174484253,
904
+ "learning_rate": 8.954545454545455e-05,
905
+ "loss": 0.1552,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 9.913793103448276,
910
+ "grad_norm": 1.9938538074493408,
911
+ "learning_rate": 8.87878787878788e-05,
912
+ "loss": 0.1715,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 10.0,
917
+ "grad_norm": 3.334385395050049,
918
+ "learning_rate": 8.803030303030304e-05,
919
+ "loss": 0.2092,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 10.086206896551724,
924
+ "grad_norm": 1.011092185974121,
925
+ "learning_rate": 8.727272727272727e-05,
926
+ "loss": 0.14,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 10.172413793103448,
931
+ "grad_norm": 2.517902135848999,
932
+ "learning_rate": 8.651515151515152e-05,
933
+ "loss": 0.1512,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 10.258620689655173,
938
+ "grad_norm": 1.2418378591537476,
939
+ "learning_rate": 8.575757575757576e-05,
940
+ "loss": 0.1846,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 10.344827586206897,
945
+ "grad_norm": 1.5885329246520996,
946
+ "learning_rate": 8.5e-05,
947
+ "loss": 0.1332,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 10.344827586206897,
952
+ "eval_loss": 0.4394075274467468,
953
+ "eval_runtime": 39.9367,
954
+ "eval_samples_per_second": 33.653,
955
+ "eval_steps_per_second": 33.653,
956
+ "eval_wer": 0.4740052808886461,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 10.431034482758621,
961
+ "grad_norm": 1.2539469003677368,
962
+ "learning_rate": 8.424242424242424e-05,
963
+ "loss": 0.1485,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 10.517241379310345,
968
+ "grad_norm": 1.357601284980774,
969
+ "learning_rate": 8.348484848484849e-05,
970
+ "loss": 0.1988,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 10.60344827586207,
975
+ "grad_norm": 2.0564587116241455,
976
+ "learning_rate": 8.272727272727273e-05,
977
+ "loss": 0.137,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 10.689655172413794,
982
+ "grad_norm": 2.48364520072937,
983
+ "learning_rate": 8.196969696969698e-05,
984
+ "loss": 0.1245,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 10.775862068965518,
989
+ "grad_norm": 1.015891671180725,
990
+ "learning_rate": 8.121212121212121e-05,
991
+ "loss": 0.1602,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 10.862068965517242,
996
+ "grad_norm": 1.1023950576782227,
997
+ "learning_rate": 8.045454545454546e-05,
998
+ "loss": 0.1215,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 10.948275862068966,
1003
+ "grad_norm": 2.703427791595459,
1004
+ "learning_rate": 7.96969696969697e-05,
1005
+ "loss": 0.1621,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 11.03448275862069,
1010
+ "grad_norm": 1.1821691989898682,
1011
+ "learning_rate": 7.893939393939395e-05,
1012
+ "loss": 0.1651,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 11.120689655172415,
1017
+ "grad_norm": 0.930283784866333,
1018
+ "learning_rate": 7.818181818181818e-05,
1019
+ "loss": 0.1066,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 11.206896551724139,
1024
+ "grad_norm": 1.6548758745193481,
1025
+ "learning_rate": 7.742424242424243e-05,
1026
+ "loss": 0.1085,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 11.206896551724139,
1031
+ "eval_loss": 0.4466467499732971,
1032
+ "eval_runtime": 39.8633,
1033
+ "eval_samples_per_second": 33.715,
1034
+ "eval_steps_per_second": 33.715,
1035
+ "eval_wer": 0.46207775653282346,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 11.293103448275861,
1040
+ "grad_norm": 1.1760716438293457,
1041
+ "learning_rate": 7.666666666666667e-05,
1042
+ "loss": 0.1418,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 11.379310344827585,
1047
+ "grad_norm": 2.1062755584716797,
1048
+ "learning_rate": 7.59090909090909e-05,
1049
+ "loss": 0.1133,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 11.46551724137931,
1054
+ "grad_norm": 2.67399001121521,
1055
+ "learning_rate": 7.515151515151515e-05,
1056
+ "loss": 0.1318,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 11.551724137931034,
1061
+ "grad_norm": 1.0049142837524414,
1062
+ "learning_rate": 7.439393939393939e-05,
1063
+ "loss": 0.1474,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 11.637931034482758,
1068
+ "grad_norm": 1.586559772491455,
1069
+ "learning_rate": 7.363636363636364e-05,
1070
+ "loss": 0.0908,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 11.724137931034482,
1075
+ "grad_norm": 3.784040927886963,
1076
+ "learning_rate": 7.287878787878788e-05,
1077
+ "loss": 0.1521,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 11.810344827586206,
1082
+ "grad_norm": 1.125501275062561,
1083
+ "learning_rate": 7.212121212121213e-05,
1084
+ "loss": 0.1163,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 11.89655172413793,
1089
+ "grad_norm": 2.1989808082580566,
1090
+ "learning_rate": 7.136363636363636e-05,
1091
+ "loss": 0.1109,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 11.982758620689655,
1096
+ "grad_norm": 1.1287301778793335,
1097
+ "learning_rate": 7.060606060606061e-05,
1098
+ "loss": 0.152,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 12.068965517241379,
1103
+ "grad_norm": 1.538678765296936,
1104
+ "learning_rate": 6.984848484848485e-05,
1105
+ "loss": 0.098,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 12.068965517241379,
1110
+ "eval_loss": 0.42302384972572327,
1111
+ "eval_runtime": 40.1773,
1112
+ "eval_samples_per_second": 33.452,
1113
+ "eval_steps_per_second": 33.452,
1114
+ "eval_wer": 0.44933078393881454,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 12.155172413793103,
1119
+ "grad_norm": 1.400772213935852,
1120
+ "learning_rate": 6.90909090909091e-05,
1121
+ "loss": 0.092,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 12.241379310344827,
1126
+ "grad_norm": 3.6780846118927,
1127
+ "learning_rate": 6.833333333333333e-05,
1128
+ "loss": 0.1649,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 12.327586206896552,
1133
+ "grad_norm": 1.5424057245254517,
1134
+ "learning_rate": 6.757575757575758e-05,
1135
+ "loss": 0.091,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 12.413793103448276,
1140
+ "grad_norm": 1.4868180751800537,
1141
+ "learning_rate": 6.681818181818183e-05,
1142
+ "loss": 0.0869,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 12.5,
1147
+ "grad_norm": 1.1947145462036133,
1148
+ "learning_rate": 6.606060606060607e-05,
1149
+ "loss": 0.1499,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 12.586206896551724,
1154
+ "grad_norm": 1.0430784225463867,
1155
+ "learning_rate": 6.530303030303032e-05,
1156
+ "loss": 0.0954,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 12.672413793103448,
1161
+ "grad_norm": 2.4261584281921387,
1162
+ "learning_rate": 6.454545454545455e-05,
1163
+ "loss": 0.1032,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 12.758620689655173,
1168
+ "grad_norm": 1.033467411994934,
1169
+ "learning_rate": 6.37878787878788e-05,
1170
+ "loss": 0.1158,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 12.844827586206897,
1175
+ "grad_norm": 1.1535651683807373,
1176
+ "learning_rate": 6.303030303030302e-05,
1177
+ "loss": 0.0864,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 12.931034482758621,
1182
+ "grad_norm": 1.28826105594635,
1183
+ "learning_rate": 6.227272727272727e-05,
1184
+ "loss": 0.1219,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 12.931034482758621,
1189
+ "eval_loss": 0.418023020029068,
1190
+ "eval_runtime": 40.2192,
1191
+ "eval_samples_per_second": 33.417,
1192
+ "eval_steps_per_second": 33.417,
1193
+ "eval_wer": 0.44596194118182647,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 13.017241379310345,
1198
+ "grad_norm": 1.055411458015442,
1199
+ "learning_rate": 6.151515151515151e-05,
1200
+ "loss": 0.1289,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 13.10344827586207,
1205
+ "grad_norm": 1.1269094944000244,
1206
+ "learning_rate": 6.075757575757576e-05,
1207
+ "loss": 0.0776,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 13.189655172413794,
1212
+ "grad_norm": 1.7149118185043335,
1213
+ "learning_rate": 6e-05,
1214
+ "loss": 0.0871,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 13.275862068965518,
1219
+ "grad_norm": 1.7456856966018677,
1220
+ "learning_rate": 5.9242424242424244e-05,
1221
+ "loss": 0.1087,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 13.362068965517242,
1226
+ "grad_norm": 1.3434715270996094,
1227
+ "learning_rate": 5.848484848484849e-05,
1228
+ "loss": 0.0821,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 13.448275862068966,
1233
+ "grad_norm": 2.103512763977051,
1234
+ "learning_rate": 5.772727272727273e-05,
1235
+ "loss": 0.0878,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 13.53448275862069,
1240
+ "grad_norm": 1.240224838256836,
1241
+ "learning_rate": 5.696969696969697e-05,
1242
+ "loss": 0.1044,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 13.620689655172415,
1247
+ "grad_norm": 0.7336703538894653,
1248
+ "learning_rate": 5.6212121212121215e-05,
1249
+ "loss": 0.0753,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 13.706896551724139,
1254
+ "grad_norm": 2.293342351913452,
1255
+ "learning_rate": 5.545454545454546e-05,
1256
+ "loss": 0.1059,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 13.793103448275861,
1261
+ "grad_norm": 1.1853971481323242,
1262
+ "learning_rate": 5.46969696969697e-05,
1263
+ "loss": 0.1021,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 13.793103448275861,
1268
+ "eval_loss": 0.41785839200019836,
1269
+ "eval_runtime": 40.2906,
1270
+ "eval_samples_per_second": 33.358,
1271
+ "eval_steps_per_second": 33.358,
1272
+ "eval_wer": 0.4405900027314941,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 13.879310344827585,
1277
+ "grad_norm": 1.331200361251831,
1278
+ "learning_rate": 5.393939393939394e-05,
1279
+ "loss": 0.0648,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 13.96551724137931,
1284
+ "grad_norm": 2.28397536277771,
1285
+ "learning_rate": 5.3181818181818186e-05,
1286
+ "loss": 0.1121,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 14.051724137931034,
1291
+ "grad_norm": 0.9436893463134766,
1292
+ "learning_rate": 5.242424242424243e-05,
1293
+ "loss": 0.0725,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 14.137931034482758,
1298
+ "grad_norm": 1.6113288402557373,
1299
+ "learning_rate": 5.166666666666667e-05,
1300
+ "loss": 0.0691,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 14.224137931034482,
1305
+ "grad_norm": 2.479888439178467,
1306
+ "learning_rate": 5.090909090909091e-05,
1307
+ "loss": 0.0979,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 14.310344827586206,
1312
+ "grad_norm": 1.006616473197937,
1313
+ "learning_rate": 5.015151515151515e-05,
1314
+ "loss": 0.0909,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 14.39655172413793,
1319
+ "grad_norm": 1.4571704864501953,
1320
+ "learning_rate": 4.93939393939394e-05,
1321
+ "loss": 0.0761,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 14.482758620689655,
1326
+ "grad_norm": 1.5729875564575195,
1327
+ "learning_rate": 4.863636363636364e-05,
1328
+ "loss": 0.0862,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 14.568965517241379,
1333
+ "grad_norm": 1.2180376052856445,
1334
+ "learning_rate": 4.787878787878788e-05,
1335
+ "loss": 0.0646,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 14.655172413793103,
1340
+ "grad_norm": 1.7464072704315186,
1341
+ "learning_rate": 4.712121212121212e-05,
1342
+ "loss": 0.0741,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 14.655172413793103,
1347
+ "eval_loss": 0.4113341271877289,
1348
+ "eval_runtime": 40.2841,
1349
+ "eval_samples_per_second": 33.363,
1350
+ "eval_steps_per_second": 33.363,
1351
+ "eval_wer": 0.4309387234817445,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 14.741379310344827,
1356
+ "grad_norm": 0.8571386337280273,
1357
+ "learning_rate": 4.6439393939393944e-05,
1358
+ "loss": 0.1315,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 14.827586206896552,
1363
+ "grad_norm": 1.331377387046814,
1364
+ "learning_rate": 4.5681818181818186e-05,
1365
+ "loss": 0.0603,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 14.913793103448276,
1370
+ "grad_norm": 1.5398732423782349,
1371
+ "learning_rate": 4.492424242424242e-05,
1372
+ "loss": 0.0796,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 15.0,
1377
+ "grad_norm": 3.689671754837036,
1378
+ "learning_rate": 4.4166666666666665e-05,
1379
+ "loss": 0.085,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 15.086206896551724,
1384
+ "grad_norm": 1.132613182067871,
1385
+ "learning_rate": 4.340909090909091e-05,
1386
+ "loss": 0.0544,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 15.172413793103448,
1391
+ "grad_norm": 1.5951859951019287,
1392
+ "learning_rate": 4.265151515151515e-05,
1393
+ "loss": 0.0601,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 15.258620689655173,
1398
+ "grad_norm": 0.5179944634437561,
1399
+ "learning_rate": 4.189393939393939e-05,
1400
+ "loss": 0.097,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 15.344827586206897,
1405
+ "grad_norm": 0.9744370579719543,
1406
+ "learning_rate": 4.113636363636364e-05,
1407
+ "loss": 0.0596,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 15.431034482758621,
1412
+ "grad_norm": 1.8794275522232056,
1413
+ "learning_rate": 4.0378787878787885e-05,
1414
+ "loss": 0.0677,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 15.517241379310345,
1419
+ "grad_norm": 0.748386025428772,
1420
+ "learning_rate": 3.962121212121213e-05,
1421
+ "loss": 0.0896,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 15.517241379310345,
1426
+ "eval_loss": 0.43920788168907166,
1427
+ "eval_runtime": 40.1997,
1428
+ "eval_samples_per_second": 33.433,
1429
+ "eval_steps_per_second": 33.433,
1430
+ "eval_wer": 0.4307566238732587,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 15.60344827586207,
1435
+ "grad_norm": 0.9639837145805359,
1436
+ "learning_rate": 3.8863636363636364e-05,
1437
+ "loss": 0.0604,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 15.689655172413794,
1442
+ "grad_norm": 1.9640839099884033,
1443
+ "learning_rate": 3.810606060606061e-05,
1444
+ "loss": 0.0711,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 15.775862068965518,
1449
+ "grad_norm": 1.4438735246658325,
1450
+ "learning_rate": 3.734848484848485e-05,
1451
+ "loss": 0.0867,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 15.862068965517242,
1456
+ "grad_norm": 1.0062426328659058,
1457
+ "learning_rate": 3.659090909090909e-05,
1458
+ "loss": 0.0605,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 15.948275862068966,
1463
+ "grad_norm": 1.6331523656845093,
1464
+ "learning_rate": 3.5833333333333335e-05,
1465
+ "loss": 0.0662,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 16.03448275862069,
1470
+ "grad_norm": 0.8070217370986938,
1471
+ "learning_rate": 3.507575757575758e-05,
1472
+ "loss": 0.0765,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 16.120689655172413,
1477
+ "grad_norm": 1.4137670993804932,
1478
+ "learning_rate": 3.431818181818182e-05,
1479
+ "loss": 0.0537,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 16.20689655172414,
1484
+ "grad_norm": 1.5437769889831543,
1485
+ "learning_rate": 3.356060606060606e-05,
1486
+ "loss": 0.0684,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 16.29310344827586,
1491
+ "grad_norm": 0.90281081199646,
1492
+ "learning_rate": 3.2803030303030305e-05,
1493
+ "loss": 0.0744,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 16.379310344827587,
1498
+ "grad_norm": 1.139837622642517,
1499
+ "learning_rate": 3.204545454545455e-05,
1500
+ "loss": 0.0492,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 16.379310344827587,
1505
+ "eval_loss": 0.4201890528202057,
1506
+ "eval_runtime": 40.1502,
1507
+ "eval_samples_per_second": 33.474,
1508
+ "eval_steps_per_second": 33.474,
1509
+ "eval_wer": 0.4313029226987162,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 16.46551724137931,
1514
+ "grad_norm": 1.679457426071167,
1515
+ "learning_rate": 3.128787878787879e-05,
1516
+ "loss": 0.0652,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 16.551724137931036,
1521
+ "grad_norm": 0.6661111116409302,
1522
+ "learning_rate": 3.0530303030303034e-05,
1523
+ "loss": 0.0649,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 16.637931034482758,
1528
+ "grad_norm": 1.1774355173110962,
1529
+ "learning_rate": 2.9772727272727273e-05,
1530
+ "loss": 0.0469,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 16.724137931034484,
1535
+ "grad_norm": 1.783923864364624,
1536
+ "learning_rate": 2.901515151515152e-05,
1537
+ "loss": 0.0752,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 16.810344827586206,
1542
+ "grad_norm": 1.176321268081665,
1543
+ "learning_rate": 2.825757575757576e-05,
1544
+ "loss": 0.0519,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 16.896551724137932,
1549
+ "grad_norm": 1.3150608539581299,
1550
+ "learning_rate": 2.7500000000000004e-05,
1551
+ "loss": 0.0547,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 16.982758620689655,
1556
+ "grad_norm": 0.983769953250885,
1557
+ "learning_rate": 2.674242424242424e-05,
1558
+ "loss": 0.0799,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 17.06896551724138,
1563
+ "grad_norm": 0.996890127658844,
1564
+ "learning_rate": 2.5984848484848483e-05,
1565
+ "loss": 0.0577,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 17.155172413793103,
1570
+ "grad_norm": 2.3034253120422363,
1571
+ "learning_rate": 2.5227272727272726e-05,
1572
+ "loss": 0.0515,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 17.24137931034483,
1577
+ "grad_norm": 3.7528610229492188,
1578
+ "learning_rate": 2.4469696969696972e-05,
1579
+ "loss": 0.0759,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 17.24137931034483,
1584
+ "eval_loss": 0.43480169773101807,
1585
+ "eval_runtime": 40.017,
1586
+ "eval_samples_per_second": 33.586,
1587
+ "eval_steps_per_second": 33.586,
1588
+ "eval_wer": 0.4207411454065374,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 17.32758620689655,
1593
+ "grad_norm": 0.6646668314933777,
1594
+ "learning_rate": 2.3712121212121214e-05,
1595
+ "loss": 0.0419,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 17.413793103448278,
1600
+ "grad_norm": 1.3250740766525269,
1601
+ "learning_rate": 2.2954545454545457e-05,
1602
+ "loss": 0.0595,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 17.5,
1607
+ "grad_norm": 0.8094995021820068,
1608
+ "learning_rate": 2.21969696969697e-05,
1609
+ "loss": 0.0691,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 17.586206896551722,
1614
+ "grad_norm": 0.846946120262146,
1615
+ "learning_rate": 2.143939393939394e-05,
1616
+ "loss": 0.052,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 17.67241379310345,
1621
+ "grad_norm": 1.652417540550232,
1622
+ "learning_rate": 2.0681818181818182e-05,
1623
+ "loss": 0.0565,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 17.75862068965517,
1628
+ "grad_norm": 1.0080279111862183,
1629
+ "learning_rate": 1.9924242424242425e-05,
1630
+ "loss": 0.0745,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 17.844827586206897,
1635
+ "grad_norm": 0.7252691388130188,
1636
+ "learning_rate": 1.9166666666666667e-05,
1637
+ "loss": 0.0513,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 17.93103448275862,
1642
+ "grad_norm": 1.58548903465271,
1643
+ "learning_rate": 1.840909090909091e-05,
1644
+ "loss": 0.055,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 18.017241379310345,
1649
+ "grad_norm": 0.6634634733200073,
1650
+ "learning_rate": 1.7651515151515153e-05,
1651
+ "loss": 0.0658,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 18.103448275862068,
1656
+ "grad_norm": 1.1495524644851685,
1657
+ "learning_rate": 1.6893939393939395e-05,
1658
+ "loss": 0.0406,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 18.103448275862068,
1663
+ "eval_loss": 0.44191813468933105,
1664
+ "eval_runtime": 40.0967,
1665
+ "eval_samples_per_second": 33.519,
1666
+ "eval_steps_per_second": 33.519,
1667
+ "eval_wer": 0.42046799599380863,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 18.189655172413794,
1672
+ "grad_norm": 0.9788354635238647,
1673
+ "learning_rate": 1.6136363636363638e-05,
1674
+ "loss": 0.0381,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 18.275862068965516,
1679
+ "grad_norm": 1.093633770942688,
1680
+ "learning_rate": 1.5378787878787877e-05,
1681
+ "loss": 0.071,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 18.362068965517242,
1686
+ "grad_norm": 0.7164376974105835,
1687
+ "learning_rate": 1.4621212121212122e-05,
1688
+ "loss": 0.0439,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 18.448275862068964,
1693
+ "grad_norm": 0.9887032508850098,
1694
+ "learning_rate": 1.3863636363636364e-05,
1695
+ "loss": 0.0481,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 18.53448275862069,
1700
+ "grad_norm": 0.45052286982536316,
1701
+ "learning_rate": 1.3106060606060607e-05,
1702
+ "loss": 0.0571,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 18.620689655172413,
1707
+ "grad_norm": 1.167181134223938,
1708
+ "learning_rate": 1.234848484848485e-05,
1709
+ "loss": 0.0452,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 18.70689655172414,
1714
+ "grad_norm": 1.378661870956421,
1715
+ "learning_rate": 1.159090909090909e-05,
1716
+ "loss": 0.0643,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 18.79310344827586,
1721
+ "grad_norm": 0.854932963848114,
1722
+ "learning_rate": 1.0833333333333334e-05,
1723
+ "loss": 0.0587,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 18.879310344827587,
1728
+ "grad_norm": 0.8007526397705078,
1729
+ "learning_rate": 1.0075757575757576e-05,
1730
+ "loss": 0.0395,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 18.96551724137931,
1735
+ "grad_norm": 3.317830801010132,
1736
+ "learning_rate": 9.318181818181819e-06,
1737
+ "loss": 0.074,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 18.96551724137931,
1742
+ "eval_loss": 0.43061742186546326,
1743
+ "eval_runtime": 40.0034,
1744
+ "eval_samples_per_second": 33.597,
1745
+ "eval_steps_per_second": 33.597,
1746
+ "eval_wer": 0.420012746972594,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 19.051724137931036,
1751
+ "grad_norm": 0.7710875272750854,
1752
+ "learning_rate": 8.56060606060606e-06,
1753
+ "loss": 0.046,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 19.137931034482758,
1758
+ "grad_norm": 0.5200530886650085,
1759
+ "learning_rate": 7.803030303030304e-06,
1760
+ "loss": 0.0394,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 19.224137931034484,
1765
+ "grad_norm": 1.3544327020645142,
1766
+ "learning_rate": 7.045454545454545e-06,
1767
+ "loss": 0.0582,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 19.310344827586206,
1772
+ "grad_norm": 0.8653574585914612,
1773
+ "learning_rate": 6.287878787878789e-06,
1774
+ "loss": 0.0606,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 19.396551724137932,
1779
+ "grad_norm": 1.5852700471878052,
1780
+ "learning_rate": 5.530303030303031e-06,
1781
+ "loss": 0.0367,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 19.482758620689655,
1786
+ "grad_norm": 2.2167246341705322,
1787
+ "learning_rate": 4.772727272727273e-06,
1788
+ "loss": 0.0782,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 19.56896551724138,
1793
+ "grad_norm": 0.5891330242156982,
1794
+ "learning_rate": 4.015151515151515e-06,
1795
+ "loss": 0.0416,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 19.655172413793103,
1800
+ "grad_norm": 1.1137330532073975,
1801
+ "learning_rate": 3.257575757575758e-06,
1802
+ "loss": 0.0515,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 19.74137931034483,
1807
+ "grad_norm": 0.8132285475730896,
1808
+ "learning_rate": 2.5e-06,
1809
+ "loss": 0.0512,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 19.82758620689655,
1814
+ "grad_norm": 0.7994781136512756,
1815
+ "learning_rate": 1.7424242424242427e-06,
1816
+ "loss": 0.0378,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 19.82758620689655,
1821
+ "eval_loss": 0.4273350238800049,
1822
+ "eval_runtime": 40.0934,
1823
+ "eval_samples_per_second": 33.522,
1824
+ "eval_steps_per_second": 33.522,
1825
+ "eval_wer": 0.41728125284530637,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 19.913793103448278,
1830
+ "grad_norm": 0.9775754809379578,
1831
+ "learning_rate": 9.848484848484847e-07,
1832
+ "loss": 0.0489,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 20.0,
1837
+ "grad_norm": 0.8857516050338745,
1838
+ "learning_rate": 2.2727272727272726e-07,
1839
+ "loss": 0.0554,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 20.0,
1844
+ "step": 2320,
1845
+ "total_flos": 2.1476719263248095e+18,
1846
+ "train_loss": 0.8618391515622879,
1847
+ "train_runtime": 3159.4128,
1848
+ "train_samples_per_second": 23.397,
1849
+ "train_steps_per_second": 0.734
1850
+ }
1851
+ ],
1852
+ "logging_steps": 10,
1853
+ "max_steps": 2320,
1854
+ "num_input_tokens_seen": 0,
1855
+ "num_train_epochs": 20,
1856
+ "save_steps": 400,
1857
+ "stateful_callbacks": {
1858
+ "TrainerControl": {
1859
+ "args": {
1860
+ "should_epoch_stop": false,
1861
+ "should_evaluate": false,
1862
+ "should_log": false,
1863
+ "should_save": true,
1864
+ "should_training_stop": false
1865
+ },
1866
+ "attributes": {}
1867
+ }
1868
+ },
1869
+ "total_flos": 2.1476719263248095e+18,
1870
+ "train_batch_size": 32,
1871
+ "trial_name": null,
1872
+ "trial_params": null
1873
+ }
wav2vec2-base-timit-fine-tuned./training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abed99ebdf15c43d2882e1b9d49f7e81da386dc7c0be97a54f7bddbea730415d
+ size 5112
wav2vec2-base-timit-fine-tuned./vocab.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "[PAD]": 28,
+ "[UNK]": 27,
+ "a": 1,
+ "b": 2,
+ "c": 3,
+ "d": 4,
+ "e": 5,
+ "f": 6,
+ "g": 7,
+ "h": 8,
+ "i": 9,
+ "j": 10,
+ "k": 11,
+ "l": 12,
+ "m": 13,
+ "n": 14,
+ "o": 15,
+ "p": 16,
+ "q": 17,
+ "r": 18,
+ "s": 19,
+ "t": 20,
+ "u": 21,
+ "v": 22,
+ "w": 23,
+ "x": 24,
+ "y": 25,
+ "z": 26,
+ "|": 0
+ }
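This 31-entry character vocabulary (26 letters, `|` as the word delimiter, `[UNK]`, `[PAD]`, plus the `<s>`/`</s>` ids in added_tokens.json) is what `"vocab_size": 31` in config.json refers to. A minimal sketch of loading it into a CTC tokenizer, assuming a local copy of this vocab.json:

```python
# Minimal sketch, assuming vocab.json from this repo has been downloaded
# to the working directory; token settings mirror the files shown above.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

ids = tokenizer("hello world").input_ids  # spaces map to "|" (id 0)
print(ids)
print(tokenizer.decode(ids))  # "hello world"
```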
wav2vec2-base-timit-fine-tuned/README.md ADDED
@@ -0,0 +1,101 @@
+ ---
+ license: apache-2.0
+ base_model: facebook/wav2vec2-base
+ tags:
+ - automatic-speech-recognition
+ - timit_asr
+ - generated_from_trainer
+ datasets:
+ - timit_asr
+ metrics:
+ - wer
+ model-index:
+ - name: wav2vec2-base-timit-fine-tuned
+ results:
+ - task:
+ name: Automatic Speech Recognition
+ type: automatic-speech-recognition
+ dataset:
+ name: TIMIT_ASR - NA
+ type: timit_asr
+ config: clean
+ split: test
+ args: 'Config: na, Training split: train, Eval split: test'
+ metrics:
+ - name: Wer
+ type: wer
+ value: 0.4090867704634435
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # wav2vec2-base-timit-fine-tuned
+
+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4218
+ - Wer: 0.4091
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 32
+ - eval_batch_size: 1
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 1000
+ - num_epochs: 20.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Wer |
+ |:-------------:|:-------:|:----:|:---------------:|:------:|
+ | 3.1612 | 0.8621 | 100 | 3.1181 | 1.0 |
+ | 2.978 | 1.7241 | 200 | 2.9722 | 1.0 |
+ | 2.9185 | 2.5862 | 300 | 2.9098 | 1.0 |
+ | 2.1282 | 3.4483 | 400 | 2.0066 | 1.0247 |
+ | 1.1234 | 4.3103 | 500 | 1.0197 | 0.8393 |
+ | 0.602 | 5.1724 | 600 | 0.6714 | 0.6600 |
+ | 0.5032 | 6.0345 | 700 | 0.5285 | 0.5659 |
+ | 0.3101 | 6.8966 | 800 | 0.4819 | 0.5282 |
+ | 0.3432 | 7.7586 | 900 | 0.4653 | 0.5272 |
+ | 0.1922 | 8.6207 | 1000 | 0.4672 | 0.4918 |
+ | 0.2284 | 9.4828 | 1100 | 0.4834 | 0.4870 |
+ | 0.1372 | 10.3448 | 1200 | 0.4380 | 0.4727 |
+ | 0.1105 | 11.2069 | 1300 | 0.4509 | 0.4594 |
+ | 0.0992 | 12.0690 | 1400 | 0.4196 | 0.4544 |
+ | 0.1226 | 12.9310 | 1500 | 0.4237 | 0.4321 |
+ | 0.1013 | 13.7931 | 1600 | 0.4113 | 0.4298 |
+ | 0.0661 | 14.6552 | 1700 | 0.4038 | 0.4276 |
+ | 0.0901 | 15.5172 | 1800 | 0.4321 | 0.4225 |
+ | 0.053 | 16.3793 | 1900 | 0.4076 | 0.4236 |
+ | 0.0805 | 17.2414 | 2000 | 0.4336 | 0.4156 |
+ | 0.049 | 18.1034 | 2100 | 0.4193 | 0.4114 |
+ | 0.0717 | 18.9655 | 2200 | 0.4139 | 0.4091 |
+ | 0.0389 | 19.8276 | 2300 | 0.4216 | 0.4087 |
+
+
+ ### Framework versions
+
+ - Transformers 4.42.0.dev0
+ - Pytorch 2.3.0a0+git71dd2de
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
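For reference alongside the model card, a minimal greedy-decoding sketch (no language model; the local checkpoint directory is the one added in this commit, and sample.wav is a hypothetical 16 kHz mono recording):

```python
import torch
import soundfile as sf
from transformers import AutoProcessor, Wav2Vec2ForCTC

processor = AutoProcessor.from_pretrained("wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-base-timit-fine-tuned").eval()

speech, sampling_rate = sf.read("sample.wav")  # hypothetical test clip, 16 kHz mono
inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits            # (batch, time, vocab_size=31)

pred_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(pred_ids))        # greedy CTC decode, no LM rescoring
```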
wav2vec2-base-timit-fine-tuned/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "</s>": 30,
+ "<s>": 29
+ }
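The two added tokens sit on top of the 29 base entries in vocab.json; a quick check (assuming the checkpoint directory from this commit):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("wav2vec2-base-timit-fine-tuned")
# <s> and </s> were appended after the base vocab (ids 0-28), so they land at 29 and 30.
print(tok.convert_tokens_to_ids(["<s>", "</s>", "[PAD]"]))  # expected: [29, 30, 28]
```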
wav2vec2-base-timit-fine-tuned/all_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "epoch": 20.0,
+ "eval_loss": 0.42176610231399536,
+ "eval_runtime": 39.428,
+ "eval_samples": 1344,
+ "eval_samples_per_second": 34.087,
+ "eval_steps_per_second": 34.087,
+ "eval_wer": 0.4090867704634435,
+ "total_flos": 2.1476719263248095e+18,
+ "train_loss": 0.8590125822430027,
+ "train_runtime": 3151.1477,
+ "train_samples": 3696,
+ "train_samples_per_second": 23.458,
+ "train_steps_per_second": 0.736
+ }
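The eval_wer value above is a standard word error rate; it can be recomputed with the evaluate library given reference and predicted transcripts (toy strings below, just to show the call):

```python
import evaluate

wer = evaluate.load("wer")
predictions = ["she had your dark suit in greasy wash water"]
references = ["she had your dark suit in greasy wash water all year"]
print(wer.compute(predictions=predictions, references=references))  # 2 deletions / 10 words = 0.2
```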
wav2vec2-base-timit-fine-tuned/config.json ADDED
@@ -0,0 +1,119 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-base",
+ "activation_dropout": 0.0,
+ "adapter_attn_dim": null,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 256,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": false,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": false,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_norm": "group",
+ "feat_proj_dropout": 0.0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "freeze_feat_extract_train": true,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.0,
+ "mask_channel_length": 10,
+ "mask_channel_min_space": 1,
+ "mask_channel_other": 0.0,
+ "mask_channel_prob": 0.0,
+ "mask_channel_selection": "static",
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_min_space": 1,
+ "mask_time_other": 0.0,
+ "mask_time_prob": 0.05,
+ "mask_time_selection": "static",
+ "model_type": "wav2vec2",
+ "no_mask_channel_overlap": false,
+ "no_mask_time_overlap": false,
+ "num_adapter_layers": 3,
+ "num_attention_heads": 12,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 12,
+ "num_negatives": 100,
+ "output_hidden_size": 768,
+ "pad_token_id": 28,
+ "proj_codevector_dim": 256,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.42.0.dev0",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 31,
+ "xvector_output_dim": 512
+ }
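The config above fully describes the architecture (12 transformer layers, hidden size 768, a 31-symbol CTC head). A sketch of instantiating it, assuming the checkpoint directory added in this commit; this builds randomly initialized weights, while from_pretrained on the same directory loads the fine-tuned ones:

```python
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config.from_pretrained("wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC(config)  # random init with the exact shapes defined by config.json
print(round(sum(p.numel() for p in model.parameters()) / 1e6, 1), "M parameters")
```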