tz579 committed
Commit 6e42c7f
1 Parent(s): fde5f53

Training in progress, step 12776

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitignore +12 -0
  2. added_tokens.json +4 -0
  3. config.json +109 -0
  4. demo.4gram.py +22 -0
  5. demo.nolm.py +22 -0
  6. hub/version.txt +1 -0
  7. modules/__init__.py +0 -0
  8. preprocessor_config.json +10 -0
  9. run.ami.log +0 -0
  10. run.ami.sh +39 -0
  11. run.timit.log +0 -0
  12. run.timit.log. +1049 -0
  13. run.timit.sh +30 -0
  14. run_speech_recognition_ctc.py +840 -0
  15. run_speech_recognition_ctc.py. +835 -0
  16. runs/May24_15-21-50_tz579-raptorlake/events.out.tfevents.1716583096.tz579-raptorlake.20455.0 +3 -0
  17. runs/May24_15-39-25_tz579-raptorlake/events.out.tfevents.1716583898.tz579-raptorlake.21170.0 +3 -0
  18. runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 +3 -0
  19. runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 +3 -0
  20. runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 +3 -0
  21. runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 +3 -0
  22. runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 +3 -0
  23. runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 +3 -0
  24. runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 +3 -0
  25. runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 +3 -0
  26. runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 +3 -0
  27. runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 +3 -0
  28. runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 +3 -0
  29. special_tokens_map.json +30 -0
  30. tokenizer_config.json +48 -0
  31. training_args.bin +3 -0
  32. vocab.json +32 -0
  33. wav2vec2-base-timit-fine-tuned./README.md +101 -0
  34. wav2vec2-base-timit-fine-tuned./added_tokens.json +4 -0
  35. wav2vec2-base-timit-fine-tuned./all_results.json +15 -0
  36. wav2vec2-base-timit-fine-tuned./config.json +119 -0
  37. wav2vec2-base-timit-fine-tuned./eval_results.json +9 -0
  38. wav2vec2-base-timit-fine-tuned./preprocessor_config.json +10 -0
  39. wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 +3 -0
  40. wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 +3 -0
  41. wav2vec2-base-timit-fine-tuned./special_tokens_map.json +30 -0
  42. wav2vec2-base-timit-fine-tuned./tokenizer_config.json +48 -0
  43. wav2vec2-base-timit-fine-tuned./train_results.json +9 -0
  44. wav2vec2-base-timit-fine-tuned./trainer_state.json +1873 -0
  45. wav2vec2-base-timit-fine-tuned./training_args.bin +3 -0
  46. wav2vec2-base-timit-fine-tuned./vocab.json +31 -0
  47. wav2vec2-base-timit-fine-tuned/README.md +101 -0
  48. wav2vec2-base-timit-fine-tuned/added_tokens.json +4 -0
  49. wav2vec2-base-timit-fine-tuned/all_results.json +15 -0
  50. wav2vec2-base-timit-fine-tuned/config.json +119 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
+ */*__pycache__*
+ */checkpoint*/
+ */data*/
+ */mdls*/
+ */model*
+ *__pycache__*
+ checkpoint*/
+ data*/
+ mdls*/
+ input*/
+ output*/
+ model*
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "</s>": 31,
+ "<s>": 30
+ }
config.json ADDED
@@ -0,0 +1,109 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-large-lv60",
+ "activation_dropout": 0.0,
+ "adapter_attn_dim": null,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 768,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.0,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 24,
+ "num_negatives": 100,
+ "output_hidden_size": 1024,
+ "pad_token_id": 29,
+ "proj_codevector_dim": 768,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.42.0.dev0",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 32,
+ "xvector_output_dim": 512
+ }
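For orientation, the conv_kernel and conv_stride values above fix the frame rate of the encoder: each convolution shrinks the sequence as out = floor((in - kernel) / stride) + 1, so one second of 16 kHz audio ends up as roughly 49 frames (the strides multiply to 320 samples, about 20 ms per frame). A minimal sketch of that arithmetic (illustrative only, not part of the committed files):

def num_frames(num_samples, kernels=(10, 3, 3, 3, 3, 2, 2), strides=(5, 2, 2, 2, 2, 2, 2)):
    # apply the no-padding strided-conv length formula once per feature-extractor layer
    n = num_samples
    for k, s in zip(kernels, strides):
        n = (n - k) // s + 1
    return n

print(num_frames(16000))  # -> 49 frames for 1 s of 16 kHz audio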
demo.4gram.py ADDED
@@ -0,0 +1,22 @@
+ # import
+ import librosa
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+
+ # load the processor
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # load the audio data (use your own wav file here!)
+ input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)
+
+ # tokenize
+ input_values = processor(input_audio, return_tensors="pt", padding="longest").input_values
+
+ # retrieve logits
+ logits = model(input_values).logits
+
+ # decode using n-gram
+ transcription = processor.batch_decode(logits.detach().numpy()).text
+
+ # print the output
+ print(transcription)
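The same flow can be wrapped into a small helper that resamples the input and runs the forward pass under torch.no_grad(); the function below is a hypothetical sketch (the helper name and structure are not part of the committed demo, and the model/processor ids are simply the ones used above):

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

def transcribe_with_lm(path,
                       processor_id="patrickvonplaten/wav2vec2-base-100h-with-lm",
                       model_id="facebook/wav2vec2-large-960h"):
    # load processor (feature extractor + tokenizer + n-gram decoder) and acoustic model
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(processor_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id).eval()
    audio, _ = librosa.load(path, sr=16000)          # resample to the 16 kHz the model expects
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():                            # inference only, no autograd graph
        logits = model(inputs.input_values).logits
    return processor.batch_decode(logits.numpy()).text[0]

print(transcribe_with_lm("my_wav_file.wav"))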
demo.nolm.py ADDED
@@ -0,0 +1,22 @@
+ # import
+ import librosa, torch
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+
+ # load the tokenizer and model
+ tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # load the audio data (use your own wav file here!)
+ input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)
+
+ # tokenize
+ input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values
+
+ # retrieve logits
+ logits = model(input_values).logits
+
+ # take argmax and decode
+ transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))
+
+ # print the output
+ print(transcription)
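Both demos print raw transcriptions; the training logs further down track eval_wer, and the same word error rate can be spot-checked locally for a single utterance, for example with the jiwer package (an illustrative sketch, not part of the commit):

import jiwer  # pip install jiwer

reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown box jumps over the lazy dog"
print(jiwer.wer(reference, hypothesis))  # 1 substitution / 9 reference words ~= 0.111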
hub/version.txt ADDED
@@ -0,0 +1 @@
+ 1
modules/__init__.py ADDED
File without changes
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "Wav2Vec2Processor",
+ "return_attention_mask": true,
+ "sampling_rate": 16000
+ }
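This preprocessor_config.json is what Wav2Vec2FeatureExtractor reads at load time; a minimal usage sketch, assuming the repository files sit in the current directory (illustrative only):

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")   # parses preprocessor_config.json
waveform = np.zeros(16000, dtype=np.float32)                        # 1 s of silence at 16 kHz
batch = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(batch.input_values.shape)      # (1, 16000): feature_size is 1, values are just normalized samples
print("attention_mask" in batch)     # True, because return_attention_mask is true in this config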
run.ami.log ADDED
The diff for this file is too large to render. See raw diff
 
run.ami.sh ADDED
@@ -0,0 +1,39 @@
+ export HF_TOKEN=`cat /home/huggingface.token`
+ export HF_HOME="/home/Work/common_huggingface"
+
+ ## IMPORTANT: This script was stopped after 1.5 epochs (2400 steps)
+ ## because the training loss was exploding => the best checkpoint (2000 steps)
+ ## was then taken.
+ ## MAKE SURE TO DO HYPER-PARAMETER TUNING TO GET BETTER RESULTS
+ python run_speech_recognition_ctc.py \
+ --token="${HF_TOKEN}" \
+ --dataset_name="edinburghcstr/ami" \
+ --model_name_or_path="facebook/wav2vec2-large-lv60" \
+ --dataset_config_name="ihm" \
+ --train_split_name="train" \
+ --eval_split_name="validation" \
+ --output_dir="./" \
+ --preprocessing_num_workers="16" \
+ --overwrite_output_dir \
+ --num_train_epochs="2" \
+ --per_device_train_batch_size="16" \
+ --per_device_eval_batch_size="16" \
+ --gradient_accumulation_steps="1" \
+ --learning_rate="3e-4" \
+ --warmup_steps="500" \
+ --evaluation_strategy="steps" \
+ --text_column_name="text" \
+ --min_duration_in_seconds="0.25" \
+ --save_steps="400" \
+ --eval_steps="1000" \
+ --logging_steps="1" \
+ --layerdrop="0.0" \
+ --save_total_limit="3" \
+ --freeze_feature_encoder \
+ --gradient_checkpointing \
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \
+ --fp16 \
+ --group_by_length \
+ --push_to_hub \
+ --do_eval \
+ --do_train --do_eval
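With --learning_rate=3e-4 and --warmup_steps=500, the Trainer's default linear schedule ramps the learning rate up over the first 500 steps and then decays it linearly to zero, which is worth keeping in mind when tuning away the loss explosion mentioned in the comment above. A small sketch of that schedule (the total step count below is hypothetical; the real value depends on dataset size and batch size):

def linear_warmup_decay_lr(step, peak_lr=3e-4, warmup_steps=500, total_steps=16000):
    # linear warmup to peak_lr, then linear decay to zero (shape of the Trainer's default "linear" scheduler)
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * max(0.0, (total_steps - step) / (total_steps - warmup_steps))

for step in (100, 500, 2000, 2400):
    print(step, linear_warmup_decay_lr(step))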
run.timit.log ADDED
The diff for this file is too large to render. See raw diff
 
run.timit.log. ADDED
@@ -0,0 +1,1049 @@
1
+ /opt/conda/lib/python3.12/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
2
+ warnings.warn(
3
+ 05/19/2024 22:08:09 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True
4
+ 05/19/2024 22:08:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
5
+ _n_gpu=1,
6
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
7
+ adafactor=False,
8
+ adam_beta1=0.9,
9
+ adam_beta2=0.999,
10
+ adam_epsilon=1e-08,
11
+ auto_find_batch_size=False,
12
+ batch_eval_metrics=False,
13
+ bf16=False,
14
+ bf16_full_eval=False,
15
+ data_seed=None,
16
+ dataloader_drop_last=False,
17
+ dataloader_num_workers=0,
18
+ dataloader_persistent_workers=False,
19
+ dataloader_pin_memory=True,
20
+ dataloader_prefetch_factor=None,
21
+ ddp_backend=None,
22
+ ddp_broadcast_buffers=None,
23
+ ddp_bucket_cap_mb=None,
24
+ ddp_find_unused_parameters=None,
25
+ ddp_timeout=1800,
26
+ debug=[],
27
+ deepspeed=None,
28
+ disable_tqdm=False,
29
+ dispatch_batches=None,
30
+ do_eval=True,
31
+ do_predict=False,
32
+ do_train=True,
33
+ eval_accumulation_steps=None,
34
+ eval_delay=0,
35
+ eval_do_concat_batches=True,
36
+ eval_steps=100,
37
+ eval_strategy=IntervalStrategy.STEPS,
38
+ evaluation_strategy=steps,
39
+ fp16=True,
40
+ fp16_backend=auto,
41
+ fp16_full_eval=False,
42
+ fp16_opt_level=O1,
43
+ fsdp=[],
44
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
45
+ fsdp_min_num_params=0,
46
+ fsdp_transformer_layer_cls_to_wrap=None,
47
+ full_determinism=False,
48
+ gradient_accumulation_steps=1,
49
+ gradient_checkpointing=False,
50
+ gradient_checkpointing_kwargs=None,
51
+ greater_is_better=None,
52
+ group_by_length=True,
53
+ half_precision_backend=auto,
54
+ hub_always_push=False,
55
+ hub_model_id=None,
56
+ hub_private_repo=False,
57
+ hub_strategy=HubStrategy.EVERY_SAVE,
58
+ hub_token=<HUB_TOKEN>,
59
+ ignore_data_skip=False,
60
+ include_inputs_for_metrics=False,
61
+ include_num_input_tokens_seen=False,
62
+ include_tokens_per_second=False,
63
+ jit_mode_eval=False,
64
+ label_names=None,
65
+ label_smoothing_factor=0.0,
66
+ learning_rate=0.0001,
67
+ length_column_name=length,
68
+ load_best_model_at_end=False,
69
+ local_rank=0,
70
+ log_level=passive,
71
+ log_level_replica=warning,
72
+ log_on_each_node=True,
73
+ logging_dir=./wav2vec2-base-timit-fine-tuned/runs/May19_22-08-09_tz579-raptorlake,
74
+ logging_first_step=False,
75
+ logging_nan_inf_filter=True,
76
+ logging_steps=10,
77
+ logging_strategy=IntervalStrategy.STEPS,
78
+ lr_scheduler_kwargs={},
79
+ lr_scheduler_type=SchedulerType.LINEAR,
80
+ max_grad_norm=1.0,
81
+ max_steps=-1,
82
+ metric_for_best_model=None,
83
+ mp_parameters=,
84
+ neftune_noise_alpha=None,
85
+ no_cuda=False,
86
+ num_train_epochs=20.0,
87
+ optim=OptimizerNames.ADAMW_TORCH,
88
+ optim_args=None,
89
+ optim_target_modules=None,
90
+ output_dir=./wav2vec2-base-timit-fine-tuned,
91
+ overwrite_output_dir=True,
92
+ past_index=-1,
93
+ per_device_eval_batch_size=1,
94
+ per_device_train_batch_size=32,
95
+ prediction_loss_only=False,
96
+ push_to_hub=True,
97
+ push_to_hub_model_id=None,
98
+ push_to_hub_organization=None,
99
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
100
+ ray_scope=last,
101
+ remove_unused_columns=True,
102
+ report_to=['tensorboard'],
103
+ restore_callback_states_from_checkpoint=False,
104
+ resume_from_checkpoint=None,
105
+ run_name=./wav2vec2-base-timit-fine-tuned,
106
+ save_on_each_node=False,
107
+ save_only_model=False,
108
+ save_safetensors=True,
109
+ save_steps=400,
110
+ save_strategy=IntervalStrategy.STEPS,
111
+ save_total_limit=3,
112
+ seed=42,
113
+ skip_memory_metrics=True,
114
+ split_batches=None,
115
+ tf32=None,
116
+ torch_compile=False,
117
+ torch_compile_backend=None,
118
+ torch_compile_mode=None,
119
+ torchdynamo=None,
120
+ tpu_metrics_debug=False,
121
+ tpu_num_cores=None,
122
+ use_cpu=False,
123
+ use_ipex=False,
124
+ use_legacy_prediction_loop=False,
125
+ use_mps_device=False,
126
+ warmup_ratio=0.0,
127
+ warmup_steps=1000,
128
+ weight_decay=0.005,
129
+ )
130
+ /opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for timit_asr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/timit_asr
131
+ You can avoid this message in future by passing the argument `trust_remote_code=True`.
132
+ Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
133
+ warnings.warn(
134
+ /opt/conda/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
135
+ warnings.warn(
136
+ loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
137
+ /opt/conda/lib/python3.12/site-packages/transformers/configuration_utils.py:364: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.
138
+ warnings.warn(
139
+ Model config Wav2Vec2Config {
140
+ "_name_or_path": "facebook/wav2vec2-base",
141
+ "activation_dropout": 0.0,
142
+ "adapter_attn_dim": null,
143
+ "adapter_kernel_size": 3,
144
+ "adapter_stride": 2,
145
+ "add_adapter": false,
146
+ "apply_spec_augment": true,
147
+ "architectures": [
148
+ "Wav2Vec2ForPreTraining"
149
+ ],
150
+ "attention_dropout": 0.1,
151
+ "bos_token_id": 1,
152
+ "classifier_proj_size": 256,
153
+ "codevector_dim": 256,
154
+ "contrastive_logits_temperature": 0.1,
155
+ "conv_bias": false,
156
+ "conv_dim": [
157
+ 512,
158
+ 512,
159
+ 512,
160
+ 512,
161
+ 512,
162
+ 512,
163
+ 512
164
+ ],
165
+ "conv_kernel": [
166
+ 10,
167
+ 3,
168
+ 3,
169
+ 3,
170
+ 3,
171
+ 2,
172
+ 2
173
+ ],
174
+ "conv_stride": [
175
+ 5,
176
+ 2,
177
+ 2,
178
+ 2,
179
+ 2,
180
+ 2,
181
+ 2
182
+ ],
183
+ "ctc_loss_reduction": "sum",
184
+ "ctc_zero_infinity": false,
185
+ "diversity_loss_weight": 0.1,
186
+ "do_stable_layer_norm": false,
187
+ "eos_token_id": 2,
188
+ "feat_extract_activation": "gelu",
189
+ "feat_extract_norm": "group",
190
+ "feat_proj_dropout": 0.1,
191
+ "feat_quantizer_dropout": 0.0,
192
+ "final_dropout": 0.0,
193
+ "freeze_feat_extract_train": true,
194
+ "gradient_checkpointing": true,
195
+ "hidden_act": "gelu",
196
+ "hidden_dropout": 0.1,
197
+ "hidden_size": 768,
198
+ "initializer_range": 0.02,
199
+ "intermediate_size": 3072,
200
+ "layer_norm_eps": 1e-05,
201
+ "layerdrop": 0.0,
202
+ "mask_channel_length": 10,
203
+ "mask_channel_min_space": 1,
204
+ "mask_channel_other": 0.0,
205
+ "mask_channel_prob": 0.0,
206
+ "mask_channel_selection": "static",
207
+ "mask_feature_length": 10,
208
+ "mask_feature_min_masks": 0,
209
+ "mask_feature_prob": 0.0,
210
+ "mask_time_length": 10,
211
+ "mask_time_min_masks": 2,
212
+ "mask_time_min_space": 1,
213
+ "mask_time_other": 0.0,
214
+ "mask_time_prob": 0.05,
215
+ "mask_time_selection": "static",
216
+ "model_type": "wav2vec2",
217
+ "no_mask_channel_overlap": false,
218
+ "no_mask_time_overlap": false,
219
+ "num_adapter_layers": 3,
220
+ "num_attention_heads": 12,
221
+ "num_codevector_groups": 2,
222
+ "num_codevectors_per_group": 320,
223
+ "num_conv_pos_embedding_groups": 16,
224
+ "num_conv_pos_embeddings": 128,
225
+ "num_feat_extract_layers": 7,
226
+ "num_hidden_layers": 12,
227
+ "num_negatives": 100,
228
+ "output_hidden_size": 768,
229
+ "pad_token_id": 0,
230
+ "proj_codevector_dim": 256,
231
+ "tdnn_dilation": [
232
+ 1,
233
+ 2,
234
+ 3,
235
+ 1,
236
+ 1
237
+ ],
238
+ "tdnn_dim": [
239
+ 512,
240
+ 512,
241
+ 512,
242
+ 512,
243
+ 1500
244
+ ],
245
+ "tdnn_kernel": [
246
+ 5,
247
+ 3,
248
+ 3,
249
+ 1,
250
+ 1
251
+ ],
252
+ "transformers_version": "4.42.0.dev0",
253
+ "use_weighted_layer_sum": false,
254
+ "vocab_size": 32,
255
+ "xvector_output_dim": 512
256
+ }
257
+
258
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████| 3696/3696 [00:00<00:00, 258999.36 examples/s]
259
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:00<00:00, 582229.35 examples/s]
260
+ `use_fast` is set to `True` but the tokenizer class does not have a fast version. Falling back to the slow version.
261
+ loading file vocab.json
262
+ loading file tokenizer_config.json
263
+ loading file added_tokens.json
264
+ loading file special_tokens_map.json
265
+ loading file tokenizer.json
266
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
267
+ loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/preprocessor_config.json
268
+ loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
269
+ Model config Wav2Vec2Config {
270
+ "_name_or_path": "facebook/wav2vec2-base",
271
+ "activation_dropout": 0.0,
272
+ "adapter_attn_dim": null,
273
+ "adapter_kernel_size": 3,
274
+ "adapter_stride": 2,
275
+ "add_adapter": false,
276
+ "apply_spec_augment": true,
277
+ "architectures": [
278
+ "Wav2Vec2ForPreTraining"
279
+ ],
280
+ "attention_dropout": 0.1,
281
+ "bos_token_id": 1,
282
+ "classifier_proj_size": 256,
283
+ "codevector_dim": 256,
284
+ "contrastive_logits_temperature": 0.1,
285
+ "conv_bias": false,
286
+ "conv_dim": [
287
+ 512,
288
+ 512,
289
+ 512,
290
+ 512,
291
+ 512,
292
+ 512,
293
+ 512
294
+ ],
295
+ "conv_kernel": [
296
+ 10,
297
+ 3,
298
+ 3,
299
+ 3,
300
+ 3,
301
+ 2,
302
+ 2
303
+ ],
304
+ "conv_stride": [
305
+ 5,
306
+ 2,
307
+ 2,
308
+ 2,
309
+ 2,
310
+ 2,
311
+ 2
312
+ ],
313
+ "ctc_loss_reduction": "sum",
314
+ "ctc_zero_infinity": false,
315
+ "diversity_loss_weight": 0.1,
316
+ "do_stable_layer_norm": false,
317
+ "eos_token_id": 2,
318
+ "feat_extract_activation": "gelu",
319
+ "feat_extract_norm": "group",
320
+ "feat_proj_dropout": 0.1,
321
+ "feat_quantizer_dropout": 0.0,
322
+ "final_dropout": 0.0,
323
+ "freeze_feat_extract_train": true,
324
+ "gradient_checkpointing": true,
325
+ "hidden_act": "gelu",
326
+ "hidden_dropout": 0.1,
327
+ "hidden_size": 768,
328
+ "initializer_range": 0.02,
329
+ "intermediate_size": 3072,
330
+ "layer_norm_eps": 1e-05,
331
+ "layerdrop": 0.0,
332
+ "mask_channel_length": 10,
333
+ "mask_channel_min_space": 1,
334
+ "mask_channel_other": 0.0,
335
+ "mask_channel_prob": 0.0,
336
+ "mask_channel_selection": "static",
337
+ "mask_feature_length": 10,
338
+ "mask_feature_min_masks": 0,
339
+ "mask_feature_prob": 0.0,
340
+ "mask_time_length": 10,
341
+ "mask_time_min_masks": 2,
342
+ "mask_time_min_space": 1,
343
+ "mask_time_other": 0.0,
344
+ "mask_time_prob": 0.05,
345
+ "mask_time_selection": "static",
346
+ "model_type": "wav2vec2",
347
+ "no_mask_channel_overlap": false,
348
+ "no_mask_time_overlap": false,
349
+ "num_adapter_layers": 3,
350
+ "num_attention_heads": 12,
351
+ "num_codevector_groups": 2,
352
+ "num_codevectors_per_group": 320,
353
+ "num_conv_pos_embedding_groups": 16,
354
+ "num_conv_pos_embeddings": 128,
355
+ "num_feat_extract_layers": 7,
356
+ "num_hidden_layers": 12,
357
+ "num_negatives": 100,
358
+ "output_hidden_size": 768,
359
+ "pad_token_id": 0,
360
+ "proj_codevector_dim": 256,
361
+ "tdnn_dilation": [
362
+ 1,
363
+ 2,
364
+ 3,
365
+ 1,
366
+ 1
367
+ ],
368
+ "tdnn_dim": [
369
+ 512,
370
+ 512,
371
+ 512,
372
+ 512,
373
+ 1500
374
+ ],
375
+ "tdnn_kernel": [
376
+ 5,
377
+ 3,
378
+ 3,
379
+ 1,
380
+ 1
381
+ ],
382
+ "transformers_version": "4.42.0.dev0",
383
+ "use_weighted_layer_sum": false,
384
+ "vocab_size": 32,
385
+ "xvector_output_dim": 512
386
+ }
387
+
388
+ Feature extractor Wav2Vec2FeatureExtractor {
389
+ "do_normalize": true,
390
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
391
+ "feature_size": 1,
392
+ "padding_side": "right",
393
+ "padding_value": 0.0,
394
+ "return_attention_mask": false,
395
+ "sampling_rate": 16000
396
+ }
397
+
398
+ loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/pytorch_model.bin
399
+ Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
400
+ - This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
401
+ - This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
402
+ Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
403
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
404
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
405
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
406
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
407
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
408
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
409
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
410
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
411
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/config.json
412
+ Model config Wav2Vec2Config {
413
+ "_name_or_path": "./wav2vec2-base-timit-fine-tuned",
414
+ "activation_dropout": 0.0,
415
+ "adapter_attn_dim": null,
416
+ "adapter_kernel_size": 3,
417
+ "adapter_stride": 2,
418
+ "add_adapter": false,
419
+ "apply_spec_augment": true,
420
+ "architectures": [
421
+ "Wav2Vec2ForPreTraining"
422
+ ],
423
+ "attention_dropout": 0.0,
424
+ "bos_token_id": 1,
425
+ "classifier_proj_size": 256,
426
+ "codevector_dim": 256,
427
+ "contrastive_logits_temperature": 0.1,
428
+ "conv_bias": false,
429
+ "conv_dim": [
430
+ 512,
431
+ 512,
432
+ 512,
433
+ 512,
434
+ 512,
435
+ 512,
436
+ 512
437
+ ],
438
+ "conv_kernel": [
439
+ 10,
440
+ 3,
441
+ 3,
442
+ 3,
443
+ 3,
444
+ 2,
445
+ 2
446
+ ],
447
+ "conv_stride": [
448
+ 5,
449
+ 2,
450
+ 2,
451
+ 2,
452
+ 2,
453
+ 2,
454
+ 2
455
+ ],
456
+ "ctc_loss_reduction": "mean",
457
+ "ctc_zero_infinity": false,
458
+ "diversity_loss_weight": 0.1,
459
+ "do_stable_layer_norm": false,
460
+ "eos_token_id": 2,
461
+ "feat_extract_activation": "gelu",
462
+ "feat_extract_norm": "group",
463
+ "feat_proj_dropout": 0.0,
464
+ "feat_quantizer_dropout": 0.0,
465
+ "final_dropout": 0.0,
466
+ "freeze_feat_extract_train": true,
467
+ "gradient_checkpointing": false,
468
+ "hidden_act": "gelu",
469
+ "hidden_dropout": 0.0,
470
+ "hidden_size": 768,
471
+ "initializer_range": 0.02,
472
+ "intermediate_size": 3072,
473
+ "layer_norm_eps": 1e-05,
474
+ "layerdrop": 0.0,
475
+ "mask_channel_length": 10,
476
+ "mask_channel_min_space": 1,
477
+ "mask_channel_other": 0.0,
478
+ "mask_channel_prob": 0.0,
479
+ "mask_channel_selection": "static",
480
+ "mask_feature_length": 10,
481
+ "mask_feature_min_masks": 0,
482
+ "mask_feature_prob": 0.0,
483
+ "mask_time_length": 10,
484
+ "mask_time_min_masks": 2,
485
+ "mask_time_min_space": 1,
486
+ "mask_time_other": 0.0,
487
+ "mask_time_prob": 0.05,
488
+ "mask_time_selection": "static",
489
+ "model_type": "wav2vec2",
490
+ "no_mask_channel_overlap": false,
491
+ "no_mask_time_overlap": false,
492
+ "num_adapter_layers": 3,
493
+ "num_attention_heads": 12,
494
+ "num_codevector_groups": 2,
495
+ "num_codevectors_per_group": 320,
496
+ "num_conv_pos_embedding_groups": 16,
497
+ "num_conv_pos_embeddings": 128,
498
+ "num_feat_extract_layers": 7,
499
+ "num_hidden_layers": 12,
500
+ "num_negatives": 100,
501
+ "output_hidden_size": 768,
502
+ "pad_token_id": 28,
503
+ "proj_codevector_dim": 256,
504
+ "tdnn_dilation": [
505
+ 1,
506
+ 2,
507
+ 3,
508
+ 1,
509
+ 1
510
+ ],
511
+ "tdnn_dim": [
512
+ 512,
513
+ 512,
514
+ 512,
515
+ 512,
516
+ 1500
517
+ ],
518
+ "tdnn_kernel": [
519
+ 5,
520
+ 3,
521
+ 3,
522
+ 1,
523
+ 1
524
+ ],
525
+ "transformers_version": "4.42.0.dev0",
526
+ "use_weighted_layer_sum": false,
527
+ "vocab_size": 31,
528
+ "xvector_output_dim": 512
529
+ }
530
+
531
+ loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
532
+ Feature extractor Wav2Vec2FeatureExtractor {
533
+ "do_normalize": true,
534
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
535
+ "feature_size": 1,
536
+ "padding_side": "right",
537
+ "padding_value": 0.0,
538
+ "return_attention_mask": false,
539
+ "sampling_rate": 16000
540
+ }
541
+
542
+ loading file vocab.json
543
+ loading file tokenizer_config.json
544
+ loading file added_tokens.json
545
+ loading file special_tokens_map.json
546
+ loading file tokenizer.json
547
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
548
+ Processor Wav2Vec2Processor:
549
+ - feature_extractor: Wav2Vec2FeatureExtractor {
550
+ "do_normalize": true,
551
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
552
+ "feature_size": 1,
553
+ "padding_side": "right",
554
+ "padding_value": 0.0,
555
+ "return_attention_mask": false,
556
+ "sampling_rate": 16000
557
+ }
558
+
559
+ - tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./wav2vec2-base-timit-fine-tuned', vocab_size=29, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
560
+ 27: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
561
+ 28: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
562
+ 29: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
563
+ 30: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
564
+ }
565
+
566
+ {
567
+ "processor_class": "Wav2Vec2Processor"
568
+ }
569
+
570
+ Using auto half precision backend
571
+ The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
572
+ ***** Running training *****
573
+ Num examples = 3,696
574
+ Num Epochs = 20
575
+ Instantaneous batch size per device = 32
576
+ Total train batch size (w. parallel, distributed & accumulation) = 32
577
+ Gradient Accumulation steps = 1
578
+ Total optimization steps = 2,320
579
+ Number of trainable parameters = 90,195,103
580
+ 0%|▎ | 7/2320 [00:10<48:36, 1.26s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
581
+ return F.conv1d(input, weight, bias, self.stride,
582
+ {'loss': 9.1142, 'grad_norm': 9.595185279846191, 'learning_rate': 9e-07, 'epoch': 0.09}
583
+ {'loss': 8.3446, 'grad_norm': 9.732986450195312, 'learning_rate': 1.9e-06, 'epoch': 0.17}
584
+ {'loss': 8.6592, 'grad_norm': 14.272214889526367, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.26}
585
+ {'loss': 7.6985, 'grad_norm': 15.0160493850708, 'learning_rate': 3.8e-06, 'epoch': 0.34}
586
+ {'loss': 6.9688, 'grad_norm': 16.610979080200195, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.43}
587
+ {'loss': 6.232, 'grad_norm': 17.26924705505371, 'learning_rate': 5.8e-06, 'epoch': 0.52}
588
+ {'loss': 4.7271, 'grad_norm': 11.347734451293945, 'learning_rate': 6.800000000000001e-06, 'epoch': 0.6}
589
+ {'loss': 3.7919, 'grad_norm': 4.237112045288086, 'learning_rate': 7.8e-06, 'epoch': 0.69}
590
+ {'loss': 3.3967, 'grad_norm': 1.8833028078079224, 'learning_rate': 8.8e-06, 'epoch': 0.78}
591
+ {'loss': 3.1618, 'grad_norm': 1.3788093328475952, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.86}
592
+ 4%|████▏ | 100/2320 [01:39<33:07, 1.12it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
593
+ ***** Running Evaluation *****
594
+ Num examples = 1344
595
+ Batch size = 1
596
+ {'eval_loss': 3.1117007732391357, 'eval_wer': 1.0, 'eval_runtime': 40.0512, 'eval_samples_per_second': 33.557, 'eval_steps_per_second': 33.557, 'epoch': 0.86}
597
+ {'loss': 3.0865, 'grad_norm': 1.729278802871704, 'learning_rate': 1.08e-05, 'epoch': 0.95}
598
+ {'loss': 3.0809, 'grad_norm': 1.905969500541687, 'learning_rate': 1.18e-05, 'epoch': 1.03}
599
+ {'loss': 3.0346, 'grad_norm': 0.8360918760299683, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.12}
600
+ {'loss': 3.0106, 'grad_norm': 0.7653716206550598, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.21}
601
+ {'loss': 3.0165, 'grad_norm': 0.94779372215271, 'learning_rate': 1.48e-05, 'epoch': 1.29}
602
+ {'loss': 3.0, 'grad_norm': 0.8457741737365723, 'learning_rate': 1.58e-05, 'epoch': 1.38}
603
+ {'loss': 2.9903, 'grad_norm': 1.4369837045669556, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.47}
604
+ {'loss': 2.9852, 'grad_norm': 1.8290436267852783, 'learning_rate': 1.78e-05, 'epoch': 1.55}
605
+ {'loss': 2.99, 'grad_norm': 1.1530190706253052, 'learning_rate': 1.88e-05, 'epoch': 1.64}
606
+ {'loss': 2.9798, 'grad_norm': 1.1261711120605469, 'learning_rate': 1.9800000000000004e-05, 'epoch': 1.72}
607
+ 9%|████████▎ | 200/2320 [03:52<24:28, 1.44it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
608
+ ***** Running Evaluation *****
609
+ Num examples = 1344
610
+ Batch size = 1
611
+ {'eval_loss': 2.9736363887786865, 'eval_wer': 1.0, 'eval_runtime': 39.6236, 'eval_samples_per_second': 33.919, 'eval_steps_per_second': 33.919, 'epoch': 1.72}
612
+ {'loss': 2.9718, 'grad_norm': 0.903380811214447, 'learning_rate': 2.08e-05, 'epoch': 1.81}
613
+ {'loss': 2.9766, 'grad_norm': 0.4889620244503021, 'learning_rate': 2.18e-05, 'epoch': 1.9}
614
+ {'loss': 2.9658, 'grad_norm': 1.3861790895462036, 'learning_rate': 2.2800000000000002e-05, 'epoch': 1.98}
615
+ {'loss': 2.9588, 'grad_norm': 0.7976490259170532, 'learning_rate': 2.38e-05, 'epoch': 2.07}
616
+ {'loss': 2.9523, 'grad_norm': 0.698798418045044, 'learning_rate': 2.48e-05, 'epoch': 2.16}
617
+ {'loss': 2.9496, 'grad_norm': 1.0858148336410522, 'learning_rate': 2.58e-05, 'epoch': 2.24}
618
+ {'loss': 2.9421, 'grad_norm': 0.5658290386199951, 'learning_rate': 2.6800000000000004e-05, 'epoch': 2.33}
619
+ {'loss': 2.9427, 'grad_norm': 0.5713534355163574, 'learning_rate': 2.7800000000000005e-05, 'epoch': 2.41}
620
+ {'loss': 2.9228, 'grad_norm': 0.7386118769645691, 'learning_rate': 2.88e-05, 'epoch': 2.5}
621
+ {'loss': 2.9144, 'grad_norm': 0.767816960811615, 'learning_rate': 2.98e-05, 'epoch': 2.59}
622
+ 13%|████████████▍ | 300/2320 [06:10<33:46, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
623
+ ***** Running Evaluation *****
624
+ Num examples = 1344
625
+ Batch size = 1
626
+ {'eval_loss': 2.9074809551239014, 'eval_wer': 1.0, 'eval_runtime': 39.8997, 'eval_samples_per_second': 33.684, 'eval_steps_per_second': 33.684, 'epoch': 2.59}
627
+ {'loss': 2.8965, 'grad_norm': 0.8676608204841614, 'learning_rate': 3.08e-05, 'epoch': 2.67}
628
+ {'loss': 2.8815, 'grad_norm': 1.6954621076583862, 'learning_rate': 3.18e-05, 'epoch': 2.76}
629
+ {'loss': 2.855, 'grad_norm': 1.1631884574890137, 'learning_rate': 3.2800000000000004e-05, 'epoch': 2.84}
630
+ {'loss': 2.781, 'grad_norm': 1.625454306602478, 'learning_rate': 3.38e-05, 'epoch': 2.93}
631
+ {'loss': 2.7756, 'grad_norm': 2.0763564109802246, 'learning_rate': 3.48e-05, 'epoch': 3.02}
632
+ {'loss': 2.6458, 'grad_norm': 2.036031723022461, 'learning_rate': 3.58e-05, 'epoch': 3.1}
633
+ {'loss': 2.5189, 'grad_norm': 1.366801142692566, 'learning_rate': 3.68e-05, 'epoch': 3.19}
634
+ {'loss': 2.433, 'grad_norm': 2.034527540206909, 'learning_rate': 3.7800000000000004e-05, 'epoch': 3.28}
635
+ {'loss': 2.2885, 'grad_norm': 3.8338165283203125, 'learning_rate': 3.88e-05, 'epoch': 3.36}
636
+ {'loss': 2.1714, 'grad_norm': 2.3443217277526855, 'learning_rate': 3.9800000000000005e-05, 'epoch': 3.45}
637
+ 17%|████████████████▌ | 400/2320 [08:24<23:08, 1.38it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
638
+ ***** Running Evaluation *****
639
+ Num examples = 1344
640
+ Batch size = 1
641
+ {'eval_loss': 2.0944502353668213, 'eval_wer': 1.0325047801147227, 'eval_runtime': 39.7668, 'eval_samples_per_second': 33.797, 'eval_steps_per_second': 33.797, 'epoch': 3.45}
642
+ 17%|████████████████▌ | 400/2320 [09:04<23:08, 1.38it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-400
643
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/config.json
644
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/model.safetensors
645
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/preprocessor_config.json
646
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/tokenizer_config.json
647
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/special_tokens_map.json
648
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/added_tokens.json
649
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
650
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
651
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
652
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
653
+ 17%|████████████████▏ | 401/2320 [09:06<6:52:25, 12.90s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
654
+ return F.conv1d(input, weight, bias, self.stride,
655
+ {'loss': 2.0881, 'grad_norm': 4.349735260009766, 'learning_rate': 4.08e-05, 'epoch': 3.53}
656
+ {'loss': 1.9522, 'grad_norm': 2.450747489929199, 'learning_rate': 4.18e-05, 'epoch': 3.62}
657
+ {'loss': 1.8395, 'grad_norm': 2.2519729137420654, 'learning_rate': 4.2800000000000004e-05, 'epoch': 3.71}
658
+ {'loss': 1.7525, 'grad_norm': 2.693664789199829, 'learning_rate': 4.38e-05, 'epoch': 3.79}
659
+ {'loss': 1.6222, 'grad_norm': 1.9744929075241089, 'learning_rate': 4.4800000000000005e-05, 'epoch': 3.88}
660
+ {'loss': 1.5397, 'grad_norm': 3.802494764328003, 'learning_rate': 4.58e-05, 'epoch': 3.97}
661
+ {'loss': 1.4376, 'grad_norm': 2.301044225692749, 'learning_rate': 4.6800000000000006e-05, 'epoch': 4.05}
662
+ {'loss': 1.2829, 'grad_norm': 2.279372215270996, 'learning_rate': 4.78e-05, 'epoch': 4.14}
663
+ {'loss': 1.1976, 'grad_norm': 3.314736843109131, 'learning_rate': 4.88e-05, 'epoch': 4.22}
664
+ {'loss': 1.1579, 'grad_norm': 2.434694290161133, 'learning_rate': 4.9800000000000004e-05, 'epoch': 4.31}
665
+ 22%|████████████████████▋ | 500/2320 [10:43<34:53, 1.15s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
666
+ ***** Running Evaluation *****
667
+ Num examples = 1344
668
+ Batch size = 1
669
+ {'eval_loss': 1.045101284980774, 'eval_wer': 0.8299189656742239, 'eval_runtime': 39.7455, 'eval_samples_per_second': 33.815, 'eval_steps_per_second': 33.815, 'epoch': 4.31}
670
+ {'loss': 1.0684, 'grad_norm': 1.8384031057357788, 'learning_rate': 5.08e-05, 'epoch': 4.4}
671
+ {'loss': 1.0319, 'grad_norm': 3.599148988723755, 'learning_rate': 5.1800000000000005e-05, 'epoch': 4.48}
672
+ {'loss': 0.9179, 'grad_norm': 2.066476583480835, 'learning_rate': 5.28e-05, 'epoch': 4.57}
673
+ {'loss': 0.8838, 'grad_norm': 2.2173750400543213, 'learning_rate': 5.380000000000001e-05, 'epoch': 4.66}
674
+ {'loss': 0.8991, 'grad_norm': 2.427091121673584, 'learning_rate': 5.4800000000000004e-05, 'epoch': 4.74}
675
+ {'loss': 0.8, 'grad_norm': 2.7432241439819336, 'learning_rate': 5.580000000000001e-05, 'epoch': 4.83}
676
+ {'loss': 0.7803, 'grad_norm': 3.254221200942993, 'learning_rate': 5.68e-05, 'epoch': 4.91}
677
+ {'loss': 0.8205, 'grad_norm': 4.457448482513428, 'learning_rate': 5.7799999999999995e-05, 'epoch': 5.0}
678
+ {'loss': 0.6703, 'grad_norm': 3.1023166179656982, 'learning_rate': 5.88e-05, 'epoch': 5.09}
679
+ {'loss': 0.6087, 'grad_norm': 2.5916504859924316, 'learning_rate': 5.9800000000000003e-05, 'epoch': 5.17}
680
+ 26%|████████████████████████▊ | 600/2320 [12:58<23:53, 1.20it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
681
+ ***** Running Evaluation *****
682
+ Num examples = 1344
683
+ Batch size = 1
684
+ {'eval_loss': 0.6753795146942139, 'eval_wer': 0.6440863152144223, 'eval_runtime': 39.7485, 'eval_samples_per_second': 33.813, 'eval_steps_per_second': 33.813, 'epoch': 5.17}
685
+ {'loss': 0.6569, 'grad_norm': 2.1707613468170166, 'learning_rate': 6.08e-05, 'epoch': 5.26}
686
+ {'loss': 0.5627, 'grad_norm': 2.4291555881500244, 'learning_rate': 6.18e-05, 'epoch': 5.34}
687
+ {'loss': 0.5381, 'grad_norm': 2.249617338180542, 'learning_rate': 6.280000000000001e-05, 'epoch': 5.43}
688
+ {'loss': 0.6338, 'grad_norm': 1.6661946773529053, 'learning_rate': 6.38e-05, 'epoch': 5.52}
689
+ {'loss': 0.5181, 'grad_norm': 2.60294771194458, 'learning_rate': 6.48e-05, 'epoch': 5.6}
690
+ {'loss': 0.5189, 'grad_norm': 3.3003089427948, 'learning_rate': 6.58e-05, 'epoch': 5.69}
691
+ {'loss': 0.564, 'grad_norm': 1.880764126777649, 'learning_rate': 6.680000000000001e-05, 'epoch': 5.78}
692
+ {'loss': 0.4729, 'grad_norm': 2.0575127601623535, 'learning_rate': 6.780000000000001e-05, 'epoch': 5.86}
693
+ {'loss': 0.4899, 'grad_norm': 2.5159761905670166, 'learning_rate': 6.879999999999999e-05, 'epoch': 5.95}
694
+ {'loss': 0.481, 'grad_norm': 1.4463504552841187, 'learning_rate': 6.98e-05, 'epoch': 6.03}
695
+ 30%|████████████████████████████▉ | 700/2320 [15:14<36:18, 1.34s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
696
+ ***** Running Evaluation *****
697
+ Num examples = 1344
698
+ Batch size = 1
699
+ {'eval_loss': 0.5275412201881409, 'eval_wer': 0.5760721114449604, 'eval_runtime': 39.9601, 'eval_samples_per_second': 33.634, 'eval_steps_per_second': 33.634, 'epoch': 6.03}
700
+ {'loss': 0.3865, 'grad_norm': 1.788765549659729, 'learning_rate': 7.08e-05, 'epoch': 6.12}
701
+ {'loss': 0.3726, 'grad_norm': 1.862762212753296, 'learning_rate': 7.18e-05, 'epoch': 6.21}
702
+ {'loss': 0.4116, 'grad_norm': 1.6512093544006348, 'learning_rate': 7.280000000000001e-05, 'epoch': 6.29}
703
+ {'loss': 0.3779, 'grad_norm': 2.098067045211792, 'learning_rate': 7.38e-05, 'epoch': 6.38}
704
+ {'loss': 0.3728, 'grad_norm': 3.3030078411102295, 'learning_rate': 7.48e-05, 'epoch': 6.47}
705
+ {'loss': 0.4047, 'grad_norm': 2.1799120903015137, 'learning_rate': 7.58e-05, 'epoch': 6.55}
706
+ {'loss': 0.313, 'grad_norm': 1.862434983253479, 'learning_rate': 7.680000000000001e-05, 'epoch': 6.64}
707
+ {'loss': 0.4052, 'grad_norm': 6.29113245010376, 'learning_rate': 7.780000000000001e-05, 'epoch': 6.72}
708
+ {'loss': 0.3218, 'grad_norm': 1.4220325946807861, 'learning_rate': 7.88e-05, 'epoch': 6.81}
709
+ {'loss': 0.3072, 'grad_norm': 2.586819648742676, 'learning_rate': 7.98e-05, 'epoch': 6.9}
710
+ 34%|█████████████████████████████████ | 800/2320 [17:30<20:39, 1.23it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
711
+ ***** Running Evaluation *****
712
+ Num examples = 1344
713
+ Batch size = 1
714
+ {'eval_loss': 0.4836220443248749, 'eval_wer': 0.5264499681325685, 'eval_runtime': 39.8762, 'eval_samples_per_second': 33.704, 'eval_steps_per_second': 33.704, 'epoch': 6.9}
715
+ 34%|█████████████████████████████████ | 800/2320 [18:10<20:39, 1.23it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-800
716
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/config.json
717
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/model.safetensors
718
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/preprocessor_config.json
719
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/tokenizer_config.json
720
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/special_tokens_map.json
721
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/added_tokens.json
722
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
723
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
724
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
725
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
726
+ {'loss': 0.3862, 'grad_norm': 1.6589460372924805, 'learning_rate': 8.080000000000001e-05, 'epoch': 6.98}
727
+ {'loss': 0.2938, 'grad_norm': 1.7299175262451172, 'learning_rate': 8.18e-05, 'epoch': 7.07}
728
+ {'loss': 0.249, 'grad_norm': 2.0545098781585693, 'learning_rate': 8.28e-05, 'epoch': 7.16}
729
+ 36%|██████████████████████████████████▋ | 837/2320 [18:46<17:32, 1.41it/s]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
730
+ return F.conv1d(input, weight, bias, self.stride,
731
+ {'loss': 0.3202, 'grad_norm': 24.935670852661133, 'learning_rate': 8.38e-05, 'epoch': 7.24}
732
+ {'loss': 0.2803, 'grad_norm': 2.497840642929077, 'learning_rate': 8.48e-05, 'epoch': 7.33}
733
+ {'loss': 0.2473, 'grad_norm': 2.698636531829834, 'learning_rate': 8.58e-05, 'epoch': 7.41}
734
+ {'loss': 0.3223, 'grad_norm': 1.4561227560043335, 'learning_rate': 8.680000000000001e-05, 'epoch': 7.5}
735
+ {'loss': 0.2481, 'grad_norm': 1.7760556936264038, 'learning_rate': 8.78e-05, 'epoch': 7.59}
736
+ {'loss': 0.2545, 'grad_norm': 2.308103084564209, 'learning_rate': 8.88e-05, 'epoch': 7.67}
737
+ {'loss': 0.332, 'grad_norm': 1.4128385782241821, 'learning_rate': 8.98e-05, 'epoch': 7.76}
738
+ 39%|█████████████████████████████████████▏ | 900/2320 [19:48<29:47, 1.26s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
739
+ ***** Running Evaluation *****
740
+ Num examples = 1344
741
+ Batch size = 1
742
+ {'eval_loss': 0.44030094146728516, 'eval_wer': 0.5233542747883092, 'eval_runtime': 39.9401, 'eval_samples_per_second': 33.65, 'eval_steps_per_second': 33.65, 'epoch': 7.76}
743
+ {'loss': 0.2411, 'grad_norm': 1.7903906106948853, 'learning_rate': 9.080000000000001e-05, 'epoch': 7.84}
744
+ {'loss': 0.2707, 'grad_norm': 2.0804216861724854, 'learning_rate': 9.180000000000001e-05, 'epoch': 7.93}
745
+ {'loss': 0.3186, 'grad_norm': 1.4420605897903442, 'learning_rate': 9.28e-05, 'epoch': 8.02}
746
+ {'loss': 0.1937, 'grad_norm': 2.2910854816436768, 'learning_rate': 9.38e-05, 'epoch': 8.1}
747
+ {'loss': 0.2321, 'grad_norm': 3.5892796516418457, 'learning_rate': 9.48e-05, 'epoch': 8.19}
748
+ {'loss': 0.2868, 'grad_norm': 1.6509956121444702, 'learning_rate': 9.58e-05, 'epoch': 8.28}
749
+ {'loss': 0.2004, 'grad_norm': 1.6983604431152344, 'learning_rate': 9.680000000000001e-05, 'epoch': 8.36}
750
+ {'loss': 0.2025, 'grad_norm': 2.061176061630249, 'learning_rate': 9.78e-05, 'epoch': 8.45}
751
+ {'loss': 0.2598, 'grad_norm': 1.7732270956039429, 'learning_rate': 9.88e-05, 'epoch': 8.53}
752
+ {'loss': 0.1876, 'grad_norm': 1.8335466384887695, 'learning_rate': 9.98e-05, 'epoch': 8.62}
753
+ 43%|████████████████████████████████████████▉ | 1000/2320 [22:05<20:18, 1.08it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
754
+ ***** Running Evaluation *****
755
+ Num examples = 1344
756
+ Batch size = 1
757
+ {'eval_loss': 0.4757933020591736, 'eval_wer': 0.5221706273331512, 'eval_runtime': 39.8291, 'eval_samples_per_second': 33.744, 'eval_steps_per_second': 33.744, 'epoch': 8.62}
758
+ {'loss': 0.2456, 'grad_norm': 2.52902889251709, 'learning_rate': 9.939393939393939e-05, 'epoch': 8.71}
759
+ {'loss': 0.2499, 'grad_norm': 1.7294162511825562, 'learning_rate': 9.863636363636364e-05, 'epoch': 8.79}
760
+ {'loss': 0.1854, 'grad_norm': 21.9121150970459, 'learning_rate': 9.787878787878789e-05, 'epoch': 8.88}
761
+ {'loss': 0.2576, 'grad_norm': 3.9164559841156006, 'learning_rate': 9.712121212121212e-05, 'epoch': 8.97}
762
+ {'loss': 0.2118, 'grad_norm': 1.239221215248108, 'learning_rate': 9.636363636363637e-05, 'epoch': 9.05}
763
+ {'loss': 0.1577, 'grad_norm': 3.1416544914245605, 'learning_rate': 9.560606060606061e-05, 'epoch': 9.14}
764
+ {'loss': 0.2092, 'grad_norm': 2.4253621101379395, 'learning_rate': 9.484848484848486e-05, 'epoch': 9.22}
765
+ {'loss': 0.1876, 'grad_norm': 1.194345474243164, 'learning_rate': 9.40909090909091e-05, 'epoch': 9.31}
766
+ {'loss': 0.1546, 'grad_norm': 2.411029100418091, 'learning_rate': 9.333333333333334e-05, 'epoch': 9.4}
767
+ {'loss': 0.2232, 'grad_norm': 3.246082067489624, 'learning_rate': 9.257575757575758e-05, 'epoch': 9.48}
768
+ 47%|█████████████████████████████████████████████ | 1100/2320 [24:18<14:01, 1.45it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
769
+ ***** Running Evaluation *****
770
+ Num examples = 1344
771
+ Batch size = 1
772
+ {'eval_loss': 0.45077577233314514, 'eval_wer': 0.48921059819721385, 'eval_runtime': 39.9221, 'eval_samples_per_second': 33.666, 'eval_steps_per_second': 33.666, 'epoch': 9.48}
773
+ {'loss': 0.1777, 'grad_norm': 1.3427454233169556, 'learning_rate': 9.181818181818183e-05, 'epoch': 9.57}
774
+ {'loss': 0.1646, 'grad_norm': 1.5090447664260864, 'learning_rate': 9.106060606060606e-05, 'epoch': 9.66}
775
+ {'loss': 0.225, 'grad_norm': 1.3060975074768066, 'learning_rate': 9.030303030303031e-05, 'epoch': 9.74}
776
+ {'loss': 0.1552, 'grad_norm': 1.3011540174484253, 'learning_rate': 8.954545454545455e-05, 'epoch': 9.83}
777
+ {'loss': 0.1715, 'grad_norm': 1.9938538074493408, 'learning_rate': 8.87878787878788e-05, 'epoch': 9.91}
778
+ {'loss': 0.2092, 'grad_norm': 3.334385395050049, 'learning_rate': 8.803030303030304e-05, 'epoch': 10.0}
779
+ {'loss': 0.14, 'grad_norm': 1.011092185974121, 'learning_rate': 8.727272727272727e-05, 'epoch': 10.09}
780
+ {'loss': 0.1512, 'grad_norm': 2.517902135848999, 'learning_rate': 8.651515151515152e-05, 'epoch': 10.17}
781
+ {'loss': 0.1846, 'grad_norm': 1.2418378591537476, 'learning_rate': 8.575757575757576e-05, 'epoch': 10.26}
782
+ {'loss': 0.1332, 'grad_norm': 1.5885329246520996, 'learning_rate': 8.5e-05, 'epoch': 10.34}
783
+ 52%|█████████████████████████████████████████████████▏ | 1200/2320 [26:37<18:40, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
784
+ ***** Running Evaluation *****
785
+ Num examples = 1344
786
+ Batch size = 1
787
+ {'eval_loss': 0.4394075274467468, 'eval_wer': 0.4740052808886461, 'eval_runtime': 39.9367, 'eval_samples_per_second': 33.653, 'eval_steps_per_second': 33.653, 'epoch': 10.34}
788
+ 52%|█████████████████████████████████████████████████▏ | 1200/2320 [27:17<18:40, 1.00s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1200
789
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/config.json
790
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/model.safetensors
791
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/preprocessor_config.json
792
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/tokenizer_config.json
793
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/special_tokens_map.json
794
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/added_tokens.json
795
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
796
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
797
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
798
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
799
+ {'loss': 0.1485, 'grad_norm': 1.2539469003677368, 'learning_rate': 8.424242424242424e-05, 'epoch': 10.43}
800
+ {'loss': 0.1988, 'grad_norm': 1.357601284980774, 'learning_rate': 8.348484848484849e-05, 'epoch': 10.52}
801
+ 53%|██████████████████████████████████████████████████▏ | 1227/2320 [27:45<19:01, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
802
+ return F.conv1d(input, weight, bias, self.stride,
803
+ {'loss': 0.137, 'grad_norm': 2.0564587116241455, 'learning_rate': 8.272727272727273e-05, 'epoch': 10.6}
804
+ {'loss': 0.1245, 'grad_norm': 2.48364520072937, 'learning_rate': 8.196969696969698e-05, 'epoch': 10.69}
805
+ {'loss': 0.1602, 'grad_norm': 1.015891671180725, 'learning_rate': 8.121212121212121e-05, 'epoch': 10.78}
806
+ {'loss': 0.1215, 'grad_norm': 1.1023950576782227, 'learning_rate': 8.045454545454546e-05, 'epoch': 10.86}
807
+ {'loss': 0.1621, 'grad_norm': 2.703427791595459, 'learning_rate': 7.96969696969697e-05, 'epoch': 10.95}
808
+ {'loss': 0.1651, 'grad_norm': 1.1821691989898682, 'learning_rate': 7.893939393939395e-05, 'epoch': 11.03}
809
+ {'loss': 0.1066, 'grad_norm': 0.930283784866333, 'learning_rate': 7.818181818181818e-05, 'epoch': 11.12}
810
+ {'loss': 0.1085, 'grad_norm': 1.6548758745193481, 'learning_rate': 7.742424242424243e-05, 'epoch': 11.21}
811
+ 56%|█████████████████████████████████████████████████████▏ | 1300/2320 [28:53<12:42, 1.34it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
812
+ ***** Running Evaluation *****
813
+ Num examples = 1344
814
+ Batch size = 1
815
+ {'eval_loss': 0.4466467499732971, 'eval_wer': 0.46207775653282346, 'eval_runtime': 39.8633, 'eval_samples_per_second': 33.715, 'eval_steps_per_second': 33.715, 'epoch': 11.21}
816
+ {'loss': 0.1418, 'grad_norm': 1.1760716438293457, 'learning_rate': 7.666666666666667e-05, 'epoch': 11.29}
817
+ {'loss': 0.1133, 'grad_norm': 2.1062755584716797, 'learning_rate': 7.59090909090909e-05, 'epoch': 11.38}
818
+ {'loss': 0.1318, 'grad_norm': 2.67399001121521, 'learning_rate': 7.515151515151515e-05, 'epoch': 11.47}
819
+ {'loss': 0.1474, 'grad_norm': 1.0049142837524414, 'learning_rate': 7.439393939393939e-05, 'epoch': 11.55}
820
+ {'loss': 0.0908, 'grad_norm': 1.586559772491455, 'learning_rate': 7.363636363636364e-05, 'epoch': 11.64}
821
+ {'loss': 0.1521, 'grad_norm': 3.784040927886963, 'learning_rate': 7.287878787878788e-05, 'epoch': 11.72}
822
+ {'loss': 0.1163, 'grad_norm': 1.125501275062561, 'learning_rate': 7.212121212121213e-05, 'epoch': 11.81}
823
+ {'loss': 0.1109, 'grad_norm': 2.1989808082580566, 'learning_rate': 7.136363636363636e-05, 'epoch': 11.9}
824
+ {'loss': 0.152, 'grad_norm': 1.1287301778793335, 'learning_rate': 7.060606060606061e-05, 'epoch': 11.98}
825
+ {'loss': 0.098, 'grad_norm': 1.538678765296936, 'learning_rate': 6.984848484848485e-05, 'epoch': 12.07}
826
+ 60%|█████████████████████████████████████████████████████████▎ | 1400/2320 [31:12<18:06, 1.18s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
827
+ ***** Running Evaluation *****
828
+ Num examples = 1344
829
+ Batch size = 1
830
+ {'eval_loss': 0.42302384972572327, 'eval_wer': 0.44933078393881454, 'eval_runtime': 40.1773, 'eval_samples_per_second': 33.452, 'eval_steps_per_second': 33.452, 'epoch': 12.07}
831
+ {'loss': 0.092, 'grad_norm': 1.400772213935852, 'learning_rate': 6.90909090909091e-05, 'epoch': 12.16}
832
+ {'loss': 0.1649, 'grad_norm': 3.6780846118927, 'learning_rate': 6.833333333333333e-05, 'epoch': 12.24}
833
+ {'loss': 0.091, 'grad_norm': 1.5424057245254517, 'learning_rate': 6.757575757575758e-05, 'epoch': 12.33}
834
+ {'loss': 0.0869, 'grad_norm': 1.4868180751800537, 'learning_rate': 6.681818181818183e-05, 'epoch': 12.41}
835
+ {'loss': 0.1499, 'grad_norm': 1.1947145462036133, 'learning_rate': 6.606060606060607e-05, 'epoch': 12.5}
836
+ {'loss': 0.0954, 'grad_norm': 1.0430784225463867, 'learning_rate': 6.530303030303032e-05, 'epoch': 12.59}
837
+ {'loss': 0.1032, 'grad_norm': 2.4261584281921387, 'learning_rate': 6.454545454545455e-05, 'epoch': 12.67}
838
+ {'loss': 0.1158, 'grad_norm': 1.033467411994934, 'learning_rate': 6.37878787878788e-05, 'epoch': 12.76}
839
+ {'loss': 0.0864, 'grad_norm': 1.1535651683807373, 'learning_rate': 6.303030303030302e-05, 'epoch': 12.84}
840
+ {'loss': 0.1219, 'grad_norm': 1.28826105594635, 'learning_rate': 6.227272727272727e-05, 'epoch': 12.93}
841
+ 65%|█████████████████████████████████████████████████████████████▍ | 1500/2320 [33:26<10:01, 1.36it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
842
+ ***** Running Evaluation *****
843
+ Num examples = 1344
844
+ Batch size = 1
845
+ {'eval_loss': 0.418023020029068, 'eval_wer': 0.44596194118182647, 'eval_runtime': 40.2192, 'eval_samples_per_second': 33.417, 'eval_steps_per_second': 33.417, 'epoch': 12.93}
846
+ {'loss': 0.1289, 'grad_norm': 1.055411458015442, 'learning_rate': 6.151515151515151e-05, 'epoch': 13.02}
847
+ {'loss': 0.0776, 'grad_norm': 1.1269094944000244, 'learning_rate': 6.075757575757576e-05, 'epoch': 13.1}
848
+ {'loss': 0.0871, 'grad_norm': 1.7149118185043335, 'learning_rate': 6e-05, 'epoch': 13.19}
849
+ {'loss': 0.1087, 'grad_norm': 1.7456856966018677, 'learning_rate': 5.9242424242424244e-05, 'epoch': 13.28}
850
+ {'loss': 0.0821, 'grad_norm': 1.3434715270996094, 'learning_rate': 5.848484848484849e-05, 'epoch': 13.36}
851
+ {'loss': 0.0878, 'grad_norm': 2.103512763977051, 'learning_rate': 5.772727272727273e-05, 'epoch': 13.45}
852
+ {'loss': 0.1044, 'grad_norm': 1.240224838256836, 'learning_rate': 5.696969696969697e-05, 'epoch': 13.53}
853
+ {'loss': 0.0753, 'grad_norm': 0.7336703538894653, 'learning_rate': 5.6212121212121215e-05, 'epoch': 13.62}
854
+ {'loss': 0.1059, 'grad_norm': 2.293342351913452, 'learning_rate': 5.545454545454546e-05, 'epoch': 13.71}
855
+ {'loss': 0.1021, 'grad_norm': 1.1853971481323242, 'learning_rate': 5.46969696969697e-05, 'epoch': 13.79}
856
+ 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [35:45<13:55, 1.16s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
857
+ ***** Running Evaluation *****
858
+ Num examples = 1344
859
+ Batch size = 1
860
+ {'eval_loss': 0.41785839200019836, 'eval_wer': 0.4405900027314941, 'eval_runtime': 40.2906, 'eval_samples_per_second': 33.358, 'eval_steps_per_second': 33.358, 'epoch': 13.79}
861
+ 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [36:25<13:55, 1.16s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1600
862
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/config.json
863
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/model.safetensors
864
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/preprocessor_config.json
865
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/tokenizer_config.json
866
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/special_tokens_map.json
867
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/added_tokens.json
868
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
869
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
870
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
871
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
872
+ Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-400] due to args.save_total_limit
873
+ {'loss': 0.0648, 'grad_norm': 1.331200361251831, 'learning_rate': 5.393939393939394e-05, 'epoch': 13.88}
874
+ {'loss': 0.1121, 'grad_norm': 2.28397536277771, 'learning_rate': 5.3181818181818186e-05, 'epoch': 13.97}
875
+ {'loss': 0.0725, 'grad_norm': 0.9436893463134766, 'learning_rate': 5.242424242424243e-05, 'epoch': 14.05}
876
+ {'loss': 0.0691, 'grad_norm': 1.6113288402557373, 'learning_rate': 5.166666666666667e-05, 'epoch': 14.14}
877
+ {'loss': 0.0979, 'grad_norm': 2.479888439178467, 'learning_rate': 5.090909090909091e-05, 'epoch': 14.22}
878
+ {'loss': 0.0909, 'grad_norm': 1.006616473197937, 'learning_rate': 5.015151515151515e-05, 'epoch': 14.31}
879
+ 72%|████████████████████████████████████████████████████████████████████ | 1663/2320 [37:27<11:20, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
880
+ return F.conv1d(input, weight, bias, self.stride,
881
+ {'loss': 0.0761, 'grad_norm': 1.4571704864501953, 'learning_rate': 4.93939393939394e-05, 'epoch': 14.4}
882
+ {'loss': 0.0862, 'grad_norm': 1.5729875564575195, 'learning_rate': 4.863636363636364e-05, 'epoch': 14.48}
883
+ {'loss': 0.0646, 'grad_norm': 1.2180376052856445, 'learning_rate': 4.787878787878788e-05, 'epoch': 14.57}
884
+ {'loss': 0.0741, 'grad_norm': 1.7464072704315186, 'learning_rate': 4.712121212121212e-05, 'epoch': 14.66}
885
+ 73%|█████████████████████████████████████████████████████████████████████▌ | 1700/2320 [38:02<08:27, 1.22it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
886
+ ***** Running Evaluation *****
887
+ Num examples = 1344
888
+ Batch size = 1
889
+ {'eval_loss': 0.4113341271877289, 'eval_wer': 0.4309387234817445, 'eval_runtime': 40.2841, 'eval_samples_per_second': 33.363, 'eval_steps_per_second': 33.363, 'epoch': 14.66}
890
+ {'loss': 0.1315, 'grad_norm': 0.8571386337280273, 'learning_rate': 4.6439393939393944e-05, 'epoch': 14.74}
891
+ {'loss': 0.0603, 'grad_norm': 1.331377387046814, 'learning_rate': 4.5681818181818186e-05, 'epoch': 14.83}
892
+ {'loss': 0.0796, 'grad_norm': 1.5398732423782349, 'learning_rate': 4.492424242424242e-05, 'epoch': 14.91}
893
+ {'loss': 0.085, 'grad_norm': 3.689671754837036, 'learning_rate': 4.4166666666666665e-05, 'epoch': 15.0}
894
+ {'loss': 0.0544, 'grad_norm': 1.132613182067871, 'learning_rate': 4.340909090909091e-05, 'epoch': 15.09}
895
+ {'loss': 0.0601, 'grad_norm': 1.5951859951019287, 'learning_rate': 4.265151515151515e-05, 'epoch': 15.17}
896
+ {'loss': 0.097, 'grad_norm': 0.5179944634437561, 'learning_rate': 4.189393939393939e-05, 'epoch': 15.26}
897
+ {'loss': 0.0596, 'grad_norm': 0.9744370579719543, 'learning_rate': 4.113636363636364e-05, 'epoch': 15.34}
898
+ {'loss': 0.0677, 'grad_norm': 1.8794275522232056, 'learning_rate': 4.0378787878787885e-05, 'epoch': 15.43}
899
+ {'loss': 0.0896, 'grad_norm': 0.748386025428772, 'learning_rate': 3.962121212121213e-05, 'epoch': 15.52}
900
+ 78%|█████████████████████████████████████████████████████████████████████████▋ | 1800/2320 [40:18<11:05, 1.28s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
901
+ ***** Running Evaluation *****
902
+ Num examples = 1344
903
+ Batch size = 1
904
+ {'eval_loss': 0.43920788168907166, 'eval_wer': 0.4307566238732587, 'eval_runtime': 40.1997, 'eval_samples_per_second': 33.433, 'eval_steps_per_second': 33.433, 'epoch': 15.52}
905
+ {'loss': 0.0604, 'grad_norm': 0.9639837145805359, 'learning_rate': 3.8863636363636364e-05, 'epoch': 15.6}
906
+ {'loss': 0.0711, 'grad_norm': 1.9640839099884033, 'learning_rate': 3.810606060606061e-05, 'epoch': 15.69}
907
+ {'loss': 0.0867, 'grad_norm': 1.4438735246658325, 'learning_rate': 3.734848484848485e-05, 'epoch': 15.78}
908
+ {'loss': 0.0605, 'grad_norm': 1.0062426328659058, 'learning_rate': 3.659090909090909e-05, 'epoch': 15.86}
909
+ {'loss': 0.0662, 'grad_norm': 1.6331523656845093, 'learning_rate': 3.5833333333333335e-05, 'epoch': 15.95}
910
+ {'loss': 0.0765, 'grad_norm': 0.8070217370986938, 'learning_rate': 3.507575757575758e-05, 'epoch': 16.03}
911
+ {'loss': 0.0537, 'grad_norm': 1.4137670993804932, 'learning_rate': 3.431818181818182e-05, 'epoch': 16.12}
912
+ {'loss': 0.0684, 'grad_norm': 1.5437769889831543, 'learning_rate': 3.356060606060606e-05, 'epoch': 16.21}
913
+ {'loss': 0.0744, 'grad_norm': 0.90281081199646, 'learning_rate': 3.2803030303030305e-05, 'epoch': 16.29}
914
+ {'loss': 0.0492, 'grad_norm': 1.139837622642517, 'learning_rate': 3.204545454545455e-05, 'epoch': 16.38}
915
+ 82%|█████████████████████████████████████████████████████████████████████████████▊ | 1900/2320 [42:36<06:26, 1.09it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
916
+ ***** Running Evaluation *****
917
+ Num examples = 1344
918
+ Batch size = 1
919
+ {'eval_loss': 0.4201890528202057, 'eval_wer': 0.4313029226987162, 'eval_runtime': 40.1502, 'eval_samples_per_second': 33.474, 'eval_steps_per_second': 33.474, 'epoch': 16.38}
920
+ {'loss': 0.0652, 'grad_norm': 1.679457426071167, 'learning_rate': 3.128787878787879e-05, 'epoch': 16.47}
921
+ {'loss': 0.0649, 'grad_norm': 0.6661111116409302, 'learning_rate': 3.0530303030303034e-05, 'epoch': 16.55}
922
+ {'loss': 0.0469, 'grad_norm': 1.1774355173110962, 'learning_rate': 2.9772727272727273e-05, 'epoch': 16.64}
923
+ {'loss': 0.0752, 'grad_norm': 1.783923864364624, 'learning_rate': 2.901515151515152e-05, 'epoch': 16.72}
924
+ {'loss': 0.0519, 'grad_norm': 1.176321268081665, 'learning_rate': 2.825757575757576e-05, 'epoch': 16.81}
925
+ {'loss': 0.0547, 'grad_norm': 1.3150608539581299, 'learning_rate': 2.7500000000000004e-05, 'epoch': 16.9}
926
+ {'loss': 0.0799, 'grad_norm': 0.983769953250885, 'learning_rate': 2.674242424242424e-05, 'epoch': 16.98}
927
+ {'loss': 0.0577, 'grad_norm': 0.996890127658844, 'learning_rate': 2.5984848484848483e-05, 'epoch': 17.07}
928
+ {'loss': 0.0515, 'grad_norm': 2.3034253120422363, 'learning_rate': 2.5227272727272726e-05, 'epoch': 17.16}
929
+ {'loss': 0.0759, 'grad_norm': 3.7528610229492188, 'learning_rate': 2.4469696969696972e-05, 'epoch': 17.24}
930
+ 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [44:50<03:48, 1.40it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
931
+ ***** Running Evaluation *****
932
+ Num examples = 1344
933
+ Batch size = 1
934
+ {'eval_loss': 0.43480169773101807, 'eval_wer': 0.4207411454065374, 'eval_runtime': 40.017, 'eval_samples_per_second': 33.586, 'eval_steps_per_second': 33.586, 'epoch': 17.24}
935
+ 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [45:30<03:48, 1.40it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-2000
936
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/config.json
937
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/model.safetensors
938
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/preprocessor_config.json
939
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/tokenizer_config.json
940
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/special_tokens_map.json
941
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/added_tokens.json
942
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
943
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
944
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
945
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
946
+ Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-800] due to args.save_total_limit
947
+ {'loss': 0.0419, 'grad_norm': 0.6646668314933777, 'learning_rate': 2.3712121212121214e-05, 'epoch': 17.33}
948
+ {'loss': 0.0595, 'grad_norm': 1.3250740766525269, 'learning_rate': 2.2954545454545457e-05, 'epoch': 17.41}
949
+ {'loss': 0.0691, 'grad_norm': 0.8094995021820068, 'learning_rate': 2.21969696969697e-05, 'epoch': 17.5}
950
+ {'loss': 0.052, 'grad_norm': 0.846946120262146, 'learning_rate': 2.143939393939394e-05, 'epoch': 17.59}
951
+ {'loss': 0.0565, 'grad_norm': 1.652417540550232, 'learning_rate': 2.0681818181818182e-05, 'epoch': 17.67}
952
+ {'loss': 0.0745, 'grad_norm': 1.0080279111862183, 'learning_rate': 1.9924242424242425e-05, 'epoch': 17.76}
953
+ 89%|████████████████████████████████████████████████████████████████████████████████████▌ | 2064/2320 [46:36<04:53, 1.15s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)
954
+ return F.conv1d(input, weight, bias, self.stride,
955
+ {'loss': 0.0513, 'grad_norm': 0.7252691388130188, 'learning_rate': 1.9166666666666667e-05, 'epoch': 17.84}
956
+ {'loss': 0.055, 'grad_norm': 1.58548903465271, 'learning_rate': 1.840909090909091e-05, 'epoch': 17.93}
957
+ {'loss': 0.0658, 'grad_norm': 0.6634634733200073, 'learning_rate': 1.7651515151515153e-05, 'epoch': 18.02}
958
+ {'loss': 0.0406, 'grad_norm': 1.1495524644851685, 'learning_rate': 1.6893939393939395e-05, 'epoch': 18.1}
959
+ 91%|█████████████████████████████████████████████████████████████████████████████████████▉ | 2100/2320 [47:11<03:46, 1.03s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
960
+ ***** Running Evaluation *****
961
+ Num examples = 1344
962
+ Batch size = 1
963
+ {'eval_loss': 0.44191813468933105, 'eval_wer': 0.42046799599380863, 'eval_runtime': 40.0967, 'eval_samples_per_second': 33.519, 'eval_steps_per_second': 33.519, 'epoch': 18.1}
964
+ {'loss': 0.0381, 'grad_norm': 0.9788354635238647, 'learning_rate': 1.6136363636363638e-05, 'epoch': 18.19}
965
+ {'loss': 0.071, 'grad_norm': 1.093633770942688, 'learning_rate': 1.5378787878787877e-05, 'epoch': 18.28}
966
+ {'loss': 0.0439, 'grad_norm': 0.7164376974105835, 'learning_rate': 1.4621212121212122e-05, 'epoch': 18.36}
967
+ {'loss': 0.0481, 'grad_norm': 0.9887032508850098, 'learning_rate': 1.3863636363636364e-05, 'epoch': 18.45}
968
+ {'loss': 0.0571, 'grad_norm': 0.45052286982536316, 'learning_rate': 1.3106060606060607e-05, 'epoch': 18.53}
969
+ {'loss': 0.0452, 'grad_norm': 1.167181134223938, 'learning_rate': 1.234848484848485e-05, 'epoch': 18.62}
970
+ {'loss': 0.0643, 'grad_norm': 1.378661870956421, 'learning_rate': 1.159090909090909e-05, 'epoch': 18.71}
971
+ {'loss': 0.0587, 'grad_norm': 0.854932963848114, 'learning_rate': 1.0833333333333334e-05, 'epoch': 18.79}
972
+ {'loss': 0.0395, 'grad_norm': 0.8007526397705078, 'learning_rate': 1.0075757575757576e-05, 'epoch': 18.88}
973
+ {'loss': 0.074, 'grad_norm': 3.317830801010132, 'learning_rate': 9.318181818181819e-06, 'epoch': 18.97}
974
+ 95%|██████████████████████████████████████████████████████████████████████████████████████████ | 2200/2320 [49:24<01:19, 1.51it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
975
+ ***** Running Evaluation *****
976
+ Num examples = 1344
977
+ Batch size = 1
978
+ {'eval_loss': 0.43061742186546326, 'eval_wer': 0.420012746972594, 'eval_runtime': 40.0034, 'eval_samples_per_second': 33.597, 'eval_steps_per_second': 33.597, 'epoch': 18.97}
979
+ {'loss': 0.046, 'grad_norm': 0.7710875272750854, 'learning_rate': 8.56060606060606e-06, 'epoch': 19.05}
980
+ {'loss': 0.0394, 'grad_norm': 0.5200530886650085, 'learning_rate': 7.803030303030304e-06, 'epoch': 19.14}
981
+ {'loss': 0.0582, 'grad_norm': 1.3544327020645142, 'learning_rate': 7.045454545454545e-06, 'epoch': 19.22}
982
+ {'loss': 0.0606, 'grad_norm': 0.8653574585914612, 'learning_rate': 6.287878787878789e-06, 'epoch': 19.31}
983
+ {'loss': 0.0367, 'grad_norm': 1.5852700471878052, 'learning_rate': 5.530303030303031e-06, 'epoch': 19.4}
984
+ {'loss': 0.0782, 'grad_norm': 2.2167246341705322, 'learning_rate': 4.772727272727273e-06, 'epoch': 19.48}
985
+ {'loss': 0.0416, 'grad_norm': 0.5891330242156982, 'learning_rate': 4.015151515151515e-06, 'epoch': 19.57}
986
+ {'loss': 0.0515, 'grad_norm': 1.1137330532073975, 'learning_rate': 3.257575757575758e-06, 'epoch': 19.66}
987
+ {'loss': 0.0512, 'grad_norm': 0.8132285475730896, 'learning_rate': 2.5e-06, 'epoch': 19.74}
988
+ {'loss': 0.0378, 'grad_norm': 0.7994781136512756, 'learning_rate': 1.7424242424242427e-06, 'epoch': 19.83}
989
+ 99%|██████████████████████████████████████████████████████████████████████████████████████████████▏| 2300/2320 [51:43<00:20, 1.02s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
990
+ ***** Running Evaluation *****
991
+ Num examples = 1344
992
+ Batch size = 1
993
+ {'eval_loss': 0.4273350238800049, 'eval_wer': 0.41728125284530637, 'eval_runtime': 40.0934, 'eval_samples_per_second': 33.522, 'eval_steps_per_second': 33.522, 'epoch': 19.83}
994
+ {'loss': 0.0489, 'grad_norm': 0.9775754809379578, 'learning_rate': 9.848484848484847e-07, 'epoch': 19.91}
995
+ {'loss': 0.0554, 'grad_norm': 0.8857516050338745, 'learning_rate': 2.2727272727272726e-07, 'epoch': 20.0}
996
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.41it/s]
997
+
998
+ Training completed. Do not forget to share your model on huggingface.co/models =)
999
+
1000
+
1001
+ {'train_runtime': 3159.4128, 'train_samples_per_second': 23.397, 'train_steps_per_second': 0.734, 'train_loss': 0.8618391515622879, 'epoch': 20.0}
1002
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.36s/it]
1003
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1004
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1005
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1006
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1007
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1008
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1009
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1010
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1011
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1012
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1013
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1014
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1015
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1016
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1017
+ events.out.tfevents.1716174523.tz579-raptorlake.65634.0: 100%|██████████████████████████████████████| 63.2k/63.2k [00:00<00:00, 232kB/s]
1018
+ model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████| 378M/378M [03:30<00:00, 1.79MB/s]
1019
+ Upload 2 LFS files: 100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [03:31<00:00, 105.69s/it]
1020
+ ***** train metrics *****████████████████████████████████████████ | 1/2 [03:31<03:31, 211.39s/it]
1021
+ epoch = 20.0
1022
+ total_flos = 2000175347GF
1023
+ train_loss = 0.8618
1024
+ train_runtime = 0:52:39.41
1025
+ train_samples = 3696
1026
+ train_samples_per_second = 23.397
1027
+ train_steps_per_second = 0.734
1028
+ 05/19/2024 23:04:57 - INFO - __main__ - *** Evaluate ***
1029
+ The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.
1030
+ ***** Running Evaluation *****
1031
+ Num examples = 1344
1032
+ Batch size = 1
1033
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:39<00:00, 34.00it/s]
1034
+ ***** eval metrics *****
1035
+ epoch = 20.0
1036
+ eval_loss = 0.4275
1037
+ eval_runtime = 0:00:39.60
1038
+ eval_samples = 1344
1039
+ eval_samples_per_second = 33.935
1040
+ eval_steps_per_second = 33.935
1041
+ eval_wer = 0.4173
1042
+ Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned
1043
+ Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json
1044
+ Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors
1045
+ Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json
1046
+ tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json
1047
+ Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json
1048
+ added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json
1049
+ events.out.tfevents.1716177937.tz579-raptorlake.65634.1: 100%|███████████████████████████████████████████| 406/406 [00:00<00:00, 884B/s]
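With training finished (final eval_wer ≈ 0.417 in the metrics above), a minimal inference sketch for the saved checkpoint follows. This is an editorial illustration, not part of the commit: the checkpoint directory ./wav2vec2-base-timit-fine-tuned comes from the log, while sample.wav (a 16 kHz mono recording) and the soundfile loader are assumptions.

import torch
import soundfile as sf  # assumed audio loader; any 16 kHz mono reader works
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor (feature extractor + tokenizer) and model saved at the end of training.
processor = Wav2Vec2Processor.from_pretrained("./wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-base-timit-fine-tuned").eval()

speech, sampling_rate = sf.read("sample.wav")  # hypothetical input file
inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits  # (batch, time, vocab)

pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding, no language model
print(processor.batch_decode(pred_ids)[0])

Since the run was launched with --push_to_hub, the local path can be replaced by the Hub repo id when decoding elsewhere.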
run.timit.sh ADDED
@@ -0,0 +1,30 @@
1
+ export HF_TOKEN=`cat /home/huggingface.token`
2
+ export HF_HOME="/home/Work/common_huggingface"
3
+
4
+ python run_speech_recognition_ctc.py \
5
+ --token="${HF_TOKEN}" \
6
+ --dataset_name="timit_asr" \
7
+ --dataset_path="/home/Work_/common_darpa/Timit_data/data" \
8
+ --model_name_or_path="facebook/wav2vec2-base" \
9
+ --overwrite_output_dir \
10
+ --output_dir="./wav2vec2-base-timit-fine-tuned" \
11
+ --train_split_name="train" \
12
+ --num_train_epochs="20" \
13
+ --per_device_train_batch_size="32" \
14
+ --per_device_eval_batch_size="1" \
15
+ --weight_decay="0.005" \
16
+ --learning_rate="1e-4" \
17
+ --warmup_steps="1000" \
18
+ --evaluation_strategy="steps" \
19
+ --text_column_name="text" \
20
+ --save_steps="400" \
21
+ --eval_steps="100" \
22
+ --logging_steps="10" \
23
+ --layerdrop="0.0" \
24
+ --save_total_limit="3" \
25
+ --freeze_feature_encoder \
26
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
27
+ --fp16 \
28
+ --group_by_length \
29
+ --push_to_hub \
30
+ --do_train --do_eval
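The flag values above also explain the step count seen in the training log. A short editorial sketch of the arithmetic (train_samples and the 2320-step total are read from the log; the ceil-per-epoch formula is the usual Trainer behaviour, stated here only for orientation):

import math

train_samples = 3696   # "train_samples = 3696" in the train metrics
batch_size = 32        # --per_device_train_batch_size
epochs = 20            # --num_train_epochs

steps_per_epoch = math.ceil(train_samples / batch_size)  # 116
total_steps = steps_per_epoch * epochs                   # 2320, matching "2320/2320" in the log
print(steps_per_epoch, total_steps)

The logged learning rates are likewise consistent with the default linear schedule: warmup to 1e-4 over the first 1000 steps, then linear decay to zero at step 2320.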
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,840 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
18
+
19
+ import functools
20
+ import json
21
+ import logging
22
+ import os
23
+ import re
24
+ import sys
25
+ import warnings
26
+ from dataclasses import dataclass, field
27
+ from typing import Dict, List, Optional, Union
28
+
29
+ import datasets
30
+ import evaluate
31
+ import torch
32
+ from datasets import DatasetDict, load_dataset
33
+
34
+ import transformers
35
+ from transformers import (
36
+ AutoConfig,
37
+ AutoFeatureExtractor,
38
+ AutoModelForCTC,
39
+ AutoProcessor,
40
+ AutoTokenizer,
41
+ HfArgumentParser,
42
+ Trainer,
43
+ TrainingArguments,
44
+ Wav2Vec2Processor,
45
+ set_seed,
46
+ )
47
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
48
+ from transformers.utils import check_min_version, send_example_telemetry
49
+ from transformers.utils.versions import require_version
50
+
51
+
52
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
53
+ check_min_version("4.41.0.dev0")
54
+
55
+ require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
+ def list_field(default=None, metadata=None):
62
+ return field(default_factory=lambda: default, metadata=metadata)
63
+
64
+
65
+ @dataclass
66
+ class ModelArguments:
67
+ """
68
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
69
+ """
70
+
71
+ model_name_or_path: str = field(
72
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
73
+ )
74
+ tokenizer_name_or_path: Optional[str] = field(
75
+ default=None,
76
+ metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
77
+ )
78
+ cache_dir: Optional[str] = field(
79
+ default=None,
80
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
81
+ )
82
+ freeze_feature_encoder: bool = field(
83
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
84
+ )
85
+ attention_dropout: float = field(
86
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
87
+ )
88
+ activation_dropout: float = field(
89
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
90
+ )
91
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
92
+ hidden_dropout: float = field(
93
+ default=0.0,
94
+ metadata={
95
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
96
+ },
97
+ )
98
+ final_dropout: float = field(
99
+ default=0.0,
100
+ metadata={"help": "The dropout probability for the final projection layer."},
101
+ )
102
+ mask_time_prob: float = field(
103
+ default=0.05,
104
+ metadata={
105
+ "help": (
106
+ "Probability of each feature vector along the time axis to be chosen as the start of the vector "
107
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
108
+ "vectors will be masked along the time axis."
109
+ )
110
+ },
111
+ )
112
+ mask_time_length: int = field(
113
+ default=10,
114
+ metadata={"help": "Length of vector span to mask along the time axis."},
115
+ )
116
+ mask_feature_prob: float = field(
117
+ default=0.0,
118
+ metadata={
119
+ "help": (
120
+ "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
121
+ " to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
122
+ " bins will be masked along the time axis."
123
+ )
124
+ },
125
+ )
126
+ mask_feature_length: int = field(
127
+ default=10,
128
+ metadata={"help": "Length of vector span to mask along the feature axis."},
129
+ )
130
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
131
+ ctc_loss_reduction: Optional[str] = field(
132
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
133
+ )
134
+ ctc_zero_infinity: Optional[bool] = field(
135
+ default=False,
136
+ metadata={
137
+ "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly"
138
+ " occur when the inputs are too short to be aligned to the targets."
139
+ },
140
+ )
141
+ add_adapter: Optional[bool] = field(
142
+ default=False,
143
+ metadata={
144
+ "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very"
145
+ "useful to downsample the output length."
146
+ },
147
+ )
148
+
149
+
150
+ @dataclass
151
+ class DataTrainingArguments:
152
+ """
153
+ Arguments pertaining to what data we are going to input our model for training and eval.
154
+
155
+ Using `HfArgumentParser` we can turn this class
156
+ into argparse arguments to be able to specify them on
157
+ the command line.
158
+ """
159
+
160
+ dataset_name: str = field(
161
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
162
+ )
163
+ dataset_path: str = field(
164
+ default=None, metadata={"help": "The configuration path of the dataset to use (via the datasets library)."}
165
+ )
166
+ dataset_config_name: str = field(
167
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
168
+ )
169
+ train_split_name: str = field(
170
+ default="train+validation",
171
+ metadata={
172
+ "help": (
173
+ "The name of the training data set split to use (via the datasets library). Defaults to "
174
+ "'train+validation'"
175
+ )
176
+ },
177
+ )
178
+ eval_split_name: str = field(
179
+ default="test",
180
+ metadata={
181
+ "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
182
+ },
183
+ )
184
+ audio_column_name: str = field(
185
+ default="audio",
186
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
187
+ )
188
+ text_column_name: str = field(
189
+ default="text",
190
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
191
+ )
192
+ overwrite_cache: bool = field(
193
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
194
+ )
195
+ preprocessing_num_workers: Optional[int] = field(
196
+ default=None,
197
+ metadata={"help": "The number of processes to use for the preprocessing."},
198
+ )
199
+ max_train_samples: Optional[int] = field(
200
+ default=None,
201
+ metadata={
202
+ "help": (
203
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
204
+ "value if set."
205
+ )
206
+ },
207
+ )
208
+ max_eval_samples: Optional[int] = field(
209
+ default=None,
210
+ metadata={
211
+ "help": (
212
+ "For debugging purposes or quicker training, truncate the number of validation examples to this "
213
+ "value if set."
214
+ )
215
+ },
216
+ )
217
+ chars_to_ignore: Optional[List[str]] = list_field(
218
+ default=None,
219
+ metadata={"help": "A list of characters to remove from the transcripts."},
220
+ )
221
+ eval_metrics: List[str] = list_field(
222
+ default=["wer"],
223
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
224
+ )
225
+ max_duration_in_seconds: float = field(
226
+ default=20.0,
227
+ metadata={
228
+ "help": (
229
+ "Filter audio files that are longer than `max_duration_in_seconds` seconds to"
230
+ " 'max_duration_in_seconds`"
231
+ )
232
+ },
233
+ )
234
+ min_duration_in_seconds: float = field(
235
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
236
+ )
237
+ preprocessing_only: bool = field(
238
+ default=False,
239
+ metadata={
240
+ "help": (
241
+ "Whether to only do data preprocessing and skip training. This is especially useful when data"
242
+ " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
243
+ " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
244
+ " can consequently be loaded in distributed training"
245
+ )
246
+ },
247
+ )
248
+ token: str = field(
249
+ default=None,
250
+ metadata={
251
+ "help": (
252
+ "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
253
+ "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
254
+ )
255
+ },
256
+ )
257
+ use_auth_token: bool = field(
258
+ default=None,
259
+ metadata={
260
+ "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
261
+ },
262
+ )
263
+ trust_remote_code: bool = field(
264
+ default=False,
265
+ metadata={
266
+ "help": (
267
+ "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
268
+ "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
269
+ "execute code present on the Hub on your local machine."
270
+ )
271
+ },
272
+ )
273
+ unk_token: str = field(
274
+ default="[UNK]",
275
+ metadata={"help": "The unk token for the tokenizer"},
276
+ )
277
+ pad_token: str = field(
278
+ default="[PAD]",
279
+ metadata={"help": "The padding token for the tokenizer"},
280
+ )
281
+ word_delimiter_token: str = field(
282
+ default="|",
283
+ metadata={"help": "The word delimiter token for the tokenizer"},
284
+ )
285
+ phoneme_language: Optional[str] = field(
286
+ default=None,
287
+ metadata={
288
+ "help": (
289
+ "The target language that should be used be"
290
+ " passed to the tokenizer for tokenization. Note that"
291
+ " this is only relevant if the model classifies the"
292
+ " input audio to a sequence of phoneme sequences."
293
+ )
294
+ },
295
+ )
296
+
297
+
298
+ @dataclass
299
+ class DataCollatorCTCWithPadding:
300
+ """
301
+ Data collator that will dynamically pad the inputs received.
302
+ Args:
303
+ processor (:class:`~transformers.AutoProcessor`)
304
+ The processor used for processing the data.
305
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
306
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
307
+ among:
308
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
309
+ sequence is provided).
310
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
311
+ maximum acceptable input length for the model if that argument is not provided.
312
+ * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
313
+ different lengths).
314
+ max_length (:obj:`int`, `optional`):
315
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
316
+ max_length_labels (:obj:`int`, `optional`):
317
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
318
+ pad_to_multiple_of (:obj:`int`, `optional`):
319
+ If set will pad the sequence to a multiple of the provided value.
320
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
321
+ 7.5 (Volta).
322
+ """
323
+
324
+ processor: AutoProcessor
325
+ padding: Union[bool, str] = "longest"
326
+ pad_to_multiple_of: Optional[int] = None
327
+ pad_to_multiple_of_labels: Optional[int] = None
328
+ feature_extractor_input_name: Optional[str] = "input_values"
329
+
330
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
331
+ # split inputs and labels since they have to be of different lengths and need
332
+ # different padding methods
333
+ input_features = [
334
+ {self.feature_extractor_input_name: feature[self.feature_extractor_input_name]} for feature in features
335
+ ]
336
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
337
+
338
+ batch = self.processor.pad(
339
+ input_features,
340
+ padding=self.padding,
341
+ pad_to_multiple_of=self.pad_to_multiple_of,
342
+ return_tensors="pt",
343
+ )
344
+
345
+ labels_batch = self.processor.pad(
346
+ labels=label_features,
347
+ padding=self.padding,
348
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
349
+ return_tensors="pt",
350
+ )
351
+
352
+ # replace padding with -100 to ignore loss correctly
353
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
354
+
355
+ batch["labels"] = labels
356
+ if "attention_mask" in batch:
357
+ batch["attention_mask"] = batch["attention_mask"].to(torch.long)
358
+
359
+ return batch
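# Editor's note (minimal usage sketch, assuming `processor` is the Wav2Vec2Processor assembled
# later in this script; not part of the original commit):
#     collator = DataCollatorCTCWithPadding(processor=processor)
#     batch = collator([{"input_values": [0.0] * 16000, "labels": [5, 2, 9]},
#                       {"input_values": [0.0] * 8000,  "labels": [7]}])
# "input_values" is padded to the longest waveform in the list, while padded label positions are
# replaced by -100 so that the CTC loss ignores them.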
360
+
361
+
362
+ def create_vocabulary_from_data(
363
+ datasets: DatasetDict,
364
+ word_delimiter_token: Optional[str] = None,
365
+ unk_token: Optional[str] = None,
366
+ pad_token: Optional[str] = None,
367
+ ):
368
+ # Given training and test labels create vocabulary
369
+ def extract_all_chars(batch):
370
+ all_text = " ".join(batch["target_text"])
371
+ vocab = list(set(all_text))
372
+ return {"vocab": [vocab], "all_text": [all_text]}
373
+
374
+ vocabs = datasets.map(
375
+ extract_all_chars,
376
+ batched=True,
377
+ batch_size=-1,
378
+ keep_in_memory=True,
379
+ remove_columns=datasets["train"].column_names,
380
+ )
381
+
382
+ # take union of all unique characters in each dataset
383
+ vocab_set = functools.reduce(
384
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
385
+ )
386
+
387
+ vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}
388
+
389
+ # replace white space with delimiter token
390
+ if word_delimiter_token is not None:
391
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
392
+ del vocab_dict[" "]
393
+
394
+ # add unk and pad token
395
+ if unk_token is not None:
396
+ vocab_dict[unk_token] = len(vocab_dict)
397
+
398
+ if pad_token is not None:
399
+ vocab_dict[pad_token] = len(vocab_dict)
400
+
401
+ return vocab_dict
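# Editor's note (toy illustration, not TIMIT data): for the target texts ["a cat", "a dog"] this
# function would return
#     {"|": 0, "a": 1, "c": 2, "d": 3, "g": 4, "o": 5, "t": 6, "[UNK]": 7, "[PAD]": 8}
# with the defaults used here: the space character keeps its index but is renamed to the word
# delimiter "|", and the unk/pad tokens are appended at the end.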
402
+
403
+
404
+ def main():
405
+ # See all possible arguments in src/transformers/training_args.py
406
+ # or by passing the --help flag to this script.
407
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
408
+
409
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
410
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
411
+ # If we pass only one argument to the script and it's the path to a json file,
412
+ # let's parse it to get our arguments.
413
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
414
+ else:
415
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
416
+
417
+ if data_args.use_auth_token is not None:
418
+ warnings.warn(
419
+ "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
420
+ FutureWarning,
421
+ )
422
+ if data_args.token is not None:
423
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
424
+ data_args.token = data_args.use_auth_token
425
+
426
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
427
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
428
+ send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)
429
+
430
+ # Detecting last checkpoint.
431
+ last_checkpoint = None
432
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
433
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
434
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
435
+ raise ValueError(
436
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
437
+ "Use --overwrite_output_dir to overcome."
438
+ )
439
+ elif last_checkpoint is not None:
440
+ logger.info(
441
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
442
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
443
+ )
444
+
445
+ # Setup logging
446
+ logging.basicConfig(
447
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
448
+ datefmt="%m/%d/%Y %H:%M:%S",
449
+ handlers=[logging.StreamHandler(sys.stdout)],
450
+ )
451
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
452
+
453
+ # Log a short summary on each process:
454
+ logger.warning(
455
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
456
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
457
+ )
458
+ # Set the verbosity to info of the Transformers logger (on main process only):
459
+ if is_main_process(training_args.local_rank):
460
+ transformers.utils.logging.set_verbosity_info()
461
+ logger.info("Training/evaluation parameters %s", training_args)
462
+
463
+ # Set seed before initializing model.
464
+ set_seed(training_args.seed)
465
+
466
+ # 1. First, let's load the dataset
467
+ raw_datasets = DatasetDict()
468
+
469
+ if training_args.do_train:
470
+ raw_datasets["train"] = load_dataset(
471
+ data_args.dataset_name,
472
+ data_args.dataset_config_name,
473
+ data_dir=data_args.dataset_path,
474
+ split=data_args.train_split_name,
475
+ token=data_args.token,
476
+ )
477
+
478
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
479
+ raise ValueError(
480
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
481
+ " Make sure to set `--audio_column_name` to the correct audio column - one of"
482
+ f" {', '.join(raw_datasets['train'].column_names)}."
483
+ )
484
+
485
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
486
+ raise ValueError(
487
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
488
+ "Make sure to set `--text_column_name` to the correct text column - one of "
489
+ f"{', '.join(raw_datasets['train'].column_names)}."
490
+ )
491
+
492
+ if data_args.max_train_samples is not None:
493
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
494
+
495
+ if training_args.do_eval:
496
+ raw_datasets["eval"] = load_dataset(
497
+ data_args.dataset_name,
498
+ data_args.dataset_config_name,
499
+ data_dir=data_args.dataset_path,
500
+ split=data_args.eval_split_name,
501
+ token=data_args.token,
502
+ )
503
+
504
+ if data_args.max_eval_samples is not None:
505
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
506
+
507
+ # 2. We remove some special characters from the datasets
508
+ # that make training complicated and do not help in transcribing the speech
509
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
510
+ # that could be easily picked up by the model
511
+ chars_to_ignore_regex = (
512
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
513
+ )
514
+ text_column_name = data_args.text_column_name
515
+
516
+ def remove_special_characters(batch):
517
+ if chars_to_ignore_regex is not None:
518
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
519
+ else:
520
+ batch["target_text"] = batch[text_column_name].lower() + " "
521
+ return batch
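# Editor's note (worked example): with the chars_to_ignore list passed in run.timit.sh
# (, ? . ! - ; : " “ % ‘ ” �), a TIMIT prompt such as
#     "Don't ask me to carry an oily rag like that."
# becomes "don't ask me to carry an oily rag like that " (punctuation stripped, lower-cased,
# and a trailing space appended).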
522
+
523
+ with training_args.main_process_first(desc="dataset map special characters removal"):
524
+ raw_datasets = raw_datasets.map(
525
+ remove_special_characters,
526
+ remove_columns=[text_column_name],
527
+ desc="remove special characters from datasets",
528
+ )
529
+
530
+ # save special tokens for tokenizer
531
+ word_delimiter_token = data_args.word_delimiter_token
532
+ unk_token = data_args.unk_token
533
+ pad_token = data_args.pad_token
534
+
535
+ # 3. Next, let's load the config as we might need it to create
536
+ # the tokenizer
537
+ # load config
538
+ config = AutoConfig.from_pretrained(
539
+ model_args.model_name_or_path,
540
+ cache_dir=model_args.cache_dir,
541
+ token=data_args.token,
542
+ trust_remote_code=data_args.trust_remote_code,
543
+ )
544
+
545
+ # 4. Next, if no tokenizer file is defined,
546
+ # we create the vocabulary of the model by extracting all unique characters from
547
+ # the training and evaluation datasets
548
+ # We need to make sure that only the first rank saves the vocabulary
549
+ # make sure all processes wait until vocab is created
550
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
551
+ tokenizer_kwargs = {}
552
+ if tokenizer_name_or_path is None:
553
+ # save vocab in training output dir
554
+ tokenizer_name_or_path = training_args.output_dir
555
+
556
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
557
+
558
+ with training_args.main_process_first():
559
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
560
+ try:
561
+ os.remove(vocab_file)
562
+ except OSError:
563
+ # in shared file-systems it might be the case that
564
+ # two processes try to delete the vocab file at the same time
565
+ pass
566
+
567
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
568
+ if not os.path.isfile(vocab_file):
569
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
570
+ vocab_dict = create_vocabulary_from_data(
571
+ raw_datasets,
572
+ word_delimiter_token=word_delimiter_token,
573
+ unk_token=unk_token,
574
+ pad_token=pad_token,
575
+ )
576
+
577
+ # save vocab dict to be loaded into tokenizer
578
+ with open(vocab_file, "w") as file:
579
+ json.dump(vocab_dict, file)
580
+
581
+ # if tokenizer has just been created
582
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
583
+ tokenizer_kwargs = {
584
+ "config": config if config.tokenizer_class is not None else None,
585
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
586
+ "unk_token": unk_token,
587
+ "pad_token": pad_token,
588
+ "word_delimiter_token": word_delimiter_token,
589
+ }
590
+
591
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
592
+ # Note for distributed training, the .from_pretrained methods guarantee that only
593
+ # one local process can concurrently download model & vocab.
594
+
595
+ # load feature_extractor and tokenizer
596
+ tokenizer = AutoTokenizer.from_pretrained(
597
+ tokenizer_name_or_path,
598
+ token=data_args.token,
599
+ trust_remote_code=data_args.trust_remote_code,
600
+ **tokenizer_kwargs,
601
+ )
602
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
603
+ model_args.model_name_or_path,
604
+ cache_dir=model_args.cache_dir,
605
+ token=data_args.token,
606
+ trust_remote_code=data_args.trust_remote_code,
607
+ )
608
+
609
+ # adapt config
610
+ config.update(
611
+ {
612
+ "feat_proj_dropout": model_args.feat_proj_dropout,
613
+ "attention_dropout": model_args.attention_dropout,
614
+ "hidden_dropout": model_args.hidden_dropout,
615
+ "final_dropout": model_args.final_dropout,
616
+ "mask_time_prob": model_args.mask_time_prob,
617
+ "mask_time_length": model_args.mask_time_length,
618
+ "mask_feature_prob": model_args.mask_feature_prob,
619
+ "mask_feature_length": model_args.mask_feature_length,
620
+ "gradient_checkpointing": training_args.gradient_checkpointing,
621
+ "layerdrop": model_args.layerdrop,
622
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
623
+ "ctc_zero_infinity": model_args.ctc_zero_infinity,
624
+ "pad_token_id": tokenizer.pad_token_id,
625
+ "vocab_size": len(tokenizer),
626
+ "activation_dropout": model_args.activation_dropout,
627
+ "add_adapter": model_args.add_adapter,
628
+ }
629
+ )
630
+
631
+ # create model
632
+ model = AutoModelForCTC.from_pretrained(
633
+ model_args.model_name_or_path,
634
+ cache_dir=model_args.cache_dir,
635
+ config=config,
636
+ token=data_args.token,
637
+ trust_remote_code=data_args.trust_remote_code,
638
+ )
639
+
640
+ # freeze encoder
641
+ if model_args.freeze_feature_encoder:
642
+ model.freeze_feature_encoder()
643
+
644
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
645
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
646
+ # so that we just need to set the correct target sampling rate and normalize the input
647
+ # via the `feature_extractor`
648
+
649
+ # make sure that dataset decodes audio with correct sampling rate
650
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
651
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
652
+ raw_datasets = raw_datasets.cast_column(
653
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
654
+ )
655
+
656
+ # derive max & min input length for sample rate & max duration
657
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
658
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
659
+ audio_column_name = data_args.audio_column_name
660
+ num_workers = data_args.preprocessing_num_workers
661
+ feature_extractor_input_name = feature_extractor.model_input_names[0]
662
+
663
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
664
+ phoneme_language = data_args.phoneme_language
665
+
666
+ # Preprocessing the datasets.
667
+ # We need to read the audio files as arrays and tokenize the targets.
668
+ def prepare_dataset(batch):
669
+ # load audio
670
+ sample = batch[audio_column_name]
671
+
672
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
673
+ batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0]
674
+ # take length of raw audio waveform
675
+ batch["input_length"] = len(sample["array"].squeeze())
676
+
677
+ # encode targets
678
+ additional_kwargs = {}
679
+ if phoneme_language is not None:
680
+ additional_kwargs["phonemizer_lang"] = phoneme_language
681
+
682
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
683
+ return batch
684
+
685
+ with training_args.main_process_first(desc="dataset map preprocessing"):
686
+ vectorized_datasets = raw_datasets.map(
687
+ prepare_dataset,
688
+ remove_columns=next(iter(raw_datasets.values())).column_names,
689
+ num_proc=num_workers,
690
+ desc="preprocess datasets",
691
+ )
692
+
693
+ def is_audio_in_length_range(length):
694
+ return length > min_input_length and length < max_input_length
695
+
696
+ # filter data that is shorter than min_input_length
697
+ vectorized_datasets = vectorized_datasets.filter(
698
+ is_audio_in_length_range,
699
+ num_proc=num_workers,
700
+ input_columns=["input_length"],
701
+ )
702
+
703
+ # 7. Next, we can prepare the training.
704
+ # Let's use word error rate (WER) as our evaluation metric,
705
+ # instantiate a data collator and the trainer
706
+
707
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
708
+ eval_metrics = {metric: evaluate.load(metric, cache_dir=model_args.cache_dir) for metric in data_args.eval_metrics}
709
+
710
+ # for large datasets it is advised to run the preprocessing on a
711
+ # single machine first with ``args.preprocessing_only`` since there will most likely
712
+ # be a timeout when running the script in distributed mode.
713
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
714
+ # cached dataset
715
+ if data_args.preprocessing_only:
716
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
717
+ return
718
+
719
+ # For languages like Chinese with large vocabulary size, we need to discard logits
720
+ # and only keep the argmax, otherwise we run out of memory during evaluation.
721
+ def preprocess_logits_for_metrics(logits, labels):
722
+ pred_ids = torch.argmax(logits, dim=-1)
723
+ return pred_ids, labels
724
+
725
+ def compute_metrics(pred):
726
+ pred_ids = pred.predictions[0]
727
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
728
+
729
+ pred_str = tokenizer.batch_decode(pred_ids)
730
+ # we do not want to group tokens when computing the metrics
731
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
732
+
733
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
734
+
735
+ return metrics
736
+
737
+ # Now save everything to be able to create a single processor later
738
+ # make sure all processes wait until data is saved
739
+ with training_args.main_process_first():
740
+ # only the main process saves them
741
+ if is_main_process(training_args.local_rank):
742
+ # save feature extractor, tokenizer and config
743
+ feature_extractor.save_pretrained(training_args.output_dir)
744
+ tokenizer.save_pretrained(training_args.output_dir)
745
+ config.save_pretrained(training_args.output_dir)
746
+
747
+ try:
748
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
749
+ except (OSError, KeyError):
750
+ warnings.warn(
751
+ "Loading a processor from a feature extractor config that does not"
752
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
753
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
754
+ " `'processor_class': 'Wav2Vec2Processor'`",
755
+ FutureWarning,
756
+ )
757
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
758
+
759
+ # Instantiate custom data collator
760
+ data_collator = DataCollatorCTCWithPadding(
761
+ processor=processor, feature_extractor_input_name=feature_extractor_input_name
762
+ )
763
+
764
+ # Initialize Trainer
765
+ trainer = Trainer(
766
+ model=model,
767
+ data_collator=data_collator,
768
+ args=training_args,
769
+ compute_metrics=compute_metrics,
770
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
771
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
772
+ tokenizer=processor,
773
+ preprocess_logits_for_metrics=preprocess_logits_for_metrics,
774
+ )
775
+
776
+ # 8. Finally, we can start training
777
+
778
+ # Training
779
+ if training_args.do_train:
780
+ # use last checkpoint if it exists
781
+ if last_checkpoint is not None:
782
+ checkpoint = last_checkpoint
783
+ elif os.path.isdir(model_args.model_name_or_path):
784
+ checkpoint = model_args.model_name_or_path
785
+ else:
786
+ checkpoint = None
787
+
788
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
789
+ trainer.save_model()
790
+
791
+ metrics = train_result.metrics
792
+ max_train_samples = (
793
+ data_args.max_train_samples
794
+ if data_args.max_train_samples is not None
795
+ else len(vectorized_datasets["train"])
796
+ )
797
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
798
+
799
+ trainer.log_metrics("train", metrics)
800
+ trainer.save_metrics("train", metrics)
801
+ trainer.save_state()
802
+
803
+ # Evaluation
804
+ results = {}
805
+ if training_args.do_eval:
806
+ logger.info("*** Evaluate ***")
807
+ metrics = trainer.evaluate()
808
+ max_eval_samples = (
809
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
810
+ )
811
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
812
+
813
+ trainer.log_metrics("eval", metrics)
814
+ trainer.save_metrics("eval", metrics)
815
+
816
+ # Write model card and (optionally) push to hub
817
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
818
+ kwargs = {
819
+ "finetuned_from": model_args.model_name_or_path,
820
+ "tasks": "automatic-speech-recognition",
821
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
822
+ "dataset_args": (
823
+ f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
824
+ f" {data_args.eval_split_name}"
825
+ ),
826
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
827
+ }
828
+ if "common_voice" in data_args.dataset_name:
829
+ kwargs["language"] = config_name
830
+
831
+ if training_args.push_to_hub:
832
+ trainer.push_to_hub(**kwargs)
833
+ else:
834
+ trainer.create_model_card(**kwargs)
835
+
836
+ return results
837
+
838
+
839
+ if __name__ == "__main__":
840
+ main()
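
For orientation, here is a minimal inference sketch (not part of the commit) showing how the processor and model that this script saves to `--output_dir` can be reloaded for greedy CTC decoding. The local directory name and the `transcribe` helper are illustrative assumptions.

```python
# Minimal sketch (assumption: the script above finished with
# --output_dir ./wav2vec2-base-timit-fine-tuned, as elsewhere in this repo).
import torch
from transformers import AutoModelForCTC, AutoProcessor

model_dir = "./wav2vec2-base-timit-fine-tuned"      # assumed output_dir
processor = AutoProcessor.from_pretrained(model_dir)
model = AutoModelForCTC.from_pretrained(model_dir).eval()

def transcribe(waveform, sampling_rate=16000):
    # waveform: 1-D float array at 16 kHz, matching feature_extractor.sampling_rate
    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits             # (batch, time, vocab_size)
    pred_ids = torch.argmax(logits, dim=-1)         # greedy decoding, as in compute_metrics above
    return processor.batch_decode(pred_ids)[0]      # CTC collapse + [PAD] removal
```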
run_speech_recognition_ctc.py. ADDED
@@ -0,0 +1,835 @@
(835-line backup copy of run_speech_recognition_ctc.py; its content duplicates the script shown above and is omitted here)
runs/May24_15-21-50_tz579-raptorlake/events.out.tfevents.1716583096.tz579-raptorlake.20455.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71563281c4fabcd575cd0a3087d26a44f9ce3cb361c20f297e548acb0eb445c9
3
+ size 6192
runs/May24_15-39-25_tz579-raptorlake/events.out.tfevents.1716583898.tz579-raptorlake.21170.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953c2638529ff4539eb1a8e0ae75d76c54e1cbca06486770b8edf490b9a48786
3
+ size 6192
runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f56de21e92a371394b396346e90360ea59c7a18cf2c51d858605250a065be8d4
3
+ size 6192
runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17633b209ac376da406114e746fca837219aa83d66d90d0aaf0816712bc41868
3
+ size 6192
runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074142b6f3a30c0127fde6336a36a48e016a9bad0bd48ad40f6d74dec816ce41
3
+ size 6192
runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2079110f867071fbc495cd436b0a1360f44dbf8fcc3c41a0217289c6def7d32c
3
+ size 6604
runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca4979180e1ca2d32ca5566488d6178248d8aa0bd1caf46ed9f6285c6d11f7d
3
+ size 6811
runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301dd9f07f5890a3822f9edc73ca7e5b85cd3e4d20c17b4e2068856f1b89aaf2
3
+ size 6604
runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34d45df376f452d62294b65ea3032948330a86c9f276f172cc5c542030498b41
3
+ size 6604
runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c9eba6b7b28381ba8f17287db0093268fa02468f97b2be4a3d643ef2cb185d
3
+ size 158868
runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3f1c7a3afdbaaf70f0ab6eb3465364ec36b297ba72135f39057eeb3a379306d
3
+ size 21715
runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a39bb0aaef6ce9ea8f32d9ffba51fae1a21efd988361cd138b3bad7827ab8bc0
3
+ size 16542
runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dc9f6dad36f586790d3e4754501f93d810f582a8e97a2dd84ae3cc2683a992e
3
+ size 2705590
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": true,
19
+ "normalized": false,
20
+ "rstrip": true,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "[UNK]",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": true,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "28": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "29": {
12
+ "content": "[PAD]",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "30": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "31": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "[PAD]",
42
+ "processor_class": "Wav2Vec2Processor",
43
+ "replace_word_delimiter_char": " ",
44
+ "target_lang": null,
45
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
+ "unk_token": "[UNK]",
47
+ "word_delimiter_token": "|"
48
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ae2085582750eed1574146e140f321d91e803d129c7d445814e499b412abc85
3
+ size 5048
vocab.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "@": 1,
3
+ "[PAD]": 29,
4
+ "[UNK]": 28,
5
+ "a": 2,
6
+ "b": 3,
7
+ "c": 4,
8
+ "d": 5,
9
+ "e": 6,
10
+ "f": 7,
11
+ "g": 8,
12
+ "h": 9,
13
+ "i": 10,
14
+ "j": 11,
15
+ "k": 12,
16
+ "l": 13,
17
+ "m": 14,
18
+ "n": 15,
19
+ "o": 16,
20
+ "p": 17,
21
+ "q": 18,
22
+ "r": 19,
23
+ "s": 20,
24
+ "t": 21,
25
+ "u": 22,
26
+ "v": 23,
27
+ "w": 24,
28
+ "x": 25,
29
+ "y": 26,
30
+ "z": 27,
31
+ "|": 0
32
+ }
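
This is the character vocabulary produced by `create_vocabulary_from_data` in the training script: `|` (the word delimiter) is id 0, `@` is 1, `a` through `z` are 2 to 27, and `[UNK]`/`[PAD]` are appended as 28/29. A small sketch (file path illustrative) of loading such a file into a CTC tokenizer:

```python
# Sketch: turning the vocab.json above into a character-level CTC tokenizer.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",                    # assumed local copy of the file above
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
ids = tokenizer("she had your dark suit").input_ids          # spaces become "|" (id 0)
print(tokenizer.decode(ids, group_tokens=False))             # -> "she had your dark suit"
```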
wav2vec2-base-timit-fine-tuned./README.md ADDED
@@ -0,0 +1,101 @@
1
+ ---
2
+ license: apache-2.0
3
+ base_model: facebook/wav2vec2-base
4
+ tags:
5
+ - automatic-speech-recognition
6
+ - timit_asr
7
+ - generated_from_trainer
8
+ datasets:
9
+ - timit_asr
10
+ metrics:
11
+ - wer
12
+ model-index:
13
+ - name: wav2vec2-base-timit-fine-tuned
14
+ results:
15
+ - task:
16
+ name: Automatic Speech Recognition
17
+ type: automatic-speech-recognition
18
+ dataset:
19
+ name: TIMIT_ASR - NA
20
+ type: timit_asr
21
+ config: clean
22
+ split: test
23
+ args: 'Config: na, Training split: train, Eval split: test'
24
+ metrics:
25
+ - name: Wer
26
+ type: wer
27
+ value: 0.41728125284530637
28
+ ---
29
+
30
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
31
+ should probably proofread and complete it, then remove this comment. -->
32
+
33
+ # wav2vec2-base-timit-fine-tuned
34
+
35
+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset.
36
+ It achieves the following results on the evaluation set:
37
+ - Loss: 0.4275
38
+ - Wer: 0.4173
39
+
40
+ ## Model description
41
+
42
+ More information needed
43
+
44
+ ## Intended uses & limitations
45
+
46
+ More information needed
47
+
48
+ ## Training and evaluation data
49
+
50
+ More information needed
51
+
52
+ ## Training procedure
53
+
54
+ ### Training hyperparameters
55
+
56
+ The following hyperparameters were used during training:
57
+ - learning_rate: 0.0001
58
+ - train_batch_size: 32
59
+ - eval_batch_size: 1
60
+ - seed: 42
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: linear
63
+ - lr_scheduler_warmup_steps: 1000
64
+ - num_epochs: 20.0
65
+ - mixed_precision_training: Native AMP
66
+
67
+ ### Training results
68
+
69
+ | Training Loss | Epoch | Step | Validation Loss | Wer |
70
+ |:-------------:|:-------:|:----:|:---------------:|:------:|
71
+ | 3.1618 | 0.8621 | 100 | 3.1117 | 1.0 |
72
+ | 2.9798 | 1.7241 | 200 | 2.9736 | 1.0 |
73
+ | 2.9144 | 2.5862 | 300 | 2.9075 | 1.0 |
74
+ | 2.1714 | 3.4483 | 400 | 2.0945 | 1.0325 |
75
+ | 1.1579 | 4.3103 | 500 | 1.0451 | 0.8299 |
76
+ | 0.6087 | 5.1724 | 600 | 0.6754 | 0.6441 |
77
+ | 0.481 | 6.0345 | 700 | 0.5275 | 0.5761 |
78
+ | 0.3072 | 6.8966 | 800 | 0.4836 | 0.5264 |
79
+ | 0.332 | 7.7586 | 900 | 0.4403 | 0.5234 |
80
+ | 0.1876 | 8.6207 | 1000 | 0.4758 | 0.5222 |
81
+ | 0.2232 | 9.4828 | 1100 | 0.4508 | 0.4892 |
82
+ | 0.1332 | 10.3448 | 1200 | 0.4394 | 0.4740 |
83
+ | 0.1085 | 11.2069 | 1300 | 0.4466 | 0.4621 |
84
+ | 0.098 | 12.0690 | 1400 | 0.4230 | 0.4493 |
85
+ | 0.1219 | 12.9310 | 1500 | 0.4180 | 0.4460 |
86
+ | 0.1021 | 13.7931 | 1600 | 0.4179 | 0.4406 |
87
+ | 0.0741 | 14.6552 | 1700 | 0.4113 | 0.4309 |
88
+ | 0.0896 | 15.5172 | 1800 | 0.4392 | 0.4308 |
89
+ | 0.0492 | 16.3793 | 1900 | 0.4202 | 0.4313 |
90
+ | 0.0759 | 17.2414 | 2000 | 0.4348 | 0.4207 |
91
+ | 0.0406 | 18.1034 | 2100 | 0.4419 | 0.4205 |
92
+ | 0.074 | 18.9655 | 2200 | 0.4306 | 0.4200 |
93
+ | 0.0378 | 19.8276 | 2300 | 0.4273 | 0.4173 |
94
+
95
+
96
+ ### Framework versions
97
+
98
+ - Transformers 4.42.0.dev0
99
+ - Pytorch 2.3.0.post300
100
+ - Datasets 2.19.1
101
+ - Tokenizers 0.19.1
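
The auto-generated card above stops at the framework versions; as a usage hint (not part of the generated card), a one-liner through the ASR pipeline. The local path and audio file name are placeholders.

```python
from transformers import pipeline

# Placeholder paths: a local clone of this checkpoint and any audio file readable by ffmpeg.
asr = pipeline("automatic-speech-recognition", model="./wav2vec2-base-timit-fine-tuned")
print(asr("sample.wav")["text"])
```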
wav2vec2-base-timit-fine-tuned./added_tokens.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "</s>": 30,
3
+ "<s>": 29
4
+ }
wav2vec2-base-timit-fine-tuned./all_results.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 0.42749759554862976,
4
+ "eval_runtime": 39.6053,
5
+ "eval_samples": 1344,
6
+ "eval_samples_per_second": 33.935,
7
+ "eval_steps_per_second": 33.935,
8
+ "eval_wer": 0.41728125284530637,
9
+ "total_flos": 2.1476719263248095e+18,
10
+ "train_loss": 0.8618391515622879,
11
+ "train_runtime": 3159.4128,
12
+ "train_samples": 3696,
13
+ "train_samples_per_second": 23.397,
14
+ "train_steps_per_second": 0.734
15
+ }
wav2vec2-base-timit-fine-tuned./config.json ADDED
@@ -0,0 +1,119 @@
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "freeze_feat_extract_train": true,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.0,
59
+ "hidden_size": 768,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 3072,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.0,
64
+ "mask_channel_length": 10,
65
+ "mask_channel_min_space": 1,
66
+ "mask_channel_other": 0.0,
67
+ "mask_channel_prob": 0.0,
68
+ "mask_channel_selection": "static",
69
+ "mask_feature_length": 10,
70
+ "mask_feature_min_masks": 0,
71
+ "mask_feature_prob": 0.0,
72
+ "mask_time_length": 10,
73
+ "mask_time_min_masks": 2,
74
+ "mask_time_min_space": 1,
75
+ "mask_time_other": 0.0,
76
+ "mask_time_prob": 0.05,
77
+ "mask_time_selection": "static",
78
+ "model_type": "wav2vec2",
79
+ "no_mask_channel_overlap": false,
80
+ "no_mask_time_overlap": false,
81
+ "num_adapter_layers": 3,
82
+ "num_attention_heads": 12,
83
+ "num_codevector_groups": 2,
84
+ "num_codevectors_per_group": 320,
85
+ "num_conv_pos_embedding_groups": 16,
86
+ "num_conv_pos_embeddings": 128,
87
+ "num_feat_extract_layers": 7,
88
+ "num_hidden_layers": 12,
89
+ "num_negatives": 100,
90
+ "output_hidden_size": 768,
91
+ "pad_token_id": 28,
92
+ "proj_codevector_dim": 256,
93
+ "tdnn_dilation": [
94
+ 1,
95
+ 2,
96
+ 3,
97
+ 1,
98
+ 1
99
+ ],
100
+ "tdnn_dim": [
101
+ 512,
102
+ 512,
103
+ 512,
104
+ 512,
105
+ 1500
106
+ ],
107
+ "tdnn_kernel": [
108
+ 5,
109
+ 3,
110
+ 3,
111
+ 1,
112
+ 1
113
+ ],
114
+ "torch_dtype": "float32",
115
+ "transformers_version": "4.42.0.dev0",
116
+ "use_weighted_layer_sum": false,
117
+ "vocab_size": 31,
118
+ "xvector_output_dim": 512
119
+ }
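
The CTC-specific fields above (`ctc_loss_reduction`, `pad_token_id`, `vocab_size`, the masking probabilities) are the ones the training script writes into the base config via `config.update(...)`. A sketch reproducing that step by hand, with values taken from this file:

```python
# Sketch: adapting the base config the same way the script's config.update(...) does.
from transformers import AutoConfig, AutoModelForCTC

config = AutoConfig.from_pretrained("facebook/wav2vec2-base")
config.update(
    {
        "ctc_loss_reduction": "mean",   # value in this config.json
        "pad_token_id": 28,             # id of [PAD] in the fine-tuned vocab
        "vocab_size": 31,               # size of the CTC output layer
        "mask_time_prob": 0.05,
        "layerdrop": 0.0,
    }
)
# a fresh, randomly initialized CTC head of size 31 is added on top of the pretrained encoder
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base", config=config)
```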
wav2vec2-base-timit-fine-tuned./eval_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 0.42749759554862976,
4
+ "eval_runtime": 39.6053,
5
+ "eval_samples": 1344,
6
+ "eval_samples_per_second": 33.935,
7
+ "eval_steps_per_second": 33.935,
8
+ "eval_wer": 0.41728125284530637
9
+ }
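
`eval_wer` here is the word error rate returned by the script's `compute_metrics`, which wraps the `evaluate` package; a toy sketch of the same metric call (strings are made up):

```python
import evaluate

wer_metric = evaluate.load("wer")
# the script passes tokenizer.batch_decode outputs as predictions and references
print(wer_metric.compute(predictions=["she had your dark suit"],
                         references=["she had your dark suit in"]))   # 1 error / 6 words ~= 0.167
```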
wav2vec2-base-timit-fine-tuned./preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
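
These settings mean the feature extractor expects 16 kHz mono audio and z-normalizes each utterance (`do_normalize: true`) without returning an attention mask. A tiny sketch with dummy audio; the local checkpoint path is an assumption:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("./wav2vec2-base-timit-fine-tuned")  # assumed path
speech = np.random.randn(16000).astype(np.float32)        # 1 second of dummy 16 kHz audio
batch = fe(speech, sampling_rate=16000, return_tensors="pt")
print(batch.input_values.shape)                           # torch.Size([1, 16000])
print(float(batch.input_values.mean()))                   # ~0.0 after per-utterance normalization
```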
wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1499de7f8d44ad8690a4fee9818a4ec46085f303e71f1d916a3979f95334b4f
3
+ size 63169
wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:761f8f6656c0c227f5c72fd2abed63841c5757356b4cb775dfa24da593234fff
+ size 406
wav2vec2-base-timit-fine-tuned./special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false
+ }
+ }
wav2vec2-base-timit-fine-tuned./tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+ "added_tokens_decoder": {
+ "27": {
+ "content": "[UNK]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": false
+ },
+ "28": {
+ "content": "[PAD]",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": false
+ },
+ "29": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "30": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "do_lower_case": false,
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "[PAD]",
+ "processor_class": "Wav2Vec2Processor",
+ "replace_word_delimiter_char": " ",
+ "target_lang": null,
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
+ "unk_token": "[UNK]",
+ "word_delimiter_token": "|"
+ }
wav2vec2-base-timit-fine-tuned./train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 20.0,
+ "total_flos": 2.1476719263248095e+18,
+ "train_loss": 0.8618391515622879,
+ "train_runtime": 3159.4128,
+ "train_samples": 3696,
+ "train_samples_per_second": 23.397,
+ "train_steps_per_second": 0.734
+ }
wav2vec2-base-timit-fine-tuned./trainer_state.json ADDED
@@ -0,0 +1,1873 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "eval_steps": 100,
6
+ "global_step": 2320,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08620689655172414,
13
+ "grad_norm": 9.595185279846191,
14
+ "learning_rate": 9e-07,
15
+ "loss": 9.1142,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.1724137931034483,
20
+ "grad_norm": 9.732986450195312,
21
+ "learning_rate": 1.9e-06,
22
+ "loss": 8.3446,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.25862068965517243,
27
+ "grad_norm": 14.272214889526367,
28
+ "learning_rate": 2.8000000000000003e-06,
29
+ "loss": 8.6592,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.3448275862068966,
34
+ "grad_norm": 15.0160493850708,
35
+ "learning_rate": 3.8e-06,
36
+ "loss": 7.6985,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.43103448275862066,
41
+ "grad_norm": 16.610979080200195,
42
+ "learning_rate": 4.800000000000001e-06,
43
+ "loss": 6.9688,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.5172413793103449,
48
+ "grad_norm": 17.26924705505371,
49
+ "learning_rate": 5.8e-06,
50
+ "loss": 6.232,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.603448275862069,
55
+ "grad_norm": 11.347734451293945,
56
+ "learning_rate": 6.800000000000001e-06,
57
+ "loss": 4.7271,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.6896551724137931,
62
+ "grad_norm": 4.237112045288086,
63
+ "learning_rate": 7.8e-06,
64
+ "loss": 3.7919,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.7758620689655172,
69
+ "grad_norm": 1.8833028078079224,
70
+ "learning_rate": 8.8e-06,
71
+ "loss": 3.3967,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.8620689655172413,
76
+ "grad_norm": 1.3788093328475952,
77
+ "learning_rate": 9.800000000000001e-06,
78
+ "loss": 3.1618,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.8620689655172413,
83
+ "eval_loss": 3.1117007732391357,
84
+ "eval_runtime": 40.0512,
85
+ "eval_samples_per_second": 33.557,
86
+ "eval_steps_per_second": 33.557,
87
+ "eval_wer": 1.0,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.9482758620689655,
92
+ "grad_norm": 1.729278802871704,
93
+ "learning_rate": 1.08e-05,
94
+ "loss": 3.0865,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.0344827586206897,
99
+ "grad_norm": 1.905969500541687,
100
+ "learning_rate": 1.18e-05,
101
+ "loss": 3.0809,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 1.1206896551724137,
106
+ "grad_norm": 0.8360918760299683,
107
+ "learning_rate": 1.2800000000000001e-05,
108
+ "loss": 3.0346,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 1.206896551724138,
113
+ "grad_norm": 0.7653716206550598,
114
+ "learning_rate": 1.3800000000000002e-05,
115
+ "loss": 3.0106,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 1.293103448275862,
120
+ "grad_norm": 0.94779372215271,
121
+ "learning_rate": 1.48e-05,
122
+ "loss": 3.0165,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 1.3793103448275863,
127
+ "grad_norm": 0.8457741737365723,
128
+ "learning_rate": 1.58e-05,
129
+ "loss": 3.0,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 1.4655172413793103,
134
+ "grad_norm": 1.4369837045669556,
135
+ "learning_rate": 1.6800000000000002e-05,
136
+ "loss": 2.9903,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 1.5517241379310345,
141
+ "grad_norm": 1.8290436267852783,
142
+ "learning_rate": 1.78e-05,
143
+ "loss": 2.9852,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.6379310344827587,
148
+ "grad_norm": 1.1530190706253052,
149
+ "learning_rate": 1.88e-05,
150
+ "loss": 2.99,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 1.7241379310344827,
155
+ "grad_norm": 1.1261711120605469,
156
+ "learning_rate": 1.9800000000000004e-05,
157
+ "loss": 2.9798,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 1.7241379310344827,
162
+ "eval_loss": 2.9736363887786865,
163
+ "eval_runtime": 39.6236,
164
+ "eval_samples_per_second": 33.919,
165
+ "eval_steps_per_second": 33.919,
166
+ "eval_wer": 1.0,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 1.8103448275862069,
171
+ "grad_norm": 0.903380811214447,
172
+ "learning_rate": 2.08e-05,
173
+ "loss": 2.9718,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 1.896551724137931,
178
+ "grad_norm": 0.4889620244503021,
179
+ "learning_rate": 2.18e-05,
180
+ "loss": 2.9766,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 1.9827586206896552,
185
+ "grad_norm": 1.3861790895462036,
186
+ "learning_rate": 2.2800000000000002e-05,
187
+ "loss": 2.9658,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 2.0689655172413794,
192
+ "grad_norm": 0.7976490259170532,
193
+ "learning_rate": 2.38e-05,
194
+ "loss": 2.9588,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 2.1551724137931036,
199
+ "grad_norm": 0.698798418045044,
200
+ "learning_rate": 2.48e-05,
201
+ "loss": 2.9523,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 2.2413793103448274,
206
+ "grad_norm": 1.0858148336410522,
207
+ "learning_rate": 2.58e-05,
208
+ "loss": 2.9496,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 2.3275862068965516,
213
+ "grad_norm": 0.5658290386199951,
214
+ "learning_rate": 2.6800000000000004e-05,
215
+ "loss": 2.9421,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 2.413793103448276,
220
+ "grad_norm": 0.5713534355163574,
221
+ "learning_rate": 2.7800000000000005e-05,
222
+ "loss": 2.9427,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 2.5,
227
+ "grad_norm": 0.7386118769645691,
228
+ "learning_rate": 2.88e-05,
229
+ "loss": 2.9228,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 2.586206896551724,
234
+ "grad_norm": 0.767816960811615,
235
+ "learning_rate": 2.98e-05,
236
+ "loss": 2.9144,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 2.586206896551724,
241
+ "eval_loss": 2.9074809551239014,
242
+ "eval_runtime": 39.8997,
243
+ "eval_samples_per_second": 33.684,
244
+ "eval_steps_per_second": 33.684,
245
+ "eval_wer": 1.0,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 2.6724137931034484,
250
+ "grad_norm": 0.8676608204841614,
251
+ "learning_rate": 3.08e-05,
252
+ "loss": 2.8965,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 2.7586206896551726,
257
+ "grad_norm": 1.6954621076583862,
258
+ "learning_rate": 3.18e-05,
259
+ "loss": 2.8815,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 2.844827586206897,
264
+ "grad_norm": 1.1631884574890137,
265
+ "learning_rate": 3.2800000000000004e-05,
266
+ "loss": 2.855,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 2.9310344827586206,
271
+ "grad_norm": 1.625454306602478,
272
+ "learning_rate": 3.38e-05,
273
+ "loss": 2.781,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 3.0172413793103448,
278
+ "grad_norm": 2.0763564109802246,
279
+ "learning_rate": 3.48e-05,
280
+ "loss": 2.7756,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 3.103448275862069,
285
+ "grad_norm": 2.036031723022461,
286
+ "learning_rate": 3.58e-05,
287
+ "loss": 2.6458,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 3.189655172413793,
292
+ "grad_norm": 1.366801142692566,
293
+ "learning_rate": 3.68e-05,
294
+ "loss": 2.5189,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 3.2758620689655173,
299
+ "grad_norm": 2.034527540206909,
300
+ "learning_rate": 3.7800000000000004e-05,
301
+ "loss": 2.433,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 3.3620689655172415,
306
+ "grad_norm": 3.8338165283203125,
307
+ "learning_rate": 3.88e-05,
308
+ "loss": 2.2885,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 3.4482758620689653,
313
+ "grad_norm": 2.3443217277526855,
314
+ "learning_rate": 3.9800000000000005e-05,
315
+ "loss": 2.1714,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 3.4482758620689653,
320
+ "eval_loss": 2.0944502353668213,
321
+ "eval_runtime": 39.7668,
322
+ "eval_samples_per_second": 33.797,
323
+ "eval_steps_per_second": 33.797,
324
+ "eval_wer": 1.0325047801147227,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 3.5344827586206895,
329
+ "grad_norm": 4.349735260009766,
330
+ "learning_rate": 4.08e-05,
331
+ "loss": 2.0881,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 3.6206896551724137,
336
+ "grad_norm": 2.450747489929199,
337
+ "learning_rate": 4.18e-05,
338
+ "loss": 1.9522,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 3.706896551724138,
343
+ "grad_norm": 2.2519729137420654,
344
+ "learning_rate": 4.2800000000000004e-05,
345
+ "loss": 1.8395,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 3.793103448275862,
350
+ "grad_norm": 2.693664789199829,
351
+ "learning_rate": 4.38e-05,
352
+ "loss": 1.7525,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 3.8793103448275863,
357
+ "grad_norm": 1.9744929075241089,
358
+ "learning_rate": 4.4800000000000005e-05,
359
+ "loss": 1.6222,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 3.9655172413793105,
364
+ "grad_norm": 3.802494764328003,
365
+ "learning_rate": 4.58e-05,
366
+ "loss": 1.5397,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 4.051724137931035,
371
+ "grad_norm": 2.301044225692749,
372
+ "learning_rate": 4.6800000000000006e-05,
373
+ "loss": 1.4376,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 4.137931034482759,
378
+ "grad_norm": 2.279372215270996,
379
+ "learning_rate": 4.78e-05,
380
+ "loss": 1.2829,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 4.224137931034483,
385
+ "grad_norm": 3.314736843109131,
386
+ "learning_rate": 4.88e-05,
387
+ "loss": 1.1976,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 4.310344827586207,
392
+ "grad_norm": 2.434694290161133,
393
+ "learning_rate": 4.9800000000000004e-05,
394
+ "loss": 1.1579,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 4.310344827586207,
399
+ "eval_loss": 1.045101284980774,
400
+ "eval_runtime": 39.7455,
401
+ "eval_samples_per_second": 33.815,
402
+ "eval_steps_per_second": 33.815,
403
+ "eval_wer": 0.8299189656742239,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 4.396551724137931,
408
+ "grad_norm": 1.8384031057357788,
409
+ "learning_rate": 5.08e-05,
410
+ "loss": 1.0684,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 4.482758620689655,
415
+ "grad_norm": 3.599148988723755,
416
+ "learning_rate": 5.1800000000000005e-05,
417
+ "loss": 1.0319,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 4.568965517241379,
422
+ "grad_norm": 2.066476583480835,
423
+ "learning_rate": 5.28e-05,
424
+ "loss": 0.9179,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 4.655172413793103,
429
+ "grad_norm": 2.2173750400543213,
430
+ "learning_rate": 5.380000000000001e-05,
431
+ "loss": 0.8838,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 4.741379310344827,
436
+ "grad_norm": 2.427091121673584,
437
+ "learning_rate": 5.4800000000000004e-05,
438
+ "loss": 0.8991,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 4.827586206896552,
443
+ "grad_norm": 2.7432241439819336,
444
+ "learning_rate": 5.580000000000001e-05,
445
+ "loss": 0.8,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 4.913793103448276,
450
+ "grad_norm": 3.254221200942993,
451
+ "learning_rate": 5.68e-05,
452
+ "loss": 0.7803,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 5.0,
457
+ "grad_norm": 4.457448482513428,
458
+ "learning_rate": 5.7799999999999995e-05,
459
+ "loss": 0.8205,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 5.086206896551724,
464
+ "grad_norm": 3.1023166179656982,
465
+ "learning_rate": 5.88e-05,
466
+ "loss": 0.6703,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 5.172413793103448,
471
+ "grad_norm": 2.5916504859924316,
472
+ "learning_rate": 5.9800000000000003e-05,
473
+ "loss": 0.6087,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 5.172413793103448,
478
+ "eval_loss": 0.6753795146942139,
479
+ "eval_runtime": 39.7485,
480
+ "eval_samples_per_second": 33.813,
481
+ "eval_steps_per_second": 33.813,
482
+ "eval_wer": 0.6440863152144223,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 5.258620689655173,
487
+ "grad_norm": 2.1707613468170166,
488
+ "learning_rate": 6.08e-05,
489
+ "loss": 0.6569,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 5.344827586206897,
494
+ "grad_norm": 2.4291555881500244,
495
+ "learning_rate": 6.18e-05,
496
+ "loss": 0.5627,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 5.431034482758621,
501
+ "grad_norm": 2.249617338180542,
502
+ "learning_rate": 6.280000000000001e-05,
503
+ "loss": 0.5381,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 5.517241379310345,
508
+ "grad_norm": 1.6661946773529053,
509
+ "learning_rate": 6.38e-05,
510
+ "loss": 0.6338,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 5.603448275862069,
515
+ "grad_norm": 2.60294771194458,
516
+ "learning_rate": 6.48e-05,
517
+ "loss": 0.5181,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 5.689655172413794,
522
+ "grad_norm": 3.3003089427948,
523
+ "learning_rate": 6.58e-05,
524
+ "loss": 0.5189,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 5.775862068965517,
529
+ "grad_norm": 1.880764126777649,
530
+ "learning_rate": 6.680000000000001e-05,
531
+ "loss": 0.564,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 5.862068965517241,
536
+ "grad_norm": 2.0575127601623535,
537
+ "learning_rate": 6.780000000000001e-05,
538
+ "loss": 0.4729,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 5.948275862068965,
543
+ "grad_norm": 2.5159761905670166,
544
+ "learning_rate": 6.879999999999999e-05,
545
+ "loss": 0.4899,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 6.0344827586206895,
550
+ "grad_norm": 1.4463504552841187,
551
+ "learning_rate": 6.98e-05,
552
+ "loss": 0.481,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 6.0344827586206895,
557
+ "eval_loss": 0.5275412201881409,
558
+ "eval_runtime": 39.9601,
559
+ "eval_samples_per_second": 33.634,
560
+ "eval_steps_per_second": 33.634,
561
+ "eval_wer": 0.5760721114449604,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 6.120689655172414,
566
+ "grad_norm": 1.788765549659729,
567
+ "learning_rate": 7.08e-05,
568
+ "loss": 0.3865,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 6.206896551724138,
573
+ "grad_norm": 1.862762212753296,
574
+ "learning_rate": 7.18e-05,
575
+ "loss": 0.3726,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 6.293103448275862,
580
+ "grad_norm": 1.6512093544006348,
581
+ "learning_rate": 7.280000000000001e-05,
582
+ "loss": 0.4116,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 6.379310344827586,
587
+ "grad_norm": 2.098067045211792,
588
+ "learning_rate": 7.38e-05,
589
+ "loss": 0.3779,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 6.4655172413793105,
594
+ "grad_norm": 3.3030078411102295,
595
+ "learning_rate": 7.48e-05,
596
+ "loss": 0.3728,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 6.551724137931035,
601
+ "grad_norm": 2.1799120903015137,
602
+ "learning_rate": 7.58e-05,
603
+ "loss": 0.4047,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 6.637931034482759,
608
+ "grad_norm": 1.862434983253479,
609
+ "learning_rate": 7.680000000000001e-05,
610
+ "loss": 0.313,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 6.724137931034483,
615
+ "grad_norm": 6.29113245010376,
616
+ "learning_rate": 7.780000000000001e-05,
617
+ "loss": 0.4052,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 6.810344827586206,
622
+ "grad_norm": 1.4220325946807861,
623
+ "learning_rate": 7.88e-05,
624
+ "loss": 0.3218,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 6.896551724137931,
629
+ "grad_norm": 2.586819648742676,
630
+ "learning_rate": 7.98e-05,
631
+ "loss": 0.3072,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 6.896551724137931,
636
+ "eval_loss": 0.4836220443248749,
637
+ "eval_runtime": 39.8762,
638
+ "eval_samples_per_second": 33.704,
639
+ "eval_steps_per_second": 33.704,
640
+ "eval_wer": 0.5264499681325685,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 6.982758620689655,
645
+ "grad_norm": 1.6589460372924805,
646
+ "learning_rate": 8.080000000000001e-05,
647
+ "loss": 0.3862,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 7.068965517241379,
652
+ "grad_norm": 1.7299175262451172,
653
+ "learning_rate": 8.18e-05,
654
+ "loss": 0.2938,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 7.155172413793103,
659
+ "grad_norm": 2.0545098781585693,
660
+ "learning_rate": 8.28e-05,
661
+ "loss": 0.249,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 7.241379310344827,
666
+ "grad_norm": 24.935670852661133,
667
+ "learning_rate": 8.38e-05,
668
+ "loss": 0.3202,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 7.327586206896552,
673
+ "grad_norm": 2.497840642929077,
674
+ "learning_rate": 8.48e-05,
675
+ "loss": 0.2803,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 7.413793103448276,
680
+ "grad_norm": 2.698636531829834,
681
+ "learning_rate": 8.58e-05,
682
+ "loss": 0.2473,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 7.5,
687
+ "grad_norm": 1.4561227560043335,
688
+ "learning_rate": 8.680000000000001e-05,
689
+ "loss": 0.3223,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 7.586206896551724,
694
+ "grad_norm": 1.7760556936264038,
695
+ "learning_rate": 8.78e-05,
696
+ "loss": 0.2481,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 7.672413793103448,
701
+ "grad_norm": 2.308103084564209,
702
+ "learning_rate": 8.88e-05,
703
+ "loss": 0.2545,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 7.758620689655173,
708
+ "grad_norm": 1.4128385782241821,
709
+ "learning_rate": 8.98e-05,
710
+ "loss": 0.332,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 7.758620689655173,
715
+ "eval_loss": 0.44030094146728516,
716
+ "eval_runtime": 39.9401,
717
+ "eval_samples_per_second": 33.65,
718
+ "eval_steps_per_second": 33.65,
719
+ "eval_wer": 0.5233542747883092,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 7.844827586206897,
724
+ "grad_norm": 1.7903906106948853,
725
+ "learning_rate": 9.080000000000001e-05,
726
+ "loss": 0.2411,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 7.931034482758621,
731
+ "grad_norm": 2.0804216861724854,
732
+ "learning_rate": 9.180000000000001e-05,
733
+ "loss": 0.2707,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 8.017241379310345,
738
+ "grad_norm": 1.4420605897903442,
739
+ "learning_rate": 9.28e-05,
740
+ "loss": 0.3186,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 8.10344827586207,
745
+ "grad_norm": 2.2910854816436768,
746
+ "learning_rate": 9.38e-05,
747
+ "loss": 0.1937,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 8.189655172413794,
752
+ "grad_norm": 3.5892796516418457,
753
+ "learning_rate": 9.48e-05,
754
+ "loss": 0.2321,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 8.275862068965518,
759
+ "grad_norm": 1.6509956121444702,
760
+ "learning_rate": 9.58e-05,
761
+ "loss": 0.2868,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 8.362068965517242,
766
+ "grad_norm": 1.6983604431152344,
767
+ "learning_rate": 9.680000000000001e-05,
768
+ "loss": 0.2004,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 8.448275862068966,
773
+ "grad_norm": 2.061176061630249,
774
+ "learning_rate": 9.78e-05,
775
+ "loss": 0.2025,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 8.53448275862069,
780
+ "grad_norm": 1.7732270956039429,
781
+ "learning_rate": 9.88e-05,
782
+ "loss": 0.2598,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 8.620689655172415,
787
+ "grad_norm": 1.8335466384887695,
788
+ "learning_rate": 9.98e-05,
789
+ "loss": 0.1876,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 8.620689655172415,
794
+ "eval_loss": 0.4757933020591736,
795
+ "eval_runtime": 39.8291,
796
+ "eval_samples_per_second": 33.744,
797
+ "eval_steps_per_second": 33.744,
798
+ "eval_wer": 0.5221706273331512,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 8.706896551724139,
803
+ "grad_norm": 2.52902889251709,
804
+ "learning_rate": 9.939393939393939e-05,
805
+ "loss": 0.2456,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 8.793103448275861,
810
+ "grad_norm": 1.7294162511825562,
811
+ "learning_rate": 9.863636363636364e-05,
812
+ "loss": 0.2499,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 8.879310344827585,
817
+ "grad_norm": 21.9121150970459,
818
+ "learning_rate": 9.787878787878789e-05,
819
+ "loss": 0.1854,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 8.96551724137931,
824
+ "grad_norm": 3.9164559841156006,
825
+ "learning_rate": 9.712121212121212e-05,
826
+ "loss": 0.2576,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 9.051724137931034,
831
+ "grad_norm": 1.239221215248108,
832
+ "learning_rate": 9.636363636363637e-05,
833
+ "loss": 0.2118,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 9.137931034482758,
838
+ "grad_norm": 3.1416544914245605,
839
+ "learning_rate": 9.560606060606061e-05,
840
+ "loss": 0.1577,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 9.224137931034482,
845
+ "grad_norm": 2.4253621101379395,
846
+ "learning_rate": 9.484848484848486e-05,
847
+ "loss": 0.2092,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 9.310344827586206,
852
+ "grad_norm": 1.194345474243164,
853
+ "learning_rate": 9.40909090909091e-05,
854
+ "loss": 0.1876,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 9.39655172413793,
859
+ "grad_norm": 2.411029100418091,
860
+ "learning_rate": 9.333333333333334e-05,
861
+ "loss": 0.1546,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 9.482758620689655,
866
+ "grad_norm": 3.246082067489624,
867
+ "learning_rate": 9.257575757575758e-05,
868
+ "loss": 0.2232,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 9.482758620689655,
873
+ "eval_loss": 0.45077577233314514,
874
+ "eval_runtime": 39.9221,
875
+ "eval_samples_per_second": 33.666,
876
+ "eval_steps_per_second": 33.666,
877
+ "eval_wer": 0.48921059819721385,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 9.568965517241379,
882
+ "grad_norm": 1.3427454233169556,
883
+ "learning_rate": 9.181818181818183e-05,
884
+ "loss": 0.1777,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 9.655172413793103,
889
+ "grad_norm": 1.5090447664260864,
890
+ "learning_rate": 9.106060606060606e-05,
891
+ "loss": 0.1646,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 9.741379310344827,
896
+ "grad_norm": 1.3060975074768066,
897
+ "learning_rate": 9.030303030303031e-05,
898
+ "loss": 0.225,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 9.827586206896552,
903
+ "grad_norm": 1.3011540174484253,
904
+ "learning_rate": 8.954545454545455e-05,
905
+ "loss": 0.1552,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 9.913793103448276,
910
+ "grad_norm": 1.9938538074493408,
911
+ "learning_rate": 8.87878787878788e-05,
912
+ "loss": 0.1715,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 10.0,
917
+ "grad_norm": 3.334385395050049,
918
+ "learning_rate": 8.803030303030304e-05,
919
+ "loss": 0.2092,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 10.086206896551724,
924
+ "grad_norm": 1.011092185974121,
925
+ "learning_rate": 8.727272727272727e-05,
926
+ "loss": 0.14,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 10.172413793103448,
931
+ "grad_norm": 2.517902135848999,
932
+ "learning_rate": 8.651515151515152e-05,
933
+ "loss": 0.1512,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 10.258620689655173,
938
+ "grad_norm": 1.2418378591537476,
939
+ "learning_rate": 8.575757575757576e-05,
940
+ "loss": 0.1846,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 10.344827586206897,
945
+ "grad_norm": 1.5885329246520996,
946
+ "learning_rate": 8.5e-05,
947
+ "loss": 0.1332,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 10.344827586206897,
952
+ "eval_loss": 0.4394075274467468,
953
+ "eval_runtime": 39.9367,
954
+ "eval_samples_per_second": 33.653,
955
+ "eval_steps_per_second": 33.653,
956
+ "eval_wer": 0.4740052808886461,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 10.431034482758621,
961
+ "grad_norm": 1.2539469003677368,
962
+ "learning_rate": 8.424242424242424e-05,
963
+ "loss": 0.1485,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 10.517241379310345,
968
+ "grad_norm": 1.357601284980774,
969
+ "learning_rate": 8.348484848484849e-05,
970
+ "loss": 0.1988,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 10.60344827586207,
975
+ "grad_norm": 2.0564587116241455,
976
+ "learning_rate": 8.272727272727273e-05,
977
+ "loss": 0.137,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 10.689655172413794,
982
+ "grad_norm": 2.48364520072937,
983
+ "learning_rate": 8.196969696969698e-05,
984
+ "loss": 0.1245,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 10.775862068965518,
989
+ "grad_norm": 1.015891671180725,
990
+ "learning_rate": 8.121212121212121e-05,
991
+ "loss": 0.1602,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 10.862068965517242,
996
+ "grad_norm": 1.1023950576782227,
997
+ "learning_rate": 8.045454545454546e-05,
998
+ "loss": 0.1215,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 10.948275862068966,
1003
+ "grad_norm": 2.703427791595459,
1004
+ "learning_rate": 7.96969696969697e-05,
1005
+ "loss": 0.1621,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 11.03448275862069,
1010
+ "grad_norm": 1.1821691989898682,
1011
+ "learning_rate": 7.893939393939395e-05,
1012
+ "loss": 0.1651,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 11.120689655172415,
1017
+ "grad_norm": 0.930283784866333,
1018
+ "learning_rate": 7.818181818181818e-05,
1019
+ "loss": 0.1066,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 11.206896551724139,
1024
+ "grad_norm": 1.6548758745193481,
1025
+ "learning_rate": 7.742424242424243e-05,
1026
+ "loss": 0.1085,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 11.206896551724139,
1031
+ "eval_loss": 0.4466467499732971,
1032
+ "eval_runtime": 39.8633,
1033
+ "eval_samples_per_second": 33.715,
1034
+ "eval_steps_per_second": 33.715,
1035
+ "eval_wer": 0.46207775653282346,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 11.293103448275861,
1040
+ "grad_norm": 1.1760716438293457,
1041
+ "learning_rate": 7.666666666666667e-05,
1042
+ "loss": 0.1418,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 11.379310344827585,
1047
+ "grad_norm": 2.1062755584716797,
1048
+ "learning_rate": 7.59090909090909e-05,
1049
+ "loss": 0.1133,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 11.46551724137931,
1054
+ "grad_norm": 2.67399001121521,
1055
+ "learning_rate": 7.515151515151515e-05,
1056
+ "loss": 0.1318,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 11.551724137931034,
1061
+ "grad_norm": 1.0049142837524414,
1062
+ "learning_rate": 7.439393939393939e-05,
1063
+ "loss": 0.1474,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 11.637931034482758,
1068
+ "grad_norm": 1.586559772491455,
1069
+ "learning_rate": 7.363636363636364e-05,
1070
+ "loss": 0.0908,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 11.724137931034482,
1075
+ "grad_norm": 3.784040927886963,
1076
+ "learning_rate": 7.287878787878788e-05,
1077
+ "loss": 0.1521,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 11.810344827586206,
1082
+ "grad_norm": 1.125501275062561,
1083
+ "learning_rate": 7.212121212121213e-05,
1084
+ "loss": 0.1163,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 11.89655172413793,
1089
+ "grad_norm": 2.1989808082580566,
1090
+ "learning_rate": 7.136363636363636e-05,
1091
+ "loss": 0.1109,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 11.982758620689655,
1096
+ "grad_norm": 1.1287301778793335,
1097
+ "learning_rate": 7.060606060606061e-05,
1098
+ "loss": 0.152,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 12.068965517241379,
1103
+ "grad_norm": 1.538678765296936,
1104
+ "learning_rate": 6.984848484848485e-05,
1105
+ "loss": 0.098,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 12.068965517241379,
1110
+ "eval_loss": 0.42302384972572327,
1111
+ "eval_runtime": 40.1773,
1112
+ "eval_samples_per_second": 33.452,
1113
+ "eval_steps_per_second": 33.452,
1114
+ "eval_wer": 0.44933078393881454,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 12.155172413793103,
1119
+ "grad_norm": 1.400772213935852,
1120
+ "learning_rate": 6.90909090909091e-05,
1121
+ "loss": 0.092,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 12.241379310344827,
1126
+ "grad_norm": 3.6780846118927,
1127
+ "learning_rate": 6.833333333333333e-05,
1128
+ "loss": 0.1649,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 12.327586206896552,
1133
+ "grad_norm": 1.5424057245254517,
1134
+ "learning_rate": 6.757575757575758e-05,
1135
+ "loss": 0.091,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 12.413793103448276,
1140
+ "grad_norm": 1.4868180751800537,
1141
+ "learning_rate": 6.681818181818183e-05,
1142
+ "loss": 0.0869,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 12.5,
1147
+ "grad_norm": 1.1947145462036133,
1148
+ "learning_rate": 6.606060606060607e-05,
1149
+ "loss": 0.1499,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 12.586206896551724,
1154
+ "grad_norm": 1.0430784225463867,
1155
+ "learning_rate": 6.530303030303032e-05,
1156
+ "loss": 0.0954,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 12.672413793103448,
1161
+ "grad_norm": 2.4261584281921387,
1162
+ "learning_rate": 6.454545454545455e-05,
1163
+ "loss": 0.1032,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 12.758620689655173,
1168
+ "grad_norm": 1.033467411994934,
1169
+ "learning_rate": 6.37878787878788e-05,
1170
+ "loss": 0.1158,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 12.844827586206897,
1175
+ "grad_norm": 1.1535651683807373,
1176
+ "learning_rate": 6.303030303030302e-05,
1177
+ "loss": 0.0864,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 12.931034482758621,
1182
+ "grad_norm": 1.28826105594635,
1183
+ "learning_rate": 6.227272727272727e-05,
1184
+ "loss": 0.1219,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 12.931034482758621,
1189
+ "eval_loss": 0.418023020029068,
1190
+ "eval_runtime": 40.2192,
1191
+ "eval_samples_per_second": 33.417,
1192
+ "eval_steps_per_second": 33.417,
1193
+ "eval_wer": 0.44596194118182647,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 13.017241379310345,
1198
+ "grad_norm": 1.055411458015442,
1199
+ "learning_rate": 6.151515151515151e-05,
1200
+ "loss": 0.1289,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 13.10344827586207,
1205
+ "grad_norm": 1.1269094944000244,
1206
+ "learning_rate": 6.075757575757576e-05,
1207
+ "loss": 0.0776,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 13.189655172413794,
1212
+ "grad_norm": 1.7149118185043335,
1213
+ "learning_rate": 6e-05,
1214
+ "loss": 0.0871,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 13.275862068965518,
1219
+ "grad_norm": 1.7456856966018677,
1220
+ "learning_rate": 5.9242424242424244e-05,
1221
+ "loss": 0.1087,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 13.362068965517242,
1226
+ "grad_norm": 1.3434715270996094,
1227
+ "learning_rate": 5.848484848484849e-05,
1228
+ "loss": 0.0821,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 13.448275862068966,
1233
+ "grad_norm": 2.103512763977051,
1234
+ "learning_rate": 5.772727272727273e-05,
1235
+ "loss": 0.0878,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 13.53448275862069,
1240
+ "grad_norm": 1.240224838256836,
1241
+ "learning_rate": 5.696969696969697e-05,
1242
+ "loss": 0.1044,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 13.620689655172415,
1247
+ "grad_norm": 0.7336703538894653,
1248
+ "learning_rate": 5.6212121212121215e-05,
1249
+ "loss": 0.0753,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 13.706896551724139,
1254
+ "grad_norm": 2.293342351913452,
1255
+ "learning_rate": 5.545454545454546e-05,
1256
+ "loss": 0.1059,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 13.793103448275861,
1261
+ "grad_norm": 1.1853971481323242,
1262
+ "learning_rate": 5.46969696969697e-05,
1263
+ "loss": 0.1021,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 13.793103448275861,
1268
+ "eval_loss": 0.41785839200019836,
1269
+ "eval_runtime": 40.2906,
1270
+ "eval_samples_per_second": 33.358,
1271
+ "eval_steps_per_second": 33.358,
1272
+ "eval_wer": 0.4405900027314941,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 13.879310344827585,
1277
+ "grad_norm": 1.331200361251831,
1278
+ "learning_rate": 5.393939393939394e-05,
1279
+ "loss": 0.0648,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 13.96551724137931,
1284
+ "grad_norm": 2.28397536277771,
1285
+ "learning_rate": 5.3181818181818186e-05,
1286
+ "loss": 0.1121,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 14.051724137931034,
1291
+ "grad_norm": 0.9436893463134766,
1292
+ "learning_rate": 5.242424242424243e-05,
1293
+ "loss": 0.0725,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 14.137931034482758,
1298
+ "grad_norm": 1.6113288402557373,
1299
+ "learning_rate": 5.166666666666667e-05,
1300
+ "loss": 0.0691,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 14.224137931034482,
1305
+ "grad_norm": 2.479888439178467,
1306
+ "learning_rate": 5.090909090909091e-05,
1307
+ "loss": 0.0979,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 14.310344827586206,
1312
+ "grad_norm": 1.006616473197937,
1313
+ "learning_rate": 5.015151515151515e-05,
1314
+ "loss": 0.0909,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 14.39655172413793,
1319
+ "grad_norm": 1.4571704864501953,
1320
+ "learning_rate": 4.93939393939394e-05,
1321
+ "loss": 0.0761,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 14.482758620689655,
1326
+ "grad_norm": 1.5729875564575195,
1327
+ "learning_rate": 4.863636363636364e-05,
1328
+ "loss": 0.0862,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 14.568965517241379,
1333
+ "grad_norm": 1.2180376052856445,
1334
+ "learning_rate": 4.787878787878788e-05,
1335
+ "loss": 0.0646,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 14.655172413793103,
1340
+ "grad_norm": 1.7464072704315186,
1341
+ "learning_rate": 4.712121212121212e-05,
1342
+ "loss": 0.0741,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 14.655172413793103,
1347
+ "eval_loss": 0.4113341271877289,
1348
+ "eval_runtime": 40.2841,
1349
+ "eval_samples_per_second": 33.363,
1350
+ "eval_steps_per_second": 33.363,
1351
+ "eval_wer": 0.4309387234817445,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 14.741379310344827,
1356
+ "grad_norm": 0.8571386337280273,
1357
+ "learning_rate": 4.6439393939393944e-05,
1358
+ "loss": 0.1315,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 14.827586206896552,
1363
+ "grad_norm": 1.331377387046814,
1364
+ "learning_rate": 4.5681818181818186e-05,
1365
+ "loss": 0.0603,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 14.913793103448276,
1370
+ "grad_norm": 1.5398732423782349,
1371
+ "learning_rate": 4.492424242424242e-05,
1372
+ "loss": 0.0796,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 15.0,
1377
+ "grad_norm": 3.689671754837036,
1378
+ "learning_rate": 4.4166666666666665e-05,
1379
+ "loss": 0.085,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 15.086206896551724,
1384
+ "grad_norm": 1.132613182067871,
1385
+ "learning_rate": 4.340909090909091e-05,
1386
+ "loss": 0.0544,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 15.172413793103448,
1391
+ "grad_norm": 1.5951859951019287,
1392
+ "learning_rate": 4.265151515151515e-05,
1393
+ "loss": 0.0601,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 15.258620689655173,
1398
+ "grad_norm": 0.5179944634437561,
1399
+ "learning_rate": 4.189393939393939e-05,
1400
+ "loss": 0.097,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 15.344827586206897,
1405
+ "grad_norm": 0.9744370579719543,
1406
+ "learning_rate": 4.113636363636364e-05,
1407
+ "loss": 0.0596,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 15.431034482758621,
1412
+ "grad_norm": 1.8794275522232056,
1413
+ "learning_rate": 4.0378787878787885e-05,
1414
+ "loss": 0.0677,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 15.517241379310345,
1419
+ "grad_norm": 0.748386025428772,
1420
+ "learning_rate": 3.962121212121213e-05,
1421
+ "loss": 0.0896,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 15.517241379310345,
1426
+ "eval_loss": 0.43920788168907166,
1427
+ "eval_runtime": 40.1997,
1428
+ "eval_samples_per_second": 33.433,
1429
+ "eval_steps_per_second": 33.433,
1430
+ "eval_wer": 0.4307566238732587,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 15.60344827586207,
1435
+ "grad_norm": 0.9639837145805359,
1436
+ "learning_rate": 3.8863636363636364e-05,
1437
+ "loss": 0.0604,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 15.689655172413794,
1442
+ "grad_norm": 1.9640839099884033,
1443
+ "learning_rate": 3.810606060606061e-05,
1444
+ "loss": 0.0711,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 15.775862068965518,
1449
+ "grad_norm": 1.4438735246658325,
1450
+ "learning_rate": 3.734848484848485e-05,
1451
+ "loss": 0.0867,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 15.862068965517242,
1456
+ "grad_norm": 1.0062426328659058,
1457
+ "learning_rate": 3.659090909090909e-05,
1458
+ "loss": 0.0605,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 15.948275862068966,
1463
+ "grad_norm": 1.6331523656845093,
1464
+ "learning_rate": 3.5833333333333335e-05,
1465
+ "loss": 0.0662,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 16.03448275862069,
1470
+ "grad_norm": 0.8070217370986938,
1471
+ "learning_rate": 3.507575757575758e-05,
1472
+ "loss": 0.0765,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 16.120689655172413,
1477
+ "grad_norm": 1.4137670993804932,
1478
+ "learning_rate": 3.431818181818182e-05,
1479
+ "loss": 0.0537,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 16.20689655172414,
1484
+ "grad_norm": 1.5437769889831543,
1485
+ "learning_rate": 3.356060606060606e-05,
1486
+ "loss": 0.0684,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 16.29310344827586,
1491
+ "grad_norm": 0.90281081199646,
1492
+ "learning_rate": 3.2803030303030305e-05,
1493
+ "loss": 0.0744,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 16.379310344827587,
1498
+ "grad_norm": 1.139837622642517,
1499
+ "learning_rate": 3.204545454545455e-05,
1500
+ "loss": 0.0492,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 16.379310344827587,
1505
+ "eval_loss": 0.4201890528202057,
1506
+ "eval_runtime": 40.1502,
1507
+ "eval_samples_per_second": 33.474,
1508
+ "eval_steps_per_second": 33.474,
1509
+ "eval_wer": 0.4313029226987162,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 16.46551724137931,
1514
+ "grad_norm": 1.679457426071167,
1515
+ "learning_rate": 3.128787878787879e-05,
1516
+ "loss": 0.0652,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 16.551724137931036,
1521
+ "grad_norm": 0.6661111116409302,
1522
+ "learning_rate": 3.0530303030303034e-05,
1523
+ "loss": 0.0649,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 16.637931034482758,
1528
+ "grad_norm": 1.1774355173110962,
1529
+ "learning_rate": 2.9772727272727273e-05,
1530
+ "loss": 0.0469,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 16.724137931034484,
1535
+ "grad_norm": 1.783923864364624,
1536
+ "learning_rate": 2.901515151515152e-05,
1537
+ "loss": 0.0752,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 16.810344827586206,
1542
+ "grad_norm": 1.176321268081665,
1543
+ "learning_rate": 2.825757575757576e-05,
1544
+ "loss": 0.0519,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 16.896551724137932,
1549
+ "grad_norm": 1.3150608539581299,
1550
+ "learning_rate": 2.7500000000000004e-05,
1551
+ "loss": 0.0547,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 16.982758620689655,
1556
+ "grad_norm": 0.983769953250885,
1557
+ "learning_rate": 2.674242424242424e-05,
1558
+ "loss": 0.0799,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 17.06896551724138,
1563
+ "grad_norm": 0.996890127658844,
1564
+ "learning_rate": 2.5984848484848483e-05,
1565
+ "loss": 0.0577,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 17.155172413793103,
1570
+ "grad_norm": 2.3034253120422363,
1571
+ "learning_rate": 2.5227272727272726e-05,
1572
+ "loss": 0.0515,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 17.24137931034483,
1577
+ "grad_norm": 3.7528610229492188,
1578
+ "learning_rate": 2.4469696969696972e-05,
1579
+ "loss": 0.0759,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 17.24137931034483,
1584
+ "eval_loss": 0.43480169773101807,
1585
+ "eval_runtime": 40.017,
1586
+ "eval_samples_per_second": 33.586,
1587
+ "eval_steps_per_second": 33.586,
1588
+ "eval_wer": 0.4207411454065374,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 17.32758620689655,
1593
+ "grad_norm": 0.6646668314933777,
1594
+ "learning_rate": 2.3712121212121214e-05,
1595
+ "loss": 0.0419,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 17.413793103448278,
1600
+ "grad_norm": 1.3250740766525269,
1601
+ "learning_rate": 2.2954545454545457e-05,
1602
+ "loss": 0.0595,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 17.5,
1607
+ "grad_norm": 0.8094995021820068,
1608
+ "learning_rate": 2.21969696969697e-05,
1609
+ "loss": 0.0691,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 17.586206896551722,
1614
+ "grad_norm": 0.846946120262146,
1615
+ "learning_rate": 2.143939393939394e-05,
1616
+ "loss": 0.052,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 17.67241379310345,
1621
+ "grad_norm": 1.652417540550232,
1622
+ "learning_rate": 2.0681818181818182e-05,
1623
+ "loss": 0.0565,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 17.75862068965517,
1628
+ "grad_norm": 1.0080279111862183,
1629
+ "learning_rate": 1.9924242424242425e-05,
1630
+ "loss": 0.0745,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 17.844827586206897,
1635
+ "grad_norm": 0.7252691388130188,
1636
+ "learning_rate": 1.9166666666666667e-05,
1637
+ "loss": 0.0513,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 17.93103448275862,
1642
+ "grad_norm": 1.58548903465271,
1643
+ "learning_rate": 1.840909090909091e-05,
1644
+ "loss": 0.055,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 18.017241379310345,
1649
+ "grad_norm": 0.6634634733200073,
1650
+ "learning_rate": 1.7651515151515153e-05,
1651
+ "loss": 0.0658,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 18.103448275862068,
1656
+ "grad_norm": 1.1495524644851685,
1657
+ "learning_rate": 1.6893939393939395e-05,
1658
+ "loss": 0.0406,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 18.103448275862068,
1663
+ "eval_loss": 0.44191813468933105,
1664
+ "eval_runtime": 40.0967,
1665
+ "eval_samples_per_second": 33.519,
1666
+ "eval_steps_per_second": 33.519,
1667
+ "eval_wer": 0.42046799599380863,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 18.189655172413794,
1672
+ "grad_norm": 0.9788354635238647,
1673
+ "learning_rate": 1.6136363636363638e-05,
1674
+ "loss": 0.0381,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 18.275862068965516,
1679
+ "grad_norm": 1.093633770942688,
1680
+ "learning_rate": 1.5378787878787877e-05,
1681
+ "loss": 0.071,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 18.362068965517242,
1686
+ "grad_norm": 0.7164376974105835,
1687
+ "learning_rate": 1.4621212121212122e-05,
1688
+ "loss": 0.0439,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 18.448275862068964,
1693
+ "grad_norm": 0.9887032508850098,
1694
+ "learning_rate": 1.3863636363636364e-05,
1695
+ "loss": 0.0481,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 18.53448275862069,
1700
+ "grad_norm": 0.45052286982536316,
1701
+ "learning_rate": 1.3106060606060607e-05,
1702
+ "loss": 0.0571,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 18.620689655172413,
1707
+ "grad_norm": 1.167181134223938,
1708
+ "learning_rate": 1.234848484848485e-05,
1709
+ "loss": 0.0452,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 18.70689655172414,
1714
+ "grad_norm": 1.378661870956421,
1715
+ "learning_rate": 1.159090909090909e-05,
1716
+ "loss": 0.0643,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 18.79310344827586,
1721
+ "grad_norm": 0.854932963848114,
1722
+ "learning_rate": 1.0833333333333334e-05,
1723
+ "loss": 0.0587,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 18.879310344827587,
1728
+ "grad_norm": 0.8007526397705078,
1729
+ "learning_rate": 1.0075757575757576e-05,
1730
+ "loss": 0.0395,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 18.96551724137931,
1735
+ "grad_norm": 3.317830801010132,
1736
+ "learning_rate": 9.318181818181819e-06,
1737
+ "loss": 0.074,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 18.96551724137931,
1742
+ "eval_loss": 0.43061742186546326,
1743
+ "eval_runtime": 40.0034,
1744
+ "eval_samples_per_second": 33.597,
1745
+ "eval_steps_per_second": 33.597,
1746
+ "eval_wer": 0.420012746972594,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 19.051724137931036,
1751
+ "grad_norm": 0.7710875272750854,
1752
+ "learning_rate": 8.56060606060606e-06,
1753
+ "loss": 0.046,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 19.137931034482758,
1758
+ "grad_norm": 0.5200530886650085,
1759
+ "learning_rate": 7.803030303030304e-06,
1760
+ "loss": 0.0394,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 19.224137931034484,
1765
+ "grad_norm": 1.3544327020645142,
1766
+ "learning_rate": 7.045454545454545e-06,
1767
+ "loss": 0.0582,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 19.310344827586206,
1772
+ "grad_norm": 0.8653574585914612,
1773
+ "learning_rate": 6.287878787878789e-06,
1774
+ "loss": 0.0606,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 19.396551724137932,
1779
+ "grad_norm": 1.5852700471878052,
1780
+ "learning_rate": 5.530303030303031e-06,
1781
+ "loss": 0.0367,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 19.482758620689655,
1786
+ "grad_norm": 2.2167246341705322,
1787
+ "learning_rate": 4.772727272727273e-06,
1788
+ "loss": 0.0782,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 19.56896551724138,
1793
+ "grad_norm": 0.5891330242156982,
1794
+ "learning_rate": 4.015151515151515e-06,
1795
+ "loss": 0.0416,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 19.655172413793103,
1800
+ "grad_norm": 1.1137330532073975,
1801
+ "learning_rate": 3.257575757575758e-06,
1802
+ "loss": 0.0515,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 19.74137931034483,
1807
+ "grad_norm": 0.8132285475730896,
1808
+ "learning_rate": 2.5e-06,
1809
+ "loss": 0.0512,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 19.82758620689655,
1814
+ "grad_norm": 0.7994781136512756,
1815
+ "learning_rate": 1.7424242424242427e-06,
1816
+ "loss": 0.0378,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 19.82758620689655,
1821
+ "eval_loss": 0.4273350238800049,
1822
+ "eval_runtime": 40.0934,
1823
+ "eval_samples_per_second": 33.522,
1824
+ "eval_steps_per_second": 33.522,
1825
+ "eval_wer": 0.41728125284530637,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 19.913793103448278,
1830
+ "grad_norm": 0.9775754809379578,
1831
+ "learning_rate": 9.848484848484847e-07,
1832
+ "loss": 0.0489,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 20.0,
1837
+ "grad_norm": 0.8857516050338745,
1838
+ "learning_rate": 2.2727272727272726e-07,
1839
+ "loss": 0.0554,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 20.0,
1844
+ "step": 2320,
1845
+ "total_flos": 2.1476719263248095e+18,
1846
+ "train_loss": 0.8618391515622879,
1847
+ "train_runtime": 3159.4128,
1848
+ "train_samples_per_second": 23.397,
1849
+ "train_steps_per_second": 0.734
1850
+ }
1851
+ ],
1852
+ "logging_steps": 10,
1853
+ "max_steps": 2320,
1854
+ "num_input_tokens_seen": 0,
1855
+ "num_train_epochs": 20,
1856
+ "save_steps": 400,
1857
+ "stateful_callbacks": {
1858
+ "TrainerControl": {
1859
+ "args": {
1860
+ "should_epoch_stop": false,
1861
+ "should_evaluate": false,
1862
+ "should_log": false,
1863
+ "should_save": true,
1864
+ "should_training_stop": false
1865
+ },
1866
+ "attributes": {}
1867
+ }
1868
+ },
1869
+ "total_flos": 2.1476719263248095e+18,
1870
+ "train_batch_size": 32,
1871
+ "trial_name": null,
1872
+ "trial_params": null
1873
+ }
wav2vec2-base-timit-fine-tuned./training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abed99ebdf15c43d2882e1b9d49f7e81da386dc7c0be97a54f7bddbea730415d
+ size 5112
wav2vec2-base-timit-fine-tuned./vocab.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "[PAD]": 28,
+ "[UNK]": 27,
+ "a": 1,
+ "b": 2,
+ "c": 3,
+ "d": 4,
+ "e": 5,
+ "f": 6,
+ "g": 7,
+ "h": 8,
+ "i": 9,
+ "j": 10,
+ "k": 11,
+ "l": 12,
+ "m": 13,
+ "n": 14,
+ "o": 15,
+ "p": 16,
+ "q": 17,
+ "r": 18,
+ "s": 19,
+ "t": 20,
+ "u": 21,
+ "v": 22,
+ "w": 23,
+ "x": 24,
+ "y": 25,
+ "z": 26,
+ "|": 0
+ }
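This 31-entry character vocabulary (26 letters, `|` as the word delimiter, `[UNK]`, `[PAD]`, plus the `<s>`/`</s>` ids in added_tokens.json) is what `"vocab_size": 31` in config.json refers to. A minimal sketch of loading it into a CTC tokenizer, assuming a local copy of this vocab.json:

```python
# Minimal sketch, assuming vocab.json from this repo has been downloaded
# to the working directory; token settings mirror the files shown above.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

ids = tokenizer("hello world").input_ids  # spaces map to "|" (id 0)
print(ids)
print(tokenizer.decode(ids))  # "hello world"
```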
wav2vec2-base-timit-fine-tuned/README.md ADDED
@@ -0,0 +1,101 @@
+ ---
+ license: apache-2.0
+ base_model: facebook/wav2vec2-base
+ tags:
+ - automatic-speech-recognition
+ - timit_asr
+ - generated_from_trainer
+ datasets:
+ - timit_asr
+ metrics:
+ - wer
+ model-index:
+ - name: wav2vec2-base-timit-fine-tuned
+ results:
+ - task:
+ name: Automatic Speech Recognition
+ type: automatic-speech-recognition
+ dataset:
+ name: TIMIT_ASR - NA
+ type: timit_asr
+ config: clean
+ split: test
+ args: 'Config: na, Training split: train, Eval split: test'
+ metrics:
+ - name: Wer
+ type: wer
+ value: 0.4090867704634435
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # wav2vec2-base-timit-fine-tuned
+
+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4218
+ - Wer: 0.4091
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 32
+ - eval_batch_size: 1
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 1000
+ - num_epochs: 20.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Wer |
+ |:-------------:|:-------:|:----:|:---------------:|:------:|
+ | 3.1612 | 0.8621 | 100 | 3.1181 | 1.0 |
+ | 2.978 | 1.7241 | 200 | 2.9722 | 1.0 |
+ | 2.9185 | 2.5862 | 300 | 2.9098 | 1.0 |
+ | 2.1282 | 3.4483 | 400 | 2.0066 | 1.0247 |
+ | 1.1234 | 4.3103 | 500 | 1.0197 | 0.8393 |
+ | 0.602 | 5.1724 | 600 | 0.6714 | 0.6600 |
+ | 0.5032 | 6.0345 | 700 | 0.5285 | 0.5659 |
+ | 0.3101 | 6.8966 | 800 | 0.4819 | 0.5282 |
+ | 0.3432 | 7.7586 | 900 | 0.4653 | 0.5272 |
+ | 0.1922 | 8.6207 | 1000 | 0.4672 | 0.4918 |
+ | 0.2284 | 9.4828 | 1100 | 0.4834 | 0.4870 |
+ | 0.1372 | 10.3448 | 1200 | 0.4380 | 0.4727 |
+ | 0.1105 | 11.2069 | 1300 | 0.4509 | 0.4594 |
+ | 0.0992 | 12.0690 | 1400 | 0.4196 | 0.4544 |
+ | 0.1226 | 12.9310 | 1500 | 0.4237 | 0.4321 |
+ | 0.1013 | 13.7931 | 1600 | 0.4113 | 0.4298 |
+ | 0.0661 | 14.6552 | 1700 | 0.4038 | 0.4276 |
+ | 0.0901 | 15.5172 | 1800 | 0.4321 | 0.4225 |
+ | 0.053 | 16.3793 | 1900 | 0.4076 | 0.4236 |
+ | 0.0805 | 17.2414 | 2000 | 0.4336 | 0.4156 |
+ | 0.049 | 18.1034 | 2100 | 0.4193 | 0.4114 |
+ | 0.0717 | 18.9655 | 2200 | 0.4139 | 0.4091 |
+ | 0.0389 | 19.8276 | 2300 | 0.4216 | 0.4087 |
+
+
+ ### Framework versions
+
+ - Transformers 4.42.0.dev0
+ - Pytorch 2.3.0a0+git71dd2de
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
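For reference alongside the model card, a minimal greedy-decoding sketch (no language model; the local checkpoint directory is the one added in this commit, and sample.wav is a hypothetical 16 kHz mono recording):

```python
import torch
import soundfile as sf
from transformers import AutoProcessor, Wav2Vec2ForCTC

processor = AutoProcessor.from_pretrained("wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-base-timit-fine-tuned").eval()

speech, sampling_rate = sf.read("sample.wav")  # hypothetical test clip, 16 kHz mono
inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits            # (batch, time, vocab_size=31)

pred_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(pred_ids))        # greedy CTC decode, no LM rescoring
```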
wav2vec2-base-timit-fine-tuned/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "</s>": 30,
+ "<s>": 29
+ }
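The two added tokens sit on top of the 29 base entries in vocab.json; a quick check (assuming the checkpoint directory from this commit):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("wav2vec2-base-timit-fine-tuned")
# <s> and </s> were appended after the base vocab (ids 0-28), so they land at 29 and 30.
print(tok.convert_tokens_to_ids(["<s>", "</s>", "[PAD]"]))  # expected: [29, 30, 28]
```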
wav2vec2-base-timit-fine-tuned/all_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "epoch": 20.0,
+ "eval_loss": 0.42176610231399536,
+ "eval_runtime": 39.428,
+ "eval_samples": 1344,
+ "eval_samples_per_second": 34.087,
+ "eval_steps_per_second": 34.087,
+ "eval_wer": 0.4090867704634435,
+ "total_flos": 2.1476719263248095e+18,
+ "train_loss": 0.8590125822430027,
+ "train_runtime": 3151.1477,
+ "train_samples": 3696,
+ "train_samples_per_second": 23.458,
+ "train_steps_per_second": 0.736
+ }
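The eval_wer value above is a standard word error rate; it can be recomputed with the evaluate library given reference and predicted transcripts (toy strings below, just to show the call):

```python
import evaluate

wer = evaluate.load("wer")
predictions = ["she had your dark suit in greasy wash water"]
references = ["she had your dark suit in greasy wash water all year"]
print(wer.compute(predictions=predictions, references=references))  # 2 deletions / 10 words = 0.2
```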
wav2vec2-base-timit-fine-tuned/config.json ADDED
@@ -0,0 +1,119 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-base",
+ "activation_dropout": 0.0,
+ "adapter_attn_dim": null,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 256,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": false,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": false,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_norm": "group",
+ "feat_proj_dropout": 0.0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "freeze_feat_extract_train": true,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.0,
+ "mask_channel_length": 10,
+ "mask_channel_min_space": 1,
+ "mask_channel_other": 0.0,
+ "mask_channel_prob": 0.0,
+ "mask_channel_selection": "static",
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_min_space": 1,
+ "mask_time_other": 0.0,
+ "mask_time_prob": 0.05,
+ "mask_time_selection": "static",
+ "model_type": "wav2vec2",
+ "no_mask_channel_overlap": false,
+ "no_mask_time_overlap": false,
+ "num_adapter_layers": 3,
+ "num_attention_heads": 12,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 12,
+ "num_negatives": 100,
+ "output_hidden_size": 768,
+ "pad_token_id": 28,
+ "proj_codevector_dim": 256,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.42.0.dev0",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 31,
+ "xvector_output_dim": 512
+ }
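The config above fully describes the architecture (12 transformer layers, hidden size 768, a 31-symbol CTC head). A sketch of instantiating it, assuming the checkpoint directory added in this commit; this builds randomly initialized weights, while from_pretrained on the same directory loads the fine-tuned ones:

```python
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config.from_pretrained("wav2vec2-base-timit-fine-tuned")
model = Wav2Vec2ForCTC(config)  # random init with the exact shapes defined by config.json
print(round(sum(p.numel() for p in model.parameters()) / 1e6, 1), "M parameters")
```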