|
loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8 |
|
/home/cahya/Work/MachineLearning/transformers/src/transformers/configuration_utils.py:353: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. |
|
warnings.warn( |
|
Model config Wav2Vec2Config { |
|
"_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv", |
|
"activation_dropout": 0.055, |
|
"adapter_kernel_size": 3, |
|
"adapter_stride": 2, |
|
"add_adapter": false, |
|
"apply_spec_augment": true, |
|
"architectures": [ |
|
"Wav2Vec2ForCTC" |
|
], |
|
"attention_dropout": 0.094, |
|
"bos_token_id": 1, |
|
"classifier_proj_size": 256, |
|
"codevector_dim": 256, |
|
"contrastive_logits_temperature": 0.1, |
|
"conv_bias": false, |
|
"conv_dim": [ |
|
512, |
|
512, |
|
512, |
|
512, |
|
512, |
|
512, |
|
512 |
|
], |
|
"conv_kernel": [ |
|
10, |
|
3, |
|
3, |
|
3, |
|
3, |
|
2, |
|
2 |
|
], |
|
"conv_stride": [ |
|
5, |
|
2, |
|
2, |
|
2, |
|
2, |
|
2, |
|
2 |
|
], |
|
"ctc_loss_reduction": "mean", |
|
"ctc_zero_infinity": true, |
|
"diversity_loss_weight": 0.1, |
|
"do_stable_layer_norm": false, |
|
"eos_token_id": 2, |
|
"feat_extract_activation": "gelu", |
|
"feat_extract_norm": "group", |
|
"feat_proj_dropout": 0.04, |
|
"feat_quantizer_dropout": 0.0, |
|
"final_dropout": 0.1, |
|
"gradient_checkpointing": true, |
|
"hidden_act": "gelu", |
|
"hidden_dropout": 0.047, |
|
"hidden_size": 768, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 3072, |
|
"layer_norm_eps": 1e-05, |
|
"layerdrop": 0.041, |
|
"mask_feature_length": 10, |
|
"mask_feature_min_masks": 0, |
|
"mask_feature_prob": 0.0, |
|
"mask_time_length": 10, |
|
"mask_time_min_masks": 2, |
|
"mask_time_prob": 0.4, |
|
"model_type": "wav2vec2", |
|
"num_adapter_layers": 3, |
|
"num_attention_heads": 12, |
|
"num_codevector_groups": 2, |
|
"num_codevectors_per_group": 320, |
|
"num_conv_pos_embedding_groups": 16, |
|
"num_conv_pos_embeddings": 128, |
|
"num_feat_extract_layers": 7, |
|
"num_hidden_layers": 12, |
|
"num_negatives": 100, |
|
"output_hidden_size": 768, |
|
"pad_token_id": 39, |
|
"proj_codevector_dim": 256, |
|
"tdnn_dilation": [ |
|
1, |
|
2, |
|
3, |
|
1, |
|
1 |
|
], |
|
"tdnn_dim": [ |
|
512, |
|
512, |
|
512, |
|
512, |
|
1500 |
|
], |
|
"tdnn_kernel": [ |
|
5, |
|
3, |
|
3, |
|
1, |
|
1 |
|
], |
|
"transformers_version": "4.17.0.dev0", |
|
"use_weighted_layer_sum": false, |
|
"vocab_size": 40, |
|
"xvector_output_dim": 512 |
|
} |
|
|
|
0%| | 0/1 [00:00<?, ?ba/s]
100%|██████████| 1/1 [00:00<00:00, 9.35ba/s]
100%|██████████| 1/1 [00:00<00:00, 9.34ba/s] |
|
0%| | 0/1 [00:00<?, ?ba/s]
100%|██████████| 1/1 [00:00<00:00, 25.43ba/s] |
|
Didn't find file ./output/tokenizer_config.json. We won't load it. |
|
Didn't find file ./output/added_tokens.json. We won't load it. |
|
Didn't find file ./output/special_tokens_map.json. We won't load it. |
|
Didn't find file ./output/tokenizer.json. We won't load it. |
|
loading file ./output/vocab.json |
|
loading file None |
|
loading file None |
|
loading file None |
|
loading file None |
|
file ./output/config.json not found |
|
Adding <s> to the vocabulary |
|
Adding </s> to the vocabulary |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8 |
|
Model config Wav2Vec2Config { |
|
"_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv", |
|
"activation_dropout": 0.055, |
|
"adapter_kernel_size": 3, |
|
"adapter_stride": 2, |
|
"add_adapter": false, |
|
"apply_spec_augment": true, |
|
"architectures": [ |
|
"Wav2Vec2ForCTC" |
|
], |
|
"attention_dropout": 0.094, |
|
"bos_token_id": 1, |
|
"classifier_proj_size": 256, |
|
"codevector_dim": 256, |
|
"contrastive_logits_temperature": 0.1, |
|
"conv_bias": false, |
|
"conv_dim": [ |
|
512, |
|
512, |
|
512, |
|
512, |
|
512, |
|
512, |
|
512 |
|
], |
|
"conv_kernel": [ |
|
10, |
|
3, |
|
3, |
|
3, |
|
3, |
|
2, |
|
2 |
|
], |
|
"conv_stride": [ |
|
5, |
|
2, |
|
2, |
|
2, |
|
2, |
|
2, |
|
2 |
|
], |
|
"ctc_loss_reduction": "mean", |
|
"ctc_zero_infinity": true, |
|
"diversity_loss_weight": 0.1, |
|
"do_stable_layer_norm": false, |
|
"eos_token_id": 2, |
|
"feat_extract_activation": "gelu", |
|
"feat_extract_norm": "group", |
|
"feat_proj_dropout": 0.04, |
|
"feat_quantizer_dropout": 0.0, |
|
"final_dropout": 0.1, |
|
"gradient_checkpointing": true, |
|
"hidden_act": "gelu", |
|
"hidden_dropout": 0.047, |
|
"hidden_size": 768, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 3072, |
|
"layer_norm_eps": 1e-05, |
|
"layerdrop": 0.041, |
|
"mask_feature_length": 10, |
|
"mask_feature_min_masks": 0, |
|
"mask_feature_prob": 0.0, |
|
"mask_time_length": 10, |
|
"mask_time_min_masks": 2, |
|
"mask_time_prob": 0.4, |
|
"model_type": "wav2vec2", |
|
"num_adapter_layers": 3, |
|
"num_attention_heads": 12, |
|
"num_codevector_groups": 2, |
|
"num_codevectors_per_group": 320, |
|
"num_conv_pos_embedding_groups": 16, |
|
"num_conv_pos_embeddings": 128, |
|
"num_feat_extract_layers": 7, |
|
"num_hidden_layers": 12, |
|
"num_negatives": 100, |
|
"output_hidden_size": 768, |
|
"pad_token_id": 39, |
|
"proj_codevector_dim": 256, |
|
"tdnn_dilation": [ |
|
1, |
|
2, |
|
3, |
|
1, |
|
1 |
|
], |
|
"tdnn_dim": [ |
|
512, |
|
512, |
|
512, |
|
512, |
|
1500 |
|
], |
|
"tdnn_kernel": [ |
|
5, |
|
3, |
|
3, |
|
1, |
|
1 |
|
], |
|
"transformers_version": "4.17.0.dev0", |
|
"use_weighted_layer_sum": false, |
|
"vocab_size": 40, |
|
"xvector_output_dim": 512 |
|
} |
|
|
|
loading feature extractor configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/preprocessor_config.json from cache at /home/cahya/.cache/huggingface/transformers/34433162acde7e1ca4a265d8ae309442e4ddadff37e6e37d2d37eb7133f65f8f.fcd266b775b7f33ba9b607a0fee7cc615aeb2eb281586f046280492ea380ae23 |
|
Feature extractor Wav2Vec2FeatureExtractor { |
|
"do_normalize": true, |
|
"feature_extractor_type": "Wav2Vec2FeatureExtractor", |
|
"feature_size": 1, |
|
"padding_side": "right", |
|
"padding_value": 0.0, |
|
"return_attention_mask": true, |
|
"sampling_rate": 16000 |
|
} |
|
|
|
loading weights file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/pytorch_model.bin from cache at /home/cahya/.cache/huggingface/transformers/3b3f7d0041c2b08b031c8357e39249bdbc06c8bfcd5a9f8891c7f259b07a0b85.356b4eec0d55a5c4d2d480c2dd2ea2cc0c867771bc39b8cdc97b629e4206482c |
|
Traceback (most recent call last): |
|
File "run_speech_recognition_ctc.py", line 745, in <module> |
|
main() |
|
File "run_speech_recognition_ctc.py", line 552, in main |
|
model = AutoModelForCTC.from_pretrained( |
|
File "/home/cahya/Work/MachineLearning/transformers/src/transformers/models/auto/auto_factory.py", line 447, in from_pretrained |
|
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) |
|
File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1528, in from_pretrained |
|
model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_state_dict_into_model( |
|
File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1682, in _load_state_dict_into_model |
|
raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") |
|
RuntimeError: Error(s) in loading state_dict for Wav2Vec2ForCTC: |
|
size mismatch for lm_head.weight: copying a param with shape torch.Size([40, 768]) from checkpoint, the shape in current model is torch.Size([41, 768]). |
|
size mismatch for lm_head.bias: copying a param with shape torch.Size([40]) from checkpoint, the shape in current model is torch.Size([41]). |
|
|