patrickvonplaten committed on
Commit • d1e51fe
1 Parent(s): 1cf2986
ready for training

Files changed:
- README.md +0 -57
- added_tokens.json +0 -4
- all_results.json +0 -14
- config.json +0 -117
- eval_results.json +0 -9
- preprocessor_config.json +0 -9
- pytorch_model.bin +0 -3
- run.sh +12 -7
- special_tokens_map.json +0 -22
- tokenizer_config.json +0 -12
- train_results.json +0 -8
- trainer_state.json +0 -25
- training_args.bin +0 -3
- vocab.json +0 -48
README.md
DELETED
@@ -1,57 +0,0 @@
- ---
- license: apache-2.0
- tags:
- - generated_from_trainer
- datasets:
- - common_voice
- model-index:
- - name: ''
-   results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- #
-
- This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the common_voice dataset.
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 0.0003
- - train_batch_size: 16
- - eval_batch_size: 8
- - seed: 42
- - gradient_accumulation_steps: 2
- - total_train_batch_size: 32
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- - lr_scheduler_type: linear
- - lr_scheduler_warmup_steps: 500
- - num_epochs: 15.0
- - mixed_precision_training: Native AMP
-
- ### Training results
-
-
-
- ### Framework versions
-
- - Transformers 4.22.0.dev0
- - Pytorch 1.11.0+cu113
- - Datasets 2.4.0
- - Tokenizers 0.12.1
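For reference, the hyperparameters listed in the deleted model card map roughly onto the `TrainingArguments` below. This is an illustrative sketch only: the values are transcribed from the card, everything not listed there is left at its Transformers default, and it is not the exact object that was serialized into training_args.bin.

```python
from transformers import TrainingArguments

# Sketch of the training configuration described in the deleted model card.
# Values come from the "Training hyperparameters" list; all other settings
# are left at their Transformers defaults.
training_args = TrainingArguments(
    output_dir="./",                    # run.sh also uses --output_dir="./"
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    seed=42,
    gradient_accumulation_steps=2,      # 16 x 2 = total train batch size 32
    lr_scheduler_type="linear",
    warmup_steps=500,
    num_train_epochs=15.0,
    fp16=True,                          # "Native AMP" mixed precision
)
```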
added_tokens.json
DELETED
@@ -1,4 +0,0 @@
- {
-   "</s>": 47,
-   "<s>": 46
- }
all_results.json
DELETED
@@ -1,14 +0,0 @@
- {
-   "epoch": 15.0,
-   "eval_loss": 19.86508560180664,
-   "eval_runtime": 0.4623,
-   "eval_samples": 9,
-   "eval_samples_per_second": 19.467,
-   "eval_steps_per_second": 4.326,
-   "eval_wer": 1.0,
-   "train_loss": 24.409126790364585,
-   "train_runtime": 20.8017,
-   "train_samples": 22,
-   "train_samples_per_second": 15.864,
-   "train_steps_per_second": 0.721
- }
config.json
DELETED
@@ -1,117 +0,0 @@
- {
-   "_name_or_path": "facebook/wav2vec2-base",
-   "activation_dropout": 0.0,
-   "adapter_kernel_size": 3,
-   "adapter_stride": 2,
-   "add_adapter": false,
-   "apply_spec_augment": true,
-   "architectures": [
-     "Wav2Vec2ForCTC"
-   ],
-   "attention_dropout": 0.0,
-   "bos_token_id": 1,
-   "classifier_proj_size": 256,
-   "codevector_dim": 256,
-   "contrastive_logits_temperature": 0.1,
-   "conv_bias": false,
-   "conv_dim": [
-     512,
-     512,
-     512,
-     512,
-     512,
-     512,
-     512
-   ],
-   "conv_kernel": [
-     10,
-     3,
-     3,
-     3,
-     3,
-     2,
-     2
-   ],
-   "conv_stride": [
-     5,
-     2,
-     2,
-     2,
-     2,
-     2,
-     2
-   ],
-   "ctc_loss_reduction": "mean",
-   "ctc_zero_infinity": false,
-   "diversity_loss_weight": 0.1,
-   "do_stable_layer_norm": false,
-   "eos_token_id": 2,
-   "feat_extract_activation": "gelu",
-   "feat_extract_norm": "group",
-   "feat_proj_dropout": 0.0,
-   "feat_quantizer_dropout": 0.0,
-   "final_dropout": 0.0,
-   "freeze_feat_extract_train": true,
-   "hidden_act": "gelu",
-   "hidden_dropout": 0.0,
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "layer_norm_eps": 1e-05,
-   "layerdrop": 0.0,
-   "mask_channel_length": 10,
-   "mask_channel_min_space": 1,
-   "mask_channel_other": 0.0,
-   "mask_channel_prob": 0.0,
-   "mask_channel_selection": "static",
-   "mask_feature_length": 10,
-   "mask_feature_min_masks": 0,
-   "mask_feature_prob": 0.0,
-   "mask_time_length": 10,
-   "mask_time_min_masks": 2,
-   "mask_time_min_space": 1,
-   "mask_time_other": 0.0,
-   "mask_time_prob": 0.05,
-   "mask_time_selection": "static",
-   "model_type": "wav2vec2",
-   "no_mask_channel_overlap": false,
-   "no_mask_time_overlap": false,
-   "num_adapter_layers": 3,
-   "num_attention_heads": 12,
-   "num_codevector_groups": 2,
-   "num_codevectors_per_group": 320,
-   "num_conv_pos_embedding_groups": 16,
-   "num_conv_pos_embeddings": 128,
-   "num_feat_extract_layers": 7,
-   "num_hidden_layers": 12,
-   "num_negatives": 100,
-   "output_hidden_size": 768,
-   "pad_token_id": 45,
-   "proj_codevector_dim": 256,
-   "tdnn_dilation": [
-     1,
-     2,
-     3,
-     1,
-     1
-   ],
-   "tdnn_dim": [
-     512,
-     512,
-     512,
-     512,
-     1500
-   ],
-   "tdnn_kernel": [
-     5,
-     3,
-     3,
-     1,
-     1
-   ],
-   "torch_dtype": "float32",
-   "transformers_version": "4.22.0.dev0",
-   "use_weighted_layer_sum": false,
-   "vocab_size": 48,
-   "xvector_output_dim": 512
- }
eval_results.json
DELETED
@@ -1,9 +0,0 @@
- {
-   "epoch": 15.0,
-   "eval_loss": 19.86508560180664,
-   "eval_runtime": 0.4623,
-   "eval_samples": 9,
-   "eval_samples_per_second": 19.467,
-   "eval_steps_per_second": 4.326,
-   "eval_wer": 1.0
- }
preprocessor_config.json
DELETED
@@ -1,9 +0,0 @@
- {
-   "do_normalize": true,
-   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
-   "feature_size": 1,
-   "padding_side": "right",
-   "padding_value": 0.0,
-   "return_attention_mask": false,
-   "sampling_rate": 16000
- }
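As a side note, the deleted preprocessor_config.json describes a stock Wav2Vec2 feature extractor; an equivalent object could be built directly in Python. This is a sketch mirroring the JSON fields above, not code shipped with this repository.

```python
from transformers import Wav2Vec2FeatureExtractor

# Mirrors the fields of the deleted preprocessor_config.json.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
```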
pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b7020e5b6d127c2a75362883d75ca0d7ddafb6744cd555006ae55e73c762ae60
- size 377706071
run.sh
CHANGED
@@ -1,23 +1,28 @@
  #!/usr/bin/env bash
  CUDA_VISIBLE_DEVICES="0" python run_speech_recognition_ctc.py \
- --dataset_name="
- --model_name_or_path="facebook/wav2vec2-
- --dataset_config_name="
+ --dataset_name="/home/patrick_huggingface_co/ami-kaldi-chunked-wav2vec2-base" \
+ --model_name_or_path="facebook/wav2vec2-large-lv60" \
+ --dataset_config_name="ihm" \
+ --train_split_name="train" \
+ --eval_split_name="validation" \
  --output_dir="./" \
+ --preprocessing_num_workers="16" \
  --overwrite_output_dir \
- --num_train_epochs="
+ --num_train_epochs="3" \
  --per_device_train_batch_size="32" \
+ --gradient_accumulation_steps="1" \
  --learning_rate="3e-4" \
  --warmup_steps="500" \
  --evaluation_strategy="steps" \
- --text_column_name="
- --length_column_name="input_length" \
+ --text_column_name="text" \
  --save_steps="400" \
  --eval_steps="100" \
+ --logging_steps="1" \
  --layerdrop="0.0" \
  --save_total_limit="3" \
- --
+ --freeze_feature_extractor \
  --gradient_checkpointing \
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \
  --fp16 \
  --group_by_length \
  --push_to_hub \
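Once the updated run.sh has finished and written a checkpoint to the output directory, the resulting CTC model could typically be used for inference along these lines. This is a sketch only: it assumes the standard Wav2Vec2ForCTC checkpoint layout produced by run_speech_recognition_ctc.py, and the local path is a placeholder for wherever the checkpoint ends up.

```python
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

checkpoint = "./"  # placeholder: the --output_dir used in run.sh

processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

def transcribe(speech):
    """Greedy CTC decoding of a 16 kHz mono waveform (1-D float array)."""
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
```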
special_tokens_map.json
DELETED
@@ -1,22 +0,0 @@
- {
-   "additional_special_tokens": [
-     {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false
-     }
-   ],
-   "bos_token": "<s>",
-   "eos_token": "</s>",
-   "pad_token": "[PAD]",
-   "unk_token": "[UNK]"
- }
tokenizer_config.json
DELETED
@@ -1,12 +0,0 @@
- {
-   "bos_token": "<s>",
-   "do_lower_case": false,
-   "eos_token": "</s>",
-   "name_or_path": "./",
-   "pad_token": "[PAD]",
-   "replace_word_delimiter_char": " ",
-   "special_tokens_map_file": null,
-   "tokenizer_class": "Wav2Vec2CTCTokenizer",
-   "unk_token": "[UNK]",
-   "word_delimiter_token": "|"
- }
train_results.json
DELETED
@@ -1,8 +0,0 @@
- {
-   "epoch": 15.0,
-   "train_loss": 24.409126790364585,
-   "train_runtime": 20.8017,
-   "train_samples": 22,
-   "train_samples_per_second": 15.864,
-   "train_steps_per_second": 0.721
- }
trainer_state.json
DELETED
@@ -1,25 +0,0 @@
- {
-   "best_metric": null,
-   "best_model_checkpoint": null,
-   "epoch": 15.0,
-   "global_step": 15,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 15.0,
-       "step": 15,
-       "total_flos": 2.629133656215552e+16,
-       "train_loss": 24.409126790364585,
-       "train_runtime": 20.8017,
-       "train_samples_per_second": 15.864,
-       "train_steps_per_second": 0.721
-     }
-   ],
-   "max_steps": 15,
-   "num_train_epochs": 15,
-   "total_flos": 2.629133656215552e+16,
-   "trial_name": null,
-   "trial_params": null
- }
training_args.bin
DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0a609728264d711b88677fd490926ca3e18b3681ec912259bcc35a92d12c4366
- size 3311
vocab.json
DELETED
@@ -1,48 +0,0 @@
- {
-   "!": 1,
-   ",": 2,
-   "-": 3,
-   ".": 4,
-   ":": 5,
-   "?": 6,
-   "[PAD]": 45,
-   "[UNK]": 44,
-   "|": 0,
-   "а": 7,
-   "б": 8,
-   "в": 9,
-   "г": 10,
-   "д": 11,
-   "е": 12,
-   "ж": 13,
-   "з": 14,
-   "и": 15,
-   "к": 16,
-   "л": 17,
-   "м": 18,
-   "н": 19,
-   "о": 20,
-   "п": 21,
-   "р": 22,
-   "с": 23,
-   "т": 24,
-   "у": 25,
-   "ф": 26,
-   "х": 27,
-   "ц": 28,
-   "ш": 29,
-   "ы": 30,
-   "ь": 31,
-   "қ": 32,
-   "ҟ": 33,
-   "ҩ": 34,
-   "ҭ": 35,
-   "ҳ": 36,
-   "ҵ": 37,
-   "ҽ": 38,
-   "ҿ": 39,
-   "ә": 40,
-   "ӡ": 41,
-   "ӷ": 42,
-   "ԥ": 43
- }