patrickvonplaten committed on
Commit
d1e51fe
1 Parent(s): 1cf2986

ready for training

Browse files
README.md DELETED
@@ -1,57 +0,0 @@
1
- ---
2
- license: apache-2.0
3
- tags:
4
- - generated_from_trainer
5
- datasets:
6
- - common_voice
7
- model-index:
8
- - name: ''
9
- results: []
10
- ---
11
-
12
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
- should probably proofread and complete it, then remove this comment. -->
14
-
15
- #
16
-
17
- This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the common_voice dataset.
18
-
19
- ## Model description
20
-
21
- More information needed
22
-
23
- ## Intended uses & limitations
24
-
25
- More information needed
26
-
27
- ## Training and evaluation data
28
-
29
- More information needed
30
-
31
- ## Training procedure
32
-
33
- ### Training hyperparameters
34
-
35
- The following hyperparameters were used during training:
36
- - learning_rate: 0.0003
37
- - train_batch_size: 16
38
- - eval_batch_size: 8
39
- - seed: 42
40
- - gradient_accumulation_steps: 2
41
- - total_train_batch_size: 32
42
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
- - lr_scheduler_type: linear
44
- - lr_scheduler_warmup_steps: 500
45
- - num_epochs: 15.0
46
- - mixed_precision_training: Native AMP
47
-
48
- ### Training results
49
-
50
-
51
-
52
- ### Framework versions
53
-
54
- - Transformers 4.22.0.dev0
55
- - Pytorch 1.11.0+cu113
56
- - Datasets 2.4.0
57
- - Tokenizers 0.12.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
added_tokens.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "</s>": 47,
3
- "<s>": 46
4
- }
 
 
 
 
 
all_results.json DELETED
@@ -1,14 +0,0 @@
1
- {
2
- "epoch": 15.0,
3
- "eval_loss": 19.86508560180664,
4
- "eval_runtime": 0.4623,
5
- "eval_samples": 9,
6
- "eval_samples_per_second": 19.467,
7
- "eval_steps_per_second": 4.326,
8
- "eval_wer": 1.0,
9
- "train_loss": 24.409126790364585,
10
- "train_runtime": 20.8017,
11
- "train_samples": 22,
12
- "train_samples_per_second": 15.864,
13
- "train_steps_per_second": 0.721
14
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json DELETED
@@ -1,117 +0,0 @@
1
- {
2
- "_name_or_path": "facebook/wav2vec2-base",
3
- "activation_dropout": 0.0,
4
- "adapter_kernel_size": 3,
5
- "adapter_stride": 2,
6
- "add_adapter": false,
7
- "apply_spec_augment": true,
8
- "architectures": [
9
- "Wav2Vec2ForCTC"
10
- ],
11
- "attention_dropout": 0.0,
12
- "bos_token_id": 1,
13
- "classifier_proj_size": 256,
14
- "codevector_dim": 256,
15
- "contrastive_logits_temperature": 0.1,
16
- "conv_bias": false,
17
- "conv_dim": [
18
- 512,
19
- 512,
20
- 512,
21
- 512,
22
- 512,
23
- 512,
24
- 512
25
- ],
26
- "conv_kernel": [
27
- 10,
28
- 3,
29
- 3,
30
- 3,
31
- 3,
32
- 2,
33
- 2
34
- ],
35
- "conv_stride": [
36
- 5,
37
- 2,
38
- 2,
39
- 2,
40
- 2,
41
- 2,
42
- 2
43
- ],
44
- "ctc_loss_reduction": "mean",
45
- "ctc_zero_infinity": false,
46
- "diversity_loss_weight": 0.1,
47
- "do_stable_layer_norm": false,
48
- "eos_token_id": 2,
49
- "feat_extract_activation": "gelu",
50
- "feat_extract_norm": "group",
51
- "feat_proj_dropout": 0.0,
52
- "feat_quantizer_dropout": 0.0,
53
- "final_dropout": 0.0,
54
- "freeze_feat_extract_train": true,
55
- "hidden_act": "gelu",
56
- "hidden_dropout": 0.0,
57
- "hidden_size": 768,
58
- "initializer_range": 0.02,
59
- "intermediate_size": 3072,
60
- "layer_norm_eps": 1e-05,
61
- "layerdrop": 0.0,
62
- "mask_channel_length": 10,
63
- "mask_channel_min_space": 1,
64
- "mask_channel_other": 0.0,
65
- "mask_channel_prob": 0.0,
66
- "mask_channel_selection": "static",
67
- "mask_feature_length": 10,
68
- "mask_feature_min_masks": 0,
69
- "mask_feature_prob": 0.0,
70
- "mask_time_length": 10,
71
- "mask_time_min_masks": 2,
72
- "mask_time_min_space": 1,
73
- "mask_time_other": 0.0,
74
- "mask_time_prob": 0.05,
75
- "mask_time_selection": "static",
76
- "model_type": "wav2vec2",
77
- "no_mask_channel_overlap": false,
78
- "no_mask_time_overlap": false,
79
- "num_adapter_layers": 3,
80
- "num_attention_heads": 12,
81
- "num_codevector_groups": 2,
82
- "num_codevectors_per_group": 320,
83
- "num_conv_pos_embedding_groups": 16,
84
- "num_conv_pos_embeddings": 128,
85
- "num_feat_extract_layers": 7,
86
- "num_hidden_layers": 12,
87
- "num_negatives": 100,
88
- "output_hidden_size": 768,
89
- "pad_token_id": 45,
90
- "proj_codevector_dim": 256,
91
- "tdnn_dilation": [
92
- 1,
93
- 2,
94
- 3,
95
- 1,
96
- 1
97
- ],
98
- "tdnn_dim": [
99
- 512,
100
- 512,
101
- 512,
102
- 512,
103
- 1500
104
- ],
105
- "tdnn_kernel": [
106
- 5,
107
- 3,
108
- 3,
109
- 1,
110
- 1
111
- ],
112
- "torch_dtype": "float32",
113
- "transformers_version": "4.22.0.dev0",
114
- "use_weighted_layer_sum": false,
115
- "vocab_size": 48,
116
- "xvector_output_dim": 512
117
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "epoch": 15.0,
3
- "eval_loss": 19.86508560180664,
4
- "eval_runtime": 0.4623,
5
- "eval_samples": 9,
6
- "eval_samples_per_second": 19.467,
7
- "eval_steps_per_second": 4.326,
8
- "eval_wer": 1.0
9
- }
 
 
 
 
 
 
 
 
 
 
preprocessor_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "do_normalize": true,
3
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
- "feature_size": 1,
5
- "padding_side": "right",
6
- "padding_value": 0.0,
7
- "return_attention_mask": false,
8
- "sampling_rate": 16000
9
- }
 
 
 
 
 
 
 
 
 
 
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7020e5b6d127c2a75362883d75ca0d7ddafb6744cd555006ae55e73c762ae60
3
- size 377706071
 
 
 
 
run.sh CHANGED
@@ -1,23 +1,28 @@
1
  #!/usr/bin/env bash
2
  CUDA_VISIBLE_DEVICES="0" python run_speech_recognition_ctc.py \
3
- --dataset_name="common_voice" \
4
- --model_name_or_path="facebook/wav2vec2-base" \
5
- --dataset_config_name="ab" \
 
 
6
  --output_dir="./" \
 
7
  --overwrite_output_dir \
8
- --num_train_epochs="15" \
9
  --per_device_train_batch_size="32" \
 
10
  --learning_rate="3e-4" \
11
  --warmup_steps="500" \
12
  --evaluation_strategy="steps" \
13
- --text_column_name="sentence" \
14
- --length_column_name="input_length" \
15
  --save_steps="400" \
16
  --eval_steps="100" \
 
17
  --layerdrop="0.0" \
18
  --save_total_limit="3" \
19
- --freeze_feature_encoder \
20
  --gradient_checkpointing \
 
21
  --fp16 \
22
  --group_by_length \
23
  --push_to_hub \
 
1
  #!/usr/bin/env bash
2
  CUDA_VISIBLE_DEVICES="0" python run_speech_recognition_ctc.py \
3
+ --dataset_name="/home/patrick_huggingface_co/ami-kaldi-chunked-wav2vec2-base" \
4
+ --model_name_or_path="facebook/wav2vec2-large-lv60" \
5
+ --dataset_config_name="ihm" \
6
+ --train_split_name="train" \
7
+ --eval_split_name="validation" \
8
  --output_dir="./" \
9
+ --preprocessing_num_workers="16" \
10
  --overwrite_output_dir \
11
+ --num_train_epochs="3" \
12
  --per_device_train_batch_size="32" \
13
+ --gradient_accumulation_steps="1" \
14
  --learning_rate="3e-4" \
15
  --warmup_steps="500" \
16
  --evaluation_strategy="steps" \
17
+ --text_column_name="text" \
 
18
  --save_steps="400" \
19
  --eval_steps="100" \
20
+ --logging_steps="1" \
21
  --layerdrop="0.0" \
22
  --save_total_limit="3" \
23
+ --freeze_feature_extractor \
24
  --gradient_checkpointing \
25
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \
26
  --fp16 \
27
  --group_by_length \
28
  --push_to_hub \
special_tokens_map.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<s>",
5
- "lstrip": false,
6
- "normalized": true,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "</s>",
12
- "lstrip": false,
13
- "normalized": true,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
- ],
18
- "bos_token": "<s>",
19
- "eos_token": "</s>",
20
- "pad_token": "[PAD]",
21
- "unk_token": "[UNK]"
22
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "bos_token": "<s>",
3
- "do_lower_case": false,
4
- "eos_token": "</s>",
5
- "name_or_path": "./",
6
- "pad_token": "[PAD]",
7
- "replace_word_delimiter_char": " ",
8
- "special_tokens_map_file": null,
9
- "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
- "unk_token": "[UNK]",
11
- "word_delimiter_token": "|"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
train_results.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "epoch": 15.0,
3
- "train_loss": 24.409126790364585,
4
- "train_runtime": 20.8017,
5
- "train_samples": 22,
6
- "train_samples_per_second": 15.864,
7
- "train_steps_per_second": 0.721
8
- }
 
 
 
 
 
 
 
 
 
trainer_state.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 15.0,
5
- "global_step": 15,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 15.0,
12
- "step": 15,
13
- "total_flos": 2.629133656215552e+16,
14
- "train_loss": 24.409126790364585,
15
- "train_runtime": 20.8017,
16
- "train_samples_per_second": 15.864,
17
- "train_steps_per_second": 0.721
18
- }
19
- ],
20
- "max_steps": 15,
21
- "num_train_epochs": 15,
22
- "total_flos": 2.629133656215552e+16,
23
- "trial_name": null,
24
- "trial_params": null
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a609728264d711b88677fd490926ca3e18b3681ec912259bcc35a92d12c4366
3
- size 3311
 
 
 
 
vocab.json DELETED
@@ -1,48 +0,0 @@
1
- {
2
- "!": 1,
3
- ",": 2,
4
- "-": 3,
5
- ".": 4,
6
- ":": 5,
7
- "?": 6,
8
- "[PAD]": 45,
9
- "[UNK]": 44,
10
- "|": 0,
11
- "а": 7,
12
- "б": 8,
13
- "в": 9,
14
- "г": 10,
15
- "д": 11,
16
- "е": 12,
17
- "ж": 13,
18
- "з": 14,
19
- "и": 15,
20
- "к": 16,
21
- "л": 17,
22
- "м": 18,
23
- "н": 19,
24
- "о": 20,
25
- "п": 21,
26
- "р": 22,
27
- "с": 23,
28
- "т": 24,
29
- "у": 25,
30
- "ф": 26,
31
- "х": 27,
32
- "ц": 28,
33
- "ш": 29,
34
- "ы": 30,
35
- "ь": 31,
36
- "қ": 32,
37
- "ҟ": 33,
38
- "ҩ": 34,
39
- "ҭ": 35,
40
- "ҳ": 36,
41
- "ҵ": 37,
42
- "ҽ": 38,
43
- "ҿ": 39,
44
- "ә": 40,
45
- "ӡ": 41,
46
- "ӷ": 42,
47
- "ԥ": 43
48
- }