arslanarjumand committed on
Commit
a1228af
1 Parent(s): bafe36a

arslanarjumand/wav2vec-reptiles

Browse files
Files changed (5) hide show
  1. README.md +19 -15
  2. config.json +33 -48
  3. model.safetensors +2 -2
  4. preprocessor_config.json +7 -6
  5. training_args.bin +2 -2
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- license: apache-2.0
3
- base_model: vitouphy/wav2vec2-xls-r-300m-english
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,13 +13,13 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # wav2vec-reptiles
15
 
16
- This model is a fine-tuned version of [vitouphy/wav2vec2-xls-r-300m-english](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-english) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 2223.3787
19
- - Pcc Accuracy: 0.2187
20
- - Pcc Fluency: 0.0834
21
- - Pcc Total Score: 0.1532
22
- - Pcc Content: 0.2235
23
 
24
  ## Model description
25
 
@@ -47,23 +47,27 @@ The following hyperparameters were used during training:
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.4
50
- - num_epochs: 25
51
  - mixed_precision_training: Native AMP
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss | Pcc Accuracy | Pcc Fluency | Pcc Total Score | Pcc Content |
56
  |:-------------:|:-----:|:----:|:---------------:|:------------:|:-----------:|:---------------:|:-----------:|
57
- | 2728.7846 | 5.0 | 100 | 3196.0576 | 0.3476 | -0.2327 | -0.3110 | 0.2340 |
58
- | 2491.7434 | 10.0 | 200 | 2875.6025 | 0.2791 | -0.0388 | -0.0724 | 0.2475 |
59
- | 1926.2301 | 15.0 | 300 | 2480.8772 | 0.2280 | 0.0499 | 0.1131 | 0.2334 |
60
- | 2065.1381 | 20.0 | 400 | 2265.0391 | 0.2201 | 0.0799 | 0.1478 | 0.2238 |
61
- | 1903.073 | 25.0 | 500 | 2223.3787 | 0.2187 | 0.0834 | 0.1532 | 0.2235 |
 
 
 
 
62
 
63
 
64
  ### Framework versions
65
 
66
  - Transformers 4.37.0
67
  - Pytorch 2.1.2
68
- - Datasets 2.17.0
69
  - Tokenizers 0.15.1
 
1
  ---
2
+ license: mit
3
+ base_model: facebook/w2v-bert-2.0
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # wav2vec-reptiles
15
 
16
+ This model is a fine-tuned version of [facebook/w2v-bert-2.0](https://huggingface.co/facebook/w2v-bert-2.0) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 484.9289
19
+ - Pcc Accuracy: -0.1604
20
+ - Pcc Fluency: -0.1393
21
+ - Pcc Total Score: -0.1591
22
+ - Pcc Content: -0.1544
23
 
24
  ## Model description
25
 
 
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.4
50
+ - num_epochs: 10
51
  - mixed_precision_training: Native AMP
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss | Pcc Accuracy | Pcc Fluency | Pcc Total Score | Pcc Content |
56
  |:-------------:|:-----:|:----:|:---------------:|:------------:|:-----------:|:---------------:|:-----------:|
57
+ | 3175.5941 | 1.07 | 500 | 2802.1936 | -0.2863 | -0.2729 | -0.3001 | -0.2745 |
58
+ | 1733.457 | 2.13 | 1000 | 2440.8833 | -0.2827 | -0.2779 | -0.2959 | -0.2787 |
59
+ | 1890.6879 | 3.2 | 1500 | 1470.4958 | -0.2806 | -0.2763 | -0.2933 | -0.2772 |
60
+ | 470.8979 | 4.27 | 2000 | 565.3928 | -0.2658 | -0.2589 | -0.2764 | -0.2621 |
61
+ | 881.7893 | 5.34 | 2500 | 501.9731 | -0.2331 | -0.2204 | -0.2394 | -0.2285 |
62
+ | 379.352 | 6.4 | 3000 | 497.4395 | -0.2040 | -0.1871 | -0.2068 | -0.1982 |
63
+ | 378.5915 | 7.47 | 3500 | 491.6927 | -0.1783 | -0.1590 | -0.1789 | -0.1726 |
64
+ | 539.6395 | 8.54 | 4000 | 487.6133 | -0.1639 | -0.1434 | -0.1631 | -0.1582 |
65
+ | 319.019 | 9.61 | 4500 | 484.9289 | -0.1604 | -0.1393 | -0.1591 | -0.1544 |
66
 
67
 
68
  ### Framework versions
69
 
70
  - Transformers 4.37.0
71
  - Pytorch 2.1.2
72
+ - Datasets 2.17.1
73
  - Tokenizers 0.15.1
config.json CHANGED
@@ -1,85 +1,69 @@
1
  {
2
- "_name_or_path": "vitouphy/wav2vec2-xls-r-300m-english",
3
  "activation_dropout": 0.006,
4
- "adapter_attn_dim": null,
5
  "adapter_kernel_size": 3,
6
  "adapter_stride": 2,
7
  "add_adapter": false,
8
- "apply_spec_augment": true,
9
  "architectures": [
10
- "Wav2Vec2ForWav2Vec2ForCTCAndUttranceRegression"
11
  ],
12
  "attention_dropout": 0.0094,
13
  "bos_token_id": 1,
14
- "classifier_proj_size": 256,
15
  "codevector_dim": 768,
 
16
  "contrastive_logits_temperature": 0.1,
17
- "conv_bias": true,
18
- "conv_dim": [
19
- 512,
20
- 512,
21
- 512,
22
- 512,
23
- 512,
24
- 512,
25
- 512
26
- ],
27
- "conv_kernel": [
28
- 10,
29
- 3,
30
- 3,
31
- 3,
32
- 3,
33
- 2,
34
- 2
35
- ],
36
- "conv_stride": [
37
- 5,
38
- 2,
39
- 2,
40
- 2,
41
- 2,
42
- 2,
43
- 2
44
- ],
45
- "ctc_loss_reduction": "mean",
46
- "ctc_zero_infinity": true,
47
  "diversity_loss_weight": 0.1,
48
- "do_stable_layer_norm": true,
49
  "eos_token_id": 2,
50
- "feat_extract_activation": "gelu",
51
- "feat_extract_dropout": 0.0,
52
- "feat_extract_norm": "layer",
53
  "feat_proj_dropout": 0.0,
54
  "feat_quantizer_dropout": 0.0,
 
55
  "final_dropout": 0.0005,
56
- "gradient_checkpointing": false,
57
- "hidden_act": "gelu_new",
58
  "hidden_dropout": 0.004,
59
  "hidden_size": 1024,
 
 
 
 
 
 
60
  "initializer_range": 0.02,
61
  "intermediate_size": 4096,
 
 
 
 
 
 
62
  "layer_norm_eps": 1e-05,
63
  "layerdrop": 0.0005,
 
64
  "mask_feature_length": 5,
65
  "mask_feature_min_masks": 2,
66
  "mask_feature_prob": 0.0075,
67
  "mask_time_length": 5,
68
  "mask_time_min_masks": 2,
69
  "mask_time_prob": 0.0085,
70
- "model_type": "wav2vec2",
71
- "num_adapter_layers": 3,
 
72
  "num_attention_heads": 16,
73
  "num_codevector_groups": 2,
74
  "num_codevectors_per_group": 320,
75
- "num_conv_pos_embedding_groups": 16,
76
- "num_conv_pos_embeddings": 128,
77
- "num_feat_extract_layers": 7,
78
  "num_hidden_layers": 24,
79
  "num_negatives": 100,
80
  "output_hidden_size": 1024,
81
- "pad_token_id": 28,
 
82
  "proj_codevector_dim": 768,
 
 
83
  "tdnn_dilation": [
84
  1,
85
  2,
@@ -103,7 +87,8 @@
103
  ],
104
  "torch_dtype": "float32",
105
  "transformers_version": "4.37.0",
 
106
  "use_weighted_layer_sum": false,
107
- "vocab_size": 31,
108
  "xvector_output_dim": 512
109
  }
 
1
  {
2
+ "_name_or_path": "facebook/w2v-bert-2.0",
3
  "activation_dropout": 0.006,
4
+ "adapter_act": "relu",
5
  "adapter_kernel_size": 3,
6
  "adapter_stride": 2,
7
  "add_adapter": false,
8
+ "apply_spec_augment": false,
9
  "architectures": [
10
+ "Wav2Vec2BertForSequenceClassification"
11
  ],
12
  "attention_dropout": 0.0094,
13
  "bos_token_id": 1,
14
+ "classifier_proj_size": 768,
15
  "codevector_dim": 768,
16
+ "conformer_conv_dropout": 0.1,
17
  "contrastive_logits_temperature": 0.1,
18
+ "conv_depthwise_kernel_size": 31,
19
+ "ctc_loss_reduction": "sum",
20
+ "ctc_zero_infinity": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "diversity_loss_weight": 0.1,
 
22
  "eos_token_id": 2,
 
 
 
23
  "feat_proj_dropout": 0.0,
24
  "feat_quantizer_dropout": 0.0,
25
+ "feature_projection_input_dim": 160,
26
  "final_dropout": 0.0005,
27
+ "hidden_act": "swish",
 
28
  "hidden_dropout": 0.004,
29
  "hidden_size": 1024,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2",
34
+ "3": "LABEL_3"
35
+ },
36
  "initializer_range": 0.02,
37
  "intermediate_size": 4096,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2,
42
+ "LABEL_3": 3
43
+ },
44
  "layer_norm_eps": 1e-05,
45
  "layerdrop": 0.0005,
46
+ "left_max_position_embeddings": 64,
47
  "mask_feature_length": 5,
48
  "mask_feature_min_masks": 2,
49
  "mask_feature_prob": 0.0075,
50
  "mask_time_length": 5,
51
  "mask_time_min_masks": 2,
52
  "mask_time_prob": 0.0085,
53
+ "max_source_positions": 5000,
54
+ "model_type": "wav2vec2-bert",
55
+ "num_adapter_layers": 1,
56
  "num_attention_heads": 16,
57
  "num_codevector_groups": 2,
58
  "num_codevectors_per_group": 320,
 
 
 
59
  "num_hidden_layers": 24,
60
  "num_negatives": 100,
61
  "output_hidden_size": 1024,
62
+ "pad_token_id": 0,
63
+ "position_embeddings_type": "relative_key",
64
  "proj_codevector_dim": 768,
65
+ "right_max_position_embeddings": 8,
66
+ "rotary_embedding_base": 10000,
67
  "tdnn_dilation": [
68
  1,
69
  2,
 
87
  ],
88
  "torch_dtype": "float32",
89
  "transformers_version": "4.37.0",
90
+ "use_intermediate_ffn_before_adapter": false,
91
  "use_weighted_layer_sum": false,
92
+ "vocab_size": null,
93
  "xvector_output_dim": 512
94
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d03ae21377669660184c9ffc98e3b22fc8fc609e5e7c5633f51c53ad78a8336
3
- size 1261874520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f318a716fecd3b0dbb5ed7cfb690c5382d27c5eb1d3cd14d3172708b0eaa1fa
3
+ size 2325236000
preprocessor_config.json CHANGED
@@ -1,10 +1,11 @@
1
  {
2
- "do_normalize": true,
3
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
- "feature_size": 1,
5
  "padding_side": "right",
6
- "padding_value": 0.0,
7
- "processor_class": "Wav2Vec2Processor",
8
  "return_attention_mask": true,
9
- "sampling_rate": 16000
 
10
  }
 
1
  {
2
+ "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
+ "feature_size": 80,
4
+ "num_mel_bins": 80,
5
  "padding_side": "right",
6
+ "padding_value": 1,
7
+ "processor_class": "Wav2Vec2BertProcessor",
8
  "return_attention_mask": true,
9
+ "sampling_rate": 16000,
10
+ "stride": 2
11
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f54219229ffd0652e89c138b196f81421a689404ecbfedf03264f45dd509842
3
- size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b8493271d986106cb9bad47ff6fd143fa99c16d7efd113c1b7176ed3232012
3
+ size 4728