arslanarjumand/wav2vec-reptiles

Browse files

Files changed (5) hide show

README.md +19 -15
config.json +33 -48
model.safetensors +2 -2
preprocessor_config.json +7 -6
training_args.bin +2 -2

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-license: apache-2.0
-base_model: vitouphy/wav2vec2-xls-r-300m-english
 tags:
 - generated_from_trainer
 model-index:
@@ -13,13 +13,13 @@ should probably proofread and complete it, then remove this comment. -->
 # wav2vec-reptiles
-This model is a fine-tuned version of [vitouphy/wav2vec2-xls-r-300m-english](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-english) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2223.3787
-- Pcc Accuracy: 0.2187
-- Pcc Fluency: 0.0834
-- Pcc Total Score: 0.1532
-- Pcc Content: 0.2235
 ## Model description
@@ -47,23 +47,27 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.4
-- num_epochs: 25
 - mixed_precision_training: Native AMP
 ### Training results
 | Training Loss | Epoch | Step | Validation Loss | Pcc Accuracy | Pcc Fluency | Pcc Total Score | Pcc Content |
 |:-------------:|:-----:|:----:|:---------------:|:------------:|:-----------:|:---------------:|:-----------:|
-| 2728.7846     | 5.0   | 100  | 3196.0576       | 0.3476       | -0.2327     | -0.3110         | 0.2340      |
-| 2491.7434     | 10.0  | 200  | 2875.6025       | 0.2791       | -0.0388     | -0.0724         | 0.2475      |
-| 1926.2301     | 15.0  | 300  | 2480.8772       | 0.2280       | 0.0499      | 0.1131          | 0.2334      |
-| 2065.1381     | 20.0  | 400  | 2265.0391       | 0.2201       | 0.0799      | 0.1478          | 0.2238      |
-| 1903.073      | 25.0  | 500  | 2223.3787       | 0.2187       | 0.0834      | 0.1532          | 0.2235      |
 ### Framework versions
 - Transformers 4.37.0
 - Pytorch 2.1.2
-- Datasets 2.17.0
 - Tokenizers 0.15.1

 ---
+license: mit
+base_model: facebook/w2v-bert-2.0
 tags:
 - generated_from_trainer
 model-index:
 # wav2vec-reptiles
+This model is a fine-tuned version of [facebook/w2v-bert-2.0](https://huggingface.co/facebook/w2v-bert-2.0) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 484.9289
+- Pcc Accuracy: -0.1604
+- Pcc Fluency: -0.1393
+- Pcc Total Score: -0.1591
+- Pcc Content: -0.1544
 ## Model description
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.4
+- num_epochs: 10
 - mixed_precision_training: Native AMP
 ### Training results
 | Training Loss | Epoch | Step | Validation Loss | Pcc Accuracy | Pcc Fluency | Pcc Total Score | Pcc Content |
 |:-------------:|:-----:|:----:|:---------------:|:------------:|:-----------:|:---------------:|:-----------:|
+| 3175.5941     | 1.07  | 500  | 2802.1936       | -0.2863      | -0.2729     | -0.3001         | -0.2745     |
+| 1733.457      | 2.13  | 1000 | 2440.8833       | -0.2827      | -0.2779     | -0.2959         | -0.2787     |
+| 1890.6879     | 3.2   | 1500 | 1470.4958       | -0.2806      | -0.2763     | -0.2933         | -0.2772     |
+| 470.8979      | 4.27  | 2000 | 565.3928        | -0.2658      | -0.2589     | -0.2764         | -0.2621     |
+| 881.7893      | 5.34  | 2500 | 501.9731        | -0.2331      | -0.2204     | -0.2394         | -0.2285     |
+| 379.352       | 6.4   | 3000 | 497.4395        | -0.2040      | -0.1871     | -0.2068         | -0.1982     |
+| 378.5915      | 7.47  | 3500 | 491.6927        | -0.1783      | -0.1590     | -0.1789         | -0.1726     |
+| 539.6395      | 8.54  | 4000 | 487.6133        | -0.1639      | -0.1434     | -0.1631         | -0.1582     |
+| 319.019       | 9.61  | 4500 | 484.9289        | -0.1604      | -0.1393     | -0.1591         | -0.1544     |
 ### Framework versions
 - Transformers 4.37.0
 - Pytorch 2.1.2
+- Datasets 2.17.1
 - Tokenizers 0.15.1

config.json CHANGED Viewed

@@ -1,85 +1,69 @@
 {
-  "_name_or_path": "vitouphy/wav2vec2-xls-r-300m-english",
   "activation_dropout": 0.006,
-  "adapter_attn_dim": null,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
   "add_adapter": false,
-  "apply_spec_augment": true,
   "architectures": [
-    "Wav2Vec2ForWav2Vec2ForCTCAndUttranceRegression"
   ],
   "attention_dropout": 0.0094,
   "bos_token_id": 1,
-  "classifier_proj_size": 256,
   "codevector_dim": 768,
   "contrastive_logits_temperature": 0.1,
-  "conv_bias": true,
-  "conv_dim": [
-    512,
-    512,
-    512,
-    512,
-    512,
-    512,
-    512
-  ],
-  "conv_kernel": [
-    10,
-    3,
-    3,
-    3,
-    3,
-    2,
-    2
-  ],
-  "conv_stride": [
-    5,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2
-  ],
-  "ctc_loss_reduction": "mean",
-  "ctc_zero_infinity": true,
   "diversity_loss_weight": 0.1,
-  "do_stable_layer_norm": true,
   "eos_token_id": 2,
-  "feat_extract_activation": "gelu",
-  "feat_extract_dropout": 0.0,
-  "feat_extract_norm": "layer",
   "feat_proj_dropout": 0.0,
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0005,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu_new",
   "hidden_dropout": 0.004,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.0005,
   "mask_feature_length": 5,
   "mask_feature_min_masks": 2,
   "mask_feature_prob": 0.0075,
   "mask_time_length": 5,
   "mask_time_min_masks": 2,
   "mask_time_prob": 0.0085,
-  "model_type": "wav2vec2",
-  "num_adapter_layers": 3,
   "num_attention_heads": 16,
   "num_codevector_groups": 2,
   "num_codevectors_per_group": 320,
-  "num_conv_pos_embedding_groups": 16,
-  "num_conv_pos_embeddings": 128,
-  "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
   "num_negatives": 100,
   "output_hidden_size": 1024,
-  "pad_token_id": 28,
   "proj_codevector_dim": 768,
   "tdnn_dilation": [
     1,
     2,
@@ -103,7 +87,8 @@
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.37.0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 31,
   "xvector_output_dim": 512
 }

 {
+  "_name_or_path": "facebook/w2v-bert-2.0",
   "activation_dropout": 0.006,
+  "adapter_act": "relu",
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
   "add_adapter": false,
+  "apply_spec_augment": false,
   "architectures": [
+    "Wav2Vec2BertForSequenceClassification"
   ],
   "attention_dropout": 0.0094,
   "bos_token_id": 1,
+  "classifier_proj_size": 768,
   "codevector_dim": 768,
+  "conformer_conv_dropout": 0.1,
   "contrastive_logits_temperature": 0.1,
+  "conv_depthwise_kernel_size": 31,
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
   "diversity_loss_weight": 0.1,
   "eos_token_id": 2,
   "feat_proj_dropout": 0.0,
   "feat_quantizer_dropout": 0.0,
+  "feature_projection_input_dim": 160,
   "final_dropout": 0.0005,
+  "hidden_act": "swish",
   "hidden_dropout": 0.004,
   "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3"
+  },
   "initializer_range": 0.02,
   "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3
+  },
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.0005,
+  "left_max_position_embeddings": 64,
   "mask_feature_length": 5,
   "mask_feature_min_masks": 2,
   "mask_feature_prob": 0.0075,
   "mask_time_length": 5,
   "mask_time_min_masks": 2,
   "mask_time_prob": 0.0085,
+  "max_source_positions": 5000,
+  "model_type": "wav2vec2-bert",
+  "num_adapter_layers": 1,
   "num_attention_heads": 16,
   "num_codevector_groups": 2,
   "num_codevectors_per_group": 320,
   "num_hidden_layers": 24,
   "num_negatives": 100,
   "output_hidden_size": 1024,
+  "pad_token_id": 0,
+  "position_embeddings_type": "relative_key",
   "proj_codevector_dim": 768,
+  "right_max_position_embeddings": 8,
+  "rotary_embedding_base": 10000,
   "tdnn_dilation": [
     1,
     2,
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.37.0",
+  "use_intermediate_ffn_before_adapter": false,
   "use_weighted_layer_sum": false,
+  "vocab_size": null,
   "xvector_output_dim": 512
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d03ae21377669660184c9ffc98e3b22fc8fc609e5e7c5633f51c53ad78a8336
-size 1261874520

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f318a716fecd3b0dbb5ed7cfb690c5382d27c5eb1d3cd14d3172708b0eaa1fa
+size 2325236000

preprocessor_config.json CHANGED Viewed

@@ -1,10 +1,11 @@
 {
-  "do_normalize": true,
-  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
-  "feature_size": 1,
   "padding_side": "right",
-  "padding_value": 0.0,
-  "processor_class": "Wav2Vec2Processor",
   "return_attention_mask": true,
-  "sampling_rate": 16000
 }

 {
+  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
+  "feature_size": 80,
+  "num_mel_bins": 80,
   "padding_side": "right",
+  "padding_value": 1,
+  "processor_class": "Wav2Vec2BertProcessor",
   "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "stride": 2
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2f54219229ffd0652e89c138b196f81421a689404ecbfedf03264f45dd509842
-size 4664

 version https://git-lfs.github.com/spec/v1
+oid sha256:32b8493271d986106cb9bad47ff6fd143fa99c16d7efd113c1b7176ed3232012
+size 4728