fsicoli
/

whisper-medium-pt-cv16-fleurs

@@ -4,36 +4,34 @@ base_model: openai/whisper-medium
 tags:
 - generated_from_trainer
 datasets:
-- mozilla-foundation/common_voice_16_1
 metrics:
 - wer
 model-index:
-- name: whisper-large-v3-pt-cv16-fleurs
   results:
   - task:
       name: Automatic Speech Recognition
       type: automatic-speech-recognition
     dataset:
-      name: mozilla-foundation/common_voice_16_1 pt
-      type: mozilla-foundation/common_voice_16_1
-      config: pt
-      split: test
-      args: pt
     metrics:
     - name: Wer
       type: wer
-      value: 0.11905377038591959
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-# whisper-large-v3-pt-cv16-fleurs
-This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on the mozilla-foundation/common_voice_16_1 pt dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.1975
-- Wer: 0.1191
 ## Model description
@@ -58,11 +56,12 @@ The following hyperparameters were used during training:
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 2
-- total_train_batch_size: 2
 - total_eval_batch_size: 2
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
-- lr_scheduler_warmup_steps: 2000
 - training_steps: 5000
 - mixed_precision_training: Native AMP
@@ -70,11 +69,11 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss | Wer    |
 |:-------------:|:-----:|:----:|:---------------:|:------:|
-| 0.2614        | 0.06  | 1000 | 0.2986          | 0.1466 |
-| 0.2632        | 0.13  | 2000 | 0.2244          | 0.1316 |
-| 0.1694        | 0.19  | 3000 | 0.2086          | 0.1234 |
-| 0.1658        | 0.26  | 4000 | 0.1987          | 0.1205 |
-| 0.1391        | 0.32  | 5000 | 0.1975          | 0.1191 |
 ### Framework versions

 tags:
 - generated_from_trainer
 datasets:
+- fsicoli/cv16-fleurs
 metrics:
 - wer
 model-index:
+- name: whisper-medium-pt-cv16-fleurs
   results:
   - task:
       name: Automatic Speech Recognition
       type: automatic-speech-recognition
     dataset:
+      name: fsicoli/cv16-fleurs default
+      type: fsicoli/cv16-fleurs
+      args: default
     metrics:
     - name: Wer
       type: wer
+      value: 0.09421927983206846
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+# whisper-medium-pt-cv16-fleurs
+This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on the fsicoli/cv16-fleurs default dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.1409
+- Wer: 0.0942
 ## Model description
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 2
+- gradient_accumulation_steps: 16
+- total_train_batch_size: 32
 - total_eval_batch_size: 2
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 5000
 - training_steps: 5000
 - mixed_precision_training: Native AMP
 | Training Loss | Epoch | Step | Validation Loss | Wer    |
 |:-------------:|:-----:|:----:|:---------------:|:------:|
+| 0.2552        | 0.93  | 1000 | 0.2200          | 0.1220 |
+| 0.1928        | 1.87  | 2000 | 0.1645          | 0.1062 |
+| 0.1646        | 2.8   | 3000 | 0.1508          | 0.1016 |
+| 0.1333        | 3.74  | 4000 | 0.1438          | 0.0970 |
+| 0.1027        | 4.67  | 5000 | 0.1409          | 0.0942 |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
     "epoch": 4.67,
-    "eval_loss": 0.19753539562225342,
-    "eval_runtime": 7876.6978,
     "eval_samples": 9414,
-    "eval_samples_per_second": 1.195,
-    "eval_steps_per_second": 0.598,
-    "eval_wer": 0.11905377038591959,
     "train_loss": 0.2694411336898804,
     "train_runtime": 106369.5753,
     "train_samples": 34267,

 {
     "epoch": 4.67,
+    "eval_loss": 0.14086556434631348,
+    "eval_runtime": 7908.2656,
     "eval_samples": 9414,
+    "eval_samples_per_second": 1.19,
+    "eval_steps_per_second": 0.595,
+    "eval_wer": 0.09421927983206846,
     "train_loss": 0.2694411336898804,
     "train_runtime": 106369.5753,
     "train_samples": 34267,

eval_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.32,
-    "eval_loss": 0.19753539562225342,
-    "eval_runtime": 7876.6978,
     "eval_samples": 9414,
-    "eval_samples_per_second": 1.195,
-    "eval_steps_per_second": 0.598,
-    "eval_wer": 0.11905377038591959
 }

 {
+    "epoch": 4.67,
+    "eval_loss": 0.14086556434631348,
+    "eval_runtime": 7908.2656,
     "eval_samples": 9414,
+    "eval_samples_per_second": 1.19,
+    "eval_steps_per_second": 0.595,
+    "eval_wer": 0.09421927983206846
 }