techiaith
/

wav2vec2-base-cy

Automatic Speech Recognition

Inference Endpoints

Model card Files Files and versions

DewiBrynJones committed on Mar 25, 2024

Commit

bc79fa0

·

1 Parent(s): dc71ee6

Approx. 4000 hours YT data

Files changed (3) hide show

README.md +2 -11
config.json +2 -2
model.safetensors +3 -0

README.md CHANGED Viewed

@@ -14,16 +14,7 @@ This model is experimental in investigating pretraining better models with more
 https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-pretraining
-This initial base model has been pre-trained with scripts at
-https://github.com/techiaith/docker-wav2vec2-cy/tree/main/train/pre-train
-using English speech from LibriSpeech's minimal subsets (`validation` and `test`), and 184 hours and 47 minutes of Welsh speech from various playlists on YouTube. The script [`build_youtube_playlists_corpus.sh`](https://github.com/techiaith/docker-wav2vec2-cy/blob/main/inference/python/build_youtube_playlists_corpus.sh) lists the playlists used.
-Until we have collected thousands of hours of Welsh speech, rather than hundreds, the WER scores, after fine-tuning, will remain very high. The following WERs are from tests on a Welsh Common Voice test set as well as a [second set of YouTube videos with corrected transcriptions](https://git.techiaith.bangor.ac.uk/data-porth-technolegau-iaith/corpws-profi-adnabod-lleferydd/-/tree/master/data/trawsgrifio).
-| Test Set | WER | CER | WER (+LM) | CER (+LM)|
-| -------- | --- | --- | --------- | -------- |
-| CV CY 10 | 94.83  | 85.55  | 92.31  | 82.25 |
-| YouTube  | 95.43  | 90.26  | 93.60  | 89.33 |

 https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-pretraining
+This base model has been pre-trained on only approximately 4000 hours of Welsh and English speech collected from various channels on YouTube. Only 25% of the corpus is Welsh-language speech. The English-language speech includes Welsh-accented English and has therefore been retained for pre-training.
+Until we have collected many more hours of speech, this pre-trained model will be of limited use for fine-tuning on any useful downstream tasks.

config.json CHANGED Viewed

@@ -1,5 +1,6 @@
 {
   "activation_dropout": 0.0,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
   "add_adapter": false,
@@ -51,7 +52,6 @@
   "feat_proj_dropout": 0.0,
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0,
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout": 0.0,
   "hidden_dropout_prob": 0.0,
@@ -101,7 +101,7 @@
     1
   ],
   "torch_dtype": "float32",
-  "transformers_version": "4.21.0",
   "use_weighted_layer_sum": false,
   "vocab_size": 32,
   "xvector_output_dim": 512

 {
   "activation_dropout": 0.0,
+  "adapter_attn_dim": null,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
   "add_adapter": false,
   "feat_proj_dropout": 0.0,
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0,
   "hidden_act": "gelu",
   "hidden_dropout": 0.0,
   "hidden_dropout_prob": 0.0,
     1
   ],
   "torch_dtype": "float32",
+  "transformers_version": "4.38.2",
   "use_weighted_layer_sum": false,
   "vocab_size": 32,
   "xvector_output_dim": 512

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1567f48173b19ed1f1e2c9c76fad74d2a6ed662c39128a00be62cca1dd2c9ba7
+size 380246024