marinone94 committed
Commit b6994be
Parent: c9cb648

validate language model results on cv_7 test

.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220127_103723-scy0vyln/run-scy0vyln.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,3 +1,4 @@
 *venv
 .ipynb_checkpoints/
 checkpoint*/
+wandb/
.ipynb_checkpoints/run-checkpoint.sh CHANGED
@@ -1,6 +1,6 @@
 python run_speech_recognition_ctc.py \
 	--dataset_name="mozilla-foundation/common_voice_7_0" \
-	--model_name_or_path="marinone94/xls-r-300m-sv-robust" \
+	--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
 	--dataset_config_name="sv-SE" \
 	--output_dir="./" \
 	--overwrite_output_dir \
@@ -31,4 +31,4 @@ python run_speech_recognition_ctc.py \
 	--fp16 \
 	--group_by_length \
 	--do_train --do_eval \
-	--push_to_hub
+	--push_to_hub
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "marinone94/xls-r-300m-sv-robust",
+  "_name_or_path": "KBLab/wav2vec2-large-voxrex",
   "activation_dropout": 0.1,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
@@ -59,12 +59,20 @@
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.0,
+  "mask_channel_length": 10,
+  "mask_channel_min_space": 1,
+  "mask_channel_other": 0.0,
+  "mask_channel_prob": 0.0,
+  "mask_channel_selection": "static",
   "mask_feature_length": 64,
   "mask_feature_min_masks": 0,
   "mask_feature_prob": 0.25,
   "mask_time_length": 10,
   "mask_time_min_masks": 2,
+  "mask_time_min_space": 1,
+  "mask_time_other": 0.0,
   "mask_time_prob": 0.75,
+  "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_adapter_layers": 3,
   "num_attention_heads": 16,
eda.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "c9526c52",
    "metadata": {},
    "outputs": [],
@@ -23,12 +23,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 2,
    "id": "cc9f1c45",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
+    "dataset_name = \"mozilla-foundation/common_voice_8_0\"\n",
     "dataset_config_name = \"sv-SE\"\n",
     "train_split_name = \"train+validation\"\n",
     "use_auth_token = True"
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 3,
    "id": "21fd7030",
    "metadata": {},
    "outputs": [],
@@ -46,15 +46,105 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 4,
    "id": "81a27912",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "92387075d7064947bfe8117d393afa30",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading: 0%| | 0.00/9.88k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7610803e99ac4fba9529711bf7668d66",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading: 0%| | 0.00/2.98k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6f5c59109df240e79714106f54cc1d8a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading: 0%| | 0.00/53.1k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
+      "Downloading and preparing dataset common_voice/sv-SE to /Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/8.0.0/7c985b71d3a4f98ad5985f8eff1035a7084ddbbb84f01591cd095991e7c2499e...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b8cfd99809dd41f2a25248f384b0c73a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading: 0%| | 0.00/1.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "KeyError",
+     "evalue": "'accents'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [4]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m raw_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_config_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain_split_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/load.py:1694\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)\u001b[0m\n\u001b[1;32m 1691\u001b[0m try_from_hf_gcs \u001b[38;5;241m=\u001b[39m path \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[1;32m 1693\u001b[0m \u001b[38;5;66;03m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 1694\u001b[0m \u001b[43mbuilder_instance\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1695\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1696\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1697\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_verifications\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_verifications\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1698\u001b[0m \u001b[43m \u001b[49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1699\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1700\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1702\u001b[0m \u001b[38;5;66;03m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m 1703\u001b[0m keep_in_memory \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1704\u001b[0m keep_in_memory \u001b[38;5;28;01mif\u001b[39;00m keep_in_memory \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m is_small_dataset(builder_instance\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size)\n\u001b[1;32m 1705\u001b[0m )\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:595\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 593\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHF google storage unreachable. Downloading and preparing it from source\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m downloaded_from_gcs:\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 596\u001b[0m \u001b[43m \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverify_infos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_infos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdownload_and_prepare_kwargs\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# Sync info\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(split\u001b[38;5;241m.\u001b[39mnum_bytes \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39msplits\u001b[38;5;241m.\u001b[39mvalues())\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:684\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 680\u001b[0m split_dict\u001b[38;5;241m.\u001b[39madd(split_generator\u001b[38;5;241m.\u001b[39msplit_info)\n\u001b[1;32m 682\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 683\u001b[0m \u001b[38;5;66;03m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[0;32m--> 684\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_generator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 686\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 687\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot find data file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 688\u001b[0m \u001b[38;5;241m+\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmanual_download_instructions \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 689\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mOriginal error:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 691\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:1083\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split\u001b[0;34m(self, split_generator)\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1076\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, record \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mtqdm(\n\u001b[1;32m 1077\u001b[0m generator,\n\u001b[1;32m 1078\u001b[0m unit\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m examples\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1081\u001b[0m disable\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mbool\u001b[39m(logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mNOTSET),\n\u001b[1;32m 1082\u001b[0m ):\n\u001b[0;32m-> 1083\u001b[0m example \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrecord\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1084\u001b[0m writer\u001b[38;5;241m.\u001b[39mwrite(example, key)\n\u001b[1;32m 1085\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:1214\u001b[0m, in \u001b[0;36mFeatures.encode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1204\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1205\u001b[0m \u001b[38;5;124;03mEncode example into a format for Arrow.\u001b[39;00m\n\u001b[1;32m 1206\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1211\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1212\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1213\u001b[0m example \u001b[38;5;241m=\u001b[39m cast_to_python_objects(example)\n\u001b[0;32m-> 1214\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mencode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:976\u001b[0m, in \u001b[0;36mencode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# Nested structures: we allow dict, list/tuples, sequences\u001b[39;00m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 977\u001b[0m k: encode_nested_example(sub_schema, sub_obj) \u001b[38;5;28;01mfor\u001b[39;00m k, (sub_schema, sub_obj) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(schema, obj)\n\u001b[1;32m 978\u001b[0m }\n\u001b[1;32m 979\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[1;32m 980\u001b[0m sub_schema \u001b[38;5;241m=\u001b[39m schema[\u001b[38;5;241m0\u001b[39m]\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:976\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# Nested structures: we allow dict, list/tuples, sequences\u001b[39;00m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 977\u001b[0m k: encode_nested_example(sub_schema, sub_obj) \u001b[38;5;28;01mfor\u001b[39;00m k, (sub_schema, sub_obj) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(schema, obj)\n\u001b[1;32m 978\u001b[0m }\n\u001b[1;32m 979\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[1;32m 980\u001b[0m sub_schema \u001b[38;5;241m=\u001b[39m schema[\u001b[38;5;241m0\u001b[39m]\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/utils/py_utils.py:153\u001b[0m, in \u001b[0;36mzip_dict\u001b[0;34m(*dicts)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate over items of dictionaries grouped by their keys.\"\"\"\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m unique_values(itertools\u001b[38;5;241m.\u001b[39mchain(\u001b[38;5;241m*\u001b[39mdicts)): \u001b[38;5;66;03m# set merge all keys\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Will raise KeyError if the dict don't have the same keys\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m key, \u001b[38;5;28;43mtuple\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43md\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43md\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdicts\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Repos/datasets/src/datasets/utils/py_utils.py:153\u001b[0m, in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate over items of dictionaries grouped by their keys.\"\"\"\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m unique_values(itertools\u001b[38;5;241m.\u001b[39mchain(\u001b[38;5;241m*\u001b[39mdicts)): \u001b[38;5;66;03m# set merge all keys\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Will raise KeyError if the dict don't have the same keys\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m key, \u001b[38;5;28mtuple\u001b[39m(\u001b[43md\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m dicts)\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'accents'"
      ]
     }
    ],
@@ -307,6 +307,14 @@
     "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
     "print(f\"Avg tokens training data: {avg_tokens_test}\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f906c9c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
 "metadata": {
log_mozilla-foundation_common_voice_7_0_sv-SE_test_predictions.txt CHANGED
The diff for this file is too large to render. See raw diff
 
mozilla-foundation_common_voice_7_0_sv-SE_test_eval_results.txt CHANGED
@@ -1,2 +1,2 @@
-WER: 0.2741846662179526
-CER: 0.0824110988910274
+WER: 0.18888820901915193
+CER: 0.06630922921822015
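
On the Common Voice 7 sv-SE test split, WER improves from ~27.4% to ~18.9% and CER from ~8.2% to ~6.6%, consistent with the commit's goal of validating language-model-boosted decoding. Figures like these can be recomputed from the committed predictions log with a library such as `jiwer`; the sketch below assumes a tab-separated `reference<TAB>prediction` layout, which is an assumption about the log format rather than something this diff documents:

```python
import jiwer

# Recompute WER/CER from the committed predictions log (layout assumed).
refs, preds = [], []
with open("log_mozilla-foundation_common_voice_7_0_sv-SE_test_predictions.txt") as f:
    for line in f:
        ref, pred = line.rstrip("\n").split("\t")
        refs.append(ref)
        preds.append(pred)

print(f"WER: {jiwer.wer(refs, preds)}")
print(f"CER: {jiwer.cer(refs, preds)}")
```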
preprocessor_config.json CHANGED
@@ -4,7 +4,6 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0,
-  "processor_class": "Wav2Vec2ProcessorWithLM",
   "return_attention_mask": true,
   "sampling_rate": 16000
 }
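
Removing `"processor_class": "Wav2Vec2ProcessorWithLM"` means the checkpoint no longer advertises the LM-backed processor, so `AutoProcessor` would fall back to the plain `Wav2Vec2Processor`. When validating with an n-gram LM, the LM processor can still be loaded explicitly. A minimal sketch, assuming a repo such as the previously referenced `marinone94/xls-r-300m-sv-robust` still bundles a `language_model/` directory (an assumption about repo state, not something this diff shows):

```python
import torch
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

repo_id = "marinone94/xls-r-300m-sv-robust"  # assumed to contain a pushed n-gram LM
model = AutoModelForCTC.from_pretrained(repo_id)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_id)

def transcribe(waveform, sampling_rate=16000):
    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # batch_decode runs pyctcdecode beam search over the CTC logits with the LM
    return processor.batch_decode(logits.numpy()).text
```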
run.sh CHANGED
@@ -1,6 +1,6 @@
 python run_speech_recognition_ctc.py \
 	--dataset_name="mozilla-foundation/common_voice_7_0" \
-	--model_name_or_path="marinone94/xls-r-300m-sv-robust" \
+	--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
 	--dataset_config_name="sv-SE" \
 	--output_dir="./" \
 	--overwrite_output_dir \
@@ -31,5 +31,4 @@ python run_speech_recognition_ctc.py \
 	--fp16 \
 	--group_by_length \
 	--do_train --do_eval \
-	--push_to_hub \
-	--push_lm_to_hub
+	--push_to_hub
 
special_tokens_map.json CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marinone94/xls-r-300m-sv-robust", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0efc7aad7fd151e94de194f50e712cd8d3c82a2cf2ccee51d253c9130af43c3a
-size 2991
+oid sha256:e93512254853170601d69416635484a8a91b518c58d4ec3e92cb2e1d96df8639
+size 3055