marinone94 committed
Commit: b6994be
1 Parent(s): c9cb648
validate language model results on cv_7 test
Files changed:
- .gitattributes +1 -0
- .gitignore +1 -0
- .ipynb_checkpoints/run-checkpoint.sh +2 -2
- config.json +9 -1
- eda.ipynb +105 -7
- log_mozilla-foundation_common_voice_7_0_sv-SE_test_predictions.txt +0 -0
- mozilla-foundation_common_voice_7_0_sv-SE_test_eval_results.txt +2 -2
- preprocessor_config.json +0 -1
- run.sh +2 -3
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
- training_args.bin +2 -2
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220127_103723-scy0vyln/run-scy0vyln.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 *venv
 .ipynb_checkpoints/
 checkpoint*/
+wandb/
.ipynb_checkpoints/run-checkpoint.sh
CHANGED
@@ -1,6 +1,6 @@
 python run_speech_recognition_ctc.py \
     --dataset_name="mozilla-foundation/common_voice_7_0" \
-    --model_name_or_path="
+    --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
     --dataset_config_name="sv-SE" \
     --output_dir="./" \
     --overwrite_output_dir \
@@ -31,4 +31,4 @@ python run_speech_recognition_ctc.py \
     --fp16 \
     --group_by_length \
     --do_train --do_eval \
-    --push_to_hub
+    --push_to_hub
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "KBLab/wav2vec2-large-voxrex",
   "activation_dropout": 0.1,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,
@@ -59,12 +59,20 @@
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.0,
+  "mask_channel_length": 10,
+  "mask_channel_min_space": 1,
+  "mask_channel_other": 0.0,
+  "mask_channel_prob": 0.0,
+  "mask_channel_selection": "static",
   "mask_feature_length": 64,
   "mask_feature_min_masks": 0,
   "mask_feature_prob": 0.25,
   "mask_time_length": 10,
   "mask_time_min_masks": 2,
+  "mask_time_min_space": 1,
+  "mask_time_other": 0.0,
   "mask_time_prob": 0.75,
+  "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_adapter_layers": 3,
   "num_attention_heads": 16,
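The mask_channel_* and extra mask_time_* keys added here appear to be carried over from the base checkpoint's fairseq-style config, while mask_time_prob/mask_feature_prob drive SpecAugment-style masking during fine-tuning. A minimal sketch (mine, not part of the commit) of inspecting them, assuming transformers is installed and config.json sits in the working directory:

    # Sketch: read the SpecAugment-style masking options from config.json.
    from transformers import Wav2Vec2Config

    config = Wav2Vec2Config.from_pretrained("./")
    print(config.mask_time_prob)     # 0.75 - probability a time step starts a mask
    print(config.mask_feature_prob)  # 0.25 - probability a feature dim starts a mask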
eda.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 1,
    "id": "c9526c52",
    "metadata": {},
    "outputs": [],
@@ -23,12 +23,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 2,
    "id": "cc9f1c45",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_name = \"mozilla-foundation/
+    "dataset_name = \"mozilla-foundation/common_voice_8_0\"\n",
     "dataset_config_name = \"sv-SE\"\n",
     "train_split_name = \"train+validation\"\n",
     "use_auth_token = True"
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 3,
    "id": "21fd7030",
    "metadata": {},
    "outputs": [],
@@ -46,15 +46,105 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 4,
    "id": "81a27912",
    "metadata": {},
    "outputs": [
     {
-     "
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "92387075d7064947bfe8117d393afa30",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/9.88k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7610803e99ac4fba9529711bf7668d66",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/2.98k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6f5c59109df240e79714106f54cc1d8a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/53.1k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "
+      "Downloading and preparing dataset common_voice/sv-SE to /Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/8.0.0/7c985b71d3a4f98ad5985f8eff1035a7084ddbbb84f01591cd095991e7c2499e...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b8cfd99809dd41f2a25248f384b0c73a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "KeyError",
+     "evalue": "'accents'",
+     "output_type": "error",
+     "traceback": [
+      "KeyError                                  Traceback (most recent call last)",
+      "Input In [4], in <module>: raw_datasets[\"train\"] = load_dataset(dataset_name, dataset_config_name, split=train_split_name, use_auth_token=use_auth_token)",
+      "File ~/Repos/datasets/src/datasets/load.py:1694, in load_dataset: builder_instance.download_and_prepare(...)",
+      "File ~/Repos/datasets/src/datasets/builder.py:595, in DatasetBuilder.download_and_prepare: self._download_and_prepare(...)",
+      "File ~/Repos/datasets/src/datasets/builder.py:684, in DatasetBuilder._download_and_prepare: self._prepare_split(split_generator, **prepare_split_kwargs)",
+      "File ~/Repos/datasets/src/datasets/builder.py:1083, in GeneratorBasedBuilder._prepare_split: example = self.info.features.encode_example(record)",
+      "File ~/Repos/datasets/src/datasets/features/features.py:1214, in Features.encode_example: return encode_nested_example(self, example)",
+      "File ~/Repos/datasets/src/datasets/features/features.py:976, in encode_nested_example (<dictcomp>)",
+      "File ~/Repos/datasets/src/datasets/utils/py_utils.py:153, in zip_dict (<genexpr>): yield key, tuple(d[key] for d in dicts)",
+      "KeyError: 'accents'"
+     ]
+    }
+   ],
@@ -307,6 +397,14 @@
    "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
    "print(f\"Avg tokens training data: {avg_tokens_test}\")"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f906c9c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
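A note on the error captured above (mine, not from the commit): the KeyError is raised in datasets' zip_dict, which requires every dict it merges to share identical keys, so the Common Voice 8 rows and the loading script's declared feature schema disagree on an 'accents' field. A hedged sketch of the usual remedy, assuming a datasets version whose common_voice script matches the v8 schema:

    # Sketch: retry loading CV8 after updating the local `datasets` checkout,
    # forcing a fresh download so a stale cached script/archive is dropped.
    from datasets import load_dataset

    train_data = load_dataset(
        "mozilla-foundation/common_voice_8_0",
        "sv-SE",
        split="train+validation",
        use_auth_token=True,
        download_mode="force_redownload",
    )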
log_mozilla-foundation_common_voice_7_0_sv-SE_test_predictions.txt
CHANGED
The diff for this file is too large to render.
mozilla-foundation_common_voice_7_0_sv-SE_test_eval_results.txt
CHANGED
@@ -1,2 +1,2 @@
-WER: 0.
-CER: 0.
+WER: 0.18888820901915193
+CER: 0.06630922921822015
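For reference, WER and CER figures of this form are commonly computed with jiwer; a minimal sketch (the toy Swedish sentences below are mine, not taken from the predictions log, which holds the real reference/prediction pairs):

    # Sketch: compute word and character error rates with jiwer.
    import jiwer

    references = ["han kom hem sent igår kväll"]
    predictions = ["han kom hem sent i går kväll"]

    print(f"WER: {jiwer.wer(references, predictions)}")  # word error rate
    print(f"CER: {jiwer.cer(references, predictions)}")  # character error rate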
preprocessor_config.json
CHANGED
@@ -4,7 +4,6 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0,
-  "processor_class": "Wav2Vec2ProcessorWithLM",
   "return_attention_mask": true,
   "sampling_rate": 16000
 }
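Dropping the "processor_class" key changes what AutoProcessor resolves for this repo: without it, loading falls back to a plain Wav2Vec2Processor, and LM-boosted decoding has to be requested explicitly. A hedged sketch, assuming the repo still ships a language_model/ folder with a KenLM model (pyctcdecode and kenlm installed):

    # Sketch: AutoProcessor now picks the default wav2vec2 processor;
    # the LM-boosted variant must be named explicitly.
    from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM

    processor = AutoProcessor.from_pretrained("./")               # plain Wav2Vec2Processor
    lm_processor = Wav2Vec2ProcessorWithLM.from_pretrained("./")  # explicit opt-in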
run.sh
CHANGED
@@ -1,6 +1,6 @@
 python run_speech_recognition_ctc.py \
     --dataset_name="mozilla-foundation/common_voice_7_0" \
-    --model_name_or_path="
+    --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
     --dataset_config_name="sv-SE" \
     --output_dir="./" \
     --overwrite_output_dir \
@@ -31,5 +31,4 @@ python run_speech_recognition_ctc.py \
     --fp16 \
     --group_by_length \
     --do_train --do_eval \
-    --push_to_hub
-    --push_lm_to_hub
+    --push_to_hub
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e93512254853170601d69416635484a8a91b518c58d4ec3e92cb2e1d96df8639
+size 3055
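training_args.bin is the pickled TrainingArguments object that transformers' Trainer saves alongside the model; a small sketch (mine) of inspecting it, assuming torch and transformers versions compatible with the run that produced it:

    # Sketch: load the saved TrainingArguments and print key hyperparameters.
    import torch

    training_args = torch.load("training_args.bin")
    print(training_args.learning_rate)
    print(training_args.num_train_epochs)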