lucio committed on
Commit
54feddb
1 Parent(s): bba4f2c

Training in progress, step 4500

Browse files
.ipynb_checkpoints/requirements-checkpoint.txt CHANGED
@@ -6,3 +6,5 @@ jiwer~=2.3.0
6
  soundfile~=0.10.3
7
  transformers~=4.16.2
8
  datasets~=1.18.3
 
 
 
6
  soundfile~=0.10.3
7
  transformers~=4.16.2
8
  datasets~=1.18.3
9
+ pyctcdecode
10
+ https://github.com/kpu/kenlm/archive/master.zip
.ipynb_checkpoints/with_ngram_LM-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63e0de4a374839516480d3ebd15f1a566884c85df3e200ae5ef67ad87896d6ba
3
  size 1262058993
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:184eeda764638171e3397ddf0afb0061278cc440cdb956d065bdeb73bbfe8086
3
  size 1262058993
requirements.txt CHANGED
@@ -6,3 +6,5 @@ jiwer~=2.3.0
6
  soundfile~=0.10.3
7
  transformers~=4.16.2
8
  datasets~=1.18.3
 
 
 
6
  soundfile~=0.10.3
7
  transformers~=4.16.2
8
  datasets~=1.18.3
9
+ pyctcdecode
10
+ https://github.com/kpu/kenlm/archive/master.zip
runs/Feb06_18-52-28_job-0a778896-a7e2-46e9-bcf5-016f91f242cf/events.out.tfevents.1644173767.job-0a778896-a7e2-46e9-bcf5-016f91f242cf.841782.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13ca3a32617dcb433d6774ca3ceadace283e127985b36df8d5d0b281378c3862
3
- size 13946
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e638063e5f5f1ea9473830277ba79d9fe975573b46c8800ded9eb193132d9a90
3
+ size 15096
uz_cv8_train.txt ADDED
The diff for this file is too large to render. See raw diff
 
with_ngram_LM.ipynb ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 8,
6
+ "id": "7ed2fa91",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "/workspace/xls-r-uzbek-cv8\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "%cd ~/xls-r-uzbek-cv8"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 10,
24
+ "id": "672bdf2b",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
32
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
33
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
34
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
35
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
36
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
37
+ "\u001b[0mCollecting https://github.com/kpu/kenlm/archive/master.zip (from -r requirements.txt (line 10))\n",
38
+ " Using cached https://github.com/kpu/kenlm/archive/master.zip (541 kB)\n",
39
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
40
+ "\u001b[?25hRequirement already satisfied: unidecode in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 1)) (1.3.2)\n",
41
+ "Collecting tensorboard\n",
42
+ " Using cached tensorboard-2.8.0-py3-none-any.whl (5.8 MB)\n",
43
+ "Requirement already satisfied: torch in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 3)) (1.10.2)\n",
44
+ "Requirement already satisfied: torchaudio in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 4)) (0.10.2)\n",
45
+ "Requirement already satisfied: jiwer~=2.3.0 in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 5)) (2.3.0)\n",
46
+ "Requirement already satisfied: soundfile~=0.10.3 in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 6)) (0.10.3.post1)\n",
47
+ "Collecting transformers~=4.16.2\n",
48
+ " Using cached transformers-4.16.2-py3-none-any.whl (3.5 MB)\n",
49
+ "Collecting datasets~=1.18.3\n",
50
+ " Using cached datasets-1.18.3-py3-none-any.whl (311 kB)\n",
51
+ "Requirement already satisfied: pyctcdecode in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 9)) (0.3.0)\n",
52
+ "Requirement already satisfied: protobuf>=3.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (3.19.4)\n",
53
+ "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.8.1)\n",
54
+ "Collecting google-auth-oauthlib<0.5,>=0.4.1\n",
55
+ " Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)\n",
56
+ "Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.6.0)\n",
57
+ "Requirement already satisfied: numpy>=1.12.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.19.2)\n",
58
+ "Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (50.3.1.post20201107)\n",
59
+ "Requirement already satisfied: requests<3,>=2.21.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.24.0)\n",
60
+ "Requirement already satisfied: markdown>=2.6.8 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (3.3.6)\n",
61
+ "Requirement already satisfied: grpcio>=1.24.3 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.43.0)\n",
62
+ "Requirement already satisfied: wheel>=0.26 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (0.35.1)\n",
63
+ "Requirement already satisfied: absl-py>=0.4 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.0.0)\n",
64
+ "Requirement already satisfied: werkzeug>=0.11.15 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.0.2)\n",
65
+ "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (0.6.1)\n",
66
+ "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.8/site-packages (from torch->-r requirements.txt (line 3)) (4.0.1)\n",
67
+ "Requirement already satisfied: python-Levenshtein==0.12.2 in /opt/conda/lib/python3.8/site-packages (from jiwer~=2.3.0->-r requirements.txt (line 5)) (0.12.2)\n",
68
+ "Requirement already satisfied: cffi>=1.0 in /opt/conda/lib/python3.8/site-packages (from soundfile~=0.10.3->-r requirements.txt (line 6)) (1.14.3)\n",
69
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (3.0.12)\n",
70
+ "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (2022.1.18)\n",
71
+ "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.0.47)\n",
72
+ "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (21.3)\n",
73
+ "Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.11.4)\n",
74
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.4.0)\n",
75
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (5.4.1)\n",
76
+ "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (4.62.3)\n",
77
+ "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (0.70.12.2)\n",
78
+ "Requirement already satisfied: dill in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (0.3.4)\n",
79
+ "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (1.4.0)\n",
80
+ "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (3.8.1)\n",
81
+ "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (2.0.2)\n",
82
+ "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (6.0.1)\n",
83
+ "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (2022.1.0)\n",
84
+ "Requirement already satisfied: pygtrie<3.0,>=2.1 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode->-r requirements.txt (line 9)) (2.4.2)\n",
85
+ "Requirement already satisfied: hypothesis<7,>=6.14 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode->-r requirements.txt (line 9)) (6.36.1)\n",
86
+ "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from absl-py>=0.4->tensorboard->-r requirements.txt (line 2)) (1.15.0)\n",
87
+ "Requirement already satisfied: pycparser in /opt/conda/lib/python3.8/site-packages (from cffi>=1.0->soundfile~=0.10.3->-r requirements.txt (line 6)) (2.20)\n",
88
+ "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (4.8)\n",
89
+ "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (5.0.0)\n",
90
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (0.2.8)\n",
91
+ "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.8/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 2)) (1.3.1)\n",
92
+ "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode->-r requirements.txt (line 9)) (2.4.0)\n",
93
+ "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode->-r requirements.txt (line 9)) (21.4.0)\n",
94
+ "Requirement already satisfied: importlib-metadata>=4.4 in /opt/conda/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->-r requirements.txt (line 2)) (4.10.1)\n",
95
+ "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging>=20.0->transformers~=4.16.2->-r requirements.txt (line 7)) (3.0.7)\n",
96
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (3.0.4)\n",
97
+ "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (2.10)\n",
98
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (1.25.11)\n",
99
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (2020.12.5)\n",
100
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.7.2)\n",
101
+ "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (4.0.2)\n",
102
+ "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.2.0)\n",
103
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (6.0.2)\n",
104
+ "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.3.0)\n",
105
+ "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (2.0.10)\n",
106
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets~=1.18.3->-r requirements.txt (line 8)) (2.8.2)\n",
107
+ "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets~=1.18.3->-r requirements.txt (line 8)) (2021.1)\n",
108
+ "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers~=4.16.2->-r requirements.txt (line 7)) (1.1.0)\n",
109
+ "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers~=4.16.2->-r requirements.txt (line 7)) (8.0.3)\n",
110
+ "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.8/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard->-r requirements.txt (line 2)) (3.7.0)\n",
111
+ "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (0.4.8)\n",
112
+ "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 2)) (3.2.0)\n",
113
+ "Building wheels for collected packages: kenlm\n",
114
+ " Building wheel for kenlm (setup.py) ... \u001b[?25ldone\n",
115
+ "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp38-cp38-linux_x86_64.whl size=2348591 sha256=d5c8e5430d89f59ddde39bc78aec471c1e66ef43b6cde792711b2e97d7b8b9dc\n",
116
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-hhcfnszu/wheels/ff/08/4e/a3ddc0e786e0f3c1fcd2e7a82c4324c02fc3ae2638471406d2\n",
117
+ "Successfully built kenlm\n",
118
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
119
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
120
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
121
+ "\u001b[0mInstalling collected packages: kenlm, transformers, google-auth-oauthlib, tensorboard, datasets\n",
122
+ "\u001b[33m WARNING: The script transformers-cli is installed in '/workspace/.local/bin' which is not on PATH.\n",
123
+ " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
124
+ "\u001b[0m\u001b[33m WARNING: The script google-oauthlib-tool is installed in '/workspace/.local/bin' which is not on PATH.\n",
125
+ " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
126
+ "\u001b[0m\u001b[33m WARNING: The script tensorboard is installed in '/workspace/.local/bin' which is not on PATH.\n",
127
+ " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
128
+ "\u001b[0m\u001b[33m WARNING: The script datasets-cli is installed in '/workspace/.local/bin' which is not on PATH.\n",
129
+ " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
130
+ "\u001b[0mSuccessfully installed datasets-1.18.3 google-auth-oauthlib-0.4.6 kenlm-0.0.0 tensorboard-2.8.0 transformers-4.16.2\n",
131
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
132
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
133
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n",
134
+ "\u001b[0m"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "!python -m pip install -r requirements.txt --user"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 14,
145
+ "id": "722882a2",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "from transformers import AutoFeatureExtractor, AutoTokenizer, pipeline\n",
150
+ "from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric\n",
151
+ "\n",
152
+ "import re\n",
153
+ "import string\n",
154
+ "import unidecode"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 12,
160
+ "id": "e288d5c0",
161
+ "metadata": {},
162
+ "outputs": [
163
+ {
164
+ "name": "stderr",
165
+ "output_type": "stream",
166
+ "text": [
167
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/uz/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
168
+ ]
169
+ },
170
+ {
171
+ "data": {
172
+ "application/vnd.jupyter.widget-view+json": {
173
+ "model_id": "a8aad37a859241ff81ac932edc204bf8",
174
+ "version_major": 2,
175
+ "version_minor": 0
176
+ },
177
+ "text/plain": [
178
+ " 0%| | 0/5 [00:00<?, ?it/s]"
179
+ ]
180
+ },
181
+ "metadata": {},
182
+ "output_type": "display_data"
183
+ },
184
+ {
185
+ "data": {
186
+ "text/plain": [
187
+ "DatasetDict({\n",
188
+ " train: Dataset({\n",
189
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
190
+ " num_rows: 39456\n",
191
+ " })\n",
192
+ " test: Dataset({\n",
193
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
194
+ " num_rows: 11598\n",
195
+ " })\n",
196
+ " validation: Dataset({\n",
197
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
198
+ " num_rows: 10849\n",
199
+ " })\n",
200
+ " other: Dataset({\n",
201
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
202
+ " num_rows: 119461\n",
203
+ " })\n",
204
+ " invalidated: Dataset({\n",
205
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
206
+ " num_rows: 11276\n",
207
+ " })\n",
208
+ "})"
209
+ ]
210
+ },
211
+ "execution_count": 12,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "dataset_dict = load_dataset(\"mozilla-foundation/common_voice_8_0\", \"uz\", use_auth_token=True)\n",
218
+ "dataset_dict"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 17,
224
+ "id": "86031b96",
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "chars_to_ignore_regex=f\"[{re.escape(string.punctuation)}]\" \n",
229
+ "\n",
230
+ "def remove_special_characters(batch):\n",
231
+ " batch[\"text\"] = re.sub(\n",
232
+ " chars_to_ignore_regex, \n",
233
+ " \"\", \n",
234
+ " re.sub(\"['`´]\", \"’\", # elsewhere probably meant as glottal stop\n",
235
+ " re.sub(\"([og])['`´]\", \"\\g<1>‘\", # after o/g indicate modified char\n",
236
+ " unidecode.unidecode(batch[\"sentence\"]).lower()\n",
237
+ " )\n",
238
+ " )\n",
239
+ " ) + \" \"\n",
240
+ " return batch"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 18,
246
+ "id": "c2b27b8e",
247
+ "metadata": {},
248
+ "outputs": [
249
+ {
250
+ "data": {
251
+ "application/vnd.jupyter.widget-view+json": {
252
+ "model_id": "4b8d2f0df8ea46bdaee2c94996583c5e",
253
+ "version_major": 2,
254
+ "version_minor": 0
255
+ },
256
+ "text/plain": [
257
+ "0ex [00:00, ?ex/s]"
258
+ ]
259
+ },
260
+ "metadata": {},
261
+ "output_type": "display_data"
262
+ }
263
+ ],
264
+ "source": [
265
+ "dataset = dataset_dict[\"train\"].map(remove_special_characters, remove_columns=dataset_dict[\"train\"].column_names)"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 23,
271
+ "id": "02f89e8e",
272
+ "metadata": {},
273
+ "outputs": [
274
+ {
275
+ "name": "stdout",
276
+ "output_type": "stream",
277
+ "text": [
278
+ " 0 244494 2030240 uz_cv8_train.txt\n"
279
+ ]
280
+ }
281
+ ],
282
+ "source": [
283
+ "text_data = \"uz_cv8_train.txt\"\n",
284
+ "with open(text_data, \"w\") as fs:\n",
285
+ " fs.write(\" \".join(dataset[\"text\"]))\n",
286
+ "\n",
287
+ "!wc $text_data"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 25,
293
+ "id": "27e4f250",
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "name": "stdout",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "wc: invalid option -- 'a'\n",
301
+ "Try 'wc --help' for more information.\n"
302
+ ]
303
+ }
304
+ ],
305
+ "source": [
306
+ "!wc -a $text_data"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "e0238868",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": []
316
+ }
317
+ ],
318
+ "metadata": {
319
+ "kernelspec": {
320
+ "display_name": "Python 3 (ipykernel)",
321
+ "language": "python",
322
+ "name": "python3"
323
+ },
324
+ "language_info": {
325
+ "codemirror_mode": {
326
+ "name": "ipython",
327
+ "version": 3
328
+ },
329
+ "file_extension": ".py",
330
+ "mimetype": "text/x-python",
331
+ "name": "python",
332
+ "nbconvert_exporter": "python",
333
+ "pygments_lexer": "ipython3",
334
+ "version": "3.8.8"
335
+ }
336
+ },
337
+ "nbformat": 4,
338
+ "nbformat_minor": 5
339
+ }