Commit a733f91
Duplicate from anton-l/common_voice_generator
Co-authored-by: Anton Lozhkov <anton-l@users.noreply.huggingface.co>
- .gitattributes +27 -0
- .gitignore +1 -0
- README.md +16 -0
- README.template.md +241 -0
- dataset_script.py +260 -0
- generate_datasets.py +135 -0
- languages.ftl +181 -0
- publish.py +3 -0
- test.py +5 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+common_voice_*
README.md
ADDED
@@ -0,0 +1,16 @@
+---
+duplicated_from: anton-l/common_voice_generator
+---
+## Common Voice release generator
+
+1. Copy the latest release id from the `RELEASES` dict in https://github.com/common-voice/common-voice/blob/main/web/src/components/pages/datasets/releases.ts
+   to the `VERSIONS` variable in `generate_datasets.py`.
+2. Copy the languages from https://github.com/common-voice/common-voice/blob/release-v1.78.0/web/locales/en/messages.ftl
+   (replacing `release-v1.78.0` with the latest version tag) to the `languages.ftl` file.
+3. Run `python generate_datasets.py` to generate the dataset repos.
+4. `cd ..`
+5. `huggingface-cli repo create --type dataset --organization mozilla-foundation common_voice_11_0`
+6. `git clone https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0`
+7. `cd common_voice_11_0`
+8. `cp ../common_voice_generator/common_voice_11_0/* ./`
+9. `git add . && git commit -m "Release" && git push`
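Steps 5-9 above can also be scripted end to end with `huggingface_hub` instead of the CLI and a local git clone. A minimal sketch, not part of this repo; the repo id and output folder are assumed to match step 3:

```python
# Hypothetical automation of steps 5-9, using only huggingface_hub.
from huggingface_hub import create_repo, upload_folder

repo_id = "mozilla-foundation/common_voice_11_0"  # assumed target repo
create_repo(repo_id, repo_type="dataset", exist_ok=True)
upload_folder(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path="common_voice_11_0",  # output dir created by generate_datasets.py
    commit_message="Release",
)
```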
README.template.md
ADDED
@@ -0,0 +1,241 @@
+---
+pretty_name: {{NAME}}
+annotations_creators:
+- crowdsourced
+language_creators:
+- crowdsourced
+language_bcp47:
+{{LANGUAGES}}
+license:
+- cc0-1.0
+multilinguality:
+- multilingual
+size_categories:
+{{SIZES}}
+source_datasets:
+- extended|common_voice
+task_categories:
+- speech-processing
+task_ids:
+- automatic-speech-recognition
+paperswithcode_id: common-voice
+extra_gated_prompt: "By clicking on “Access repository” below, you also agree to not attempt to determine the identity of speakers in the Common Voice dataset."
+---
+
+# Dataset Card for {{NAME}}
+
+## Table of Contents
+- [Dataset Description](#dataset-description)
+  - [Dataset Summary](#dataset-summary)
+  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+  - [Languages](#languages)
+- [Dataset Structure](#dataset-structure)
+  - [Data Instances](#data-instances)
+  - [Data Fields](#data-fields)
+  - [Data Splits](#data-splits)
+- [Dataset Creation](#dataset-creation)
+  - [Curation Rationale](#curation-rationale)
+  - [Source Data](#source-data)
+  - [Annotations](#annotations)
+  - [Personal and Sensitive Information](#personal-and-sensitive-information)
+- [Considerations for Using the Data](#considerations-for-using-the-data)
+  - [Social Impact of Dataset](#social-impact-of-dataset)
+  - [Discussion of Biases](#discussion-of-biases)
+  - [Other Known Limitations](#other-known-limitations)
+- [Additional Information](#additional-information)
+  - [Dataset Curators](#dataset-curators)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+  - [Contributions](#contributions)
+
+## Dataset Description
+
+- **Homepage:** https://commonvoice.mozilla.org/en/datasets
+- **Repository:** https://github.com/common-voice/common-voice
+- **Paper:** https://arxiv.org/abs/1912.06670
+- **Leaderboard:** https://paperswithcode.com/dataset/common-voice
+- **Point of Contact:** [Anton Lozhkov](mailto:anton@huggingface.co)
+
+### Dataset Summary
+
+The Common Voice dataset consists of unique MP3 files, each with a corresponding text file.
+Many of the {{TOTAL_HRS}} recorded hours in the dataset also include demographic metadata like age, sex, and accent
+that can help improve the accuracy of speech recognition engines.
+
+The dataset currently consists of {{VAL_HRS}} validated hours in {{NUM_LANGS}} languages, but more voices and languages are always being added.
+Take a look at the [Languages](https://commonvoice.mozilla.org/en/languages) page to request a language or start contributing.
+
+### Supported Tasks and Leaderboards
+
+The results for models trained on the Common Voice datasets are available via the
+[🤗 Speech Bench](https://huggingface.co/spaces/huggingface/hf-speech-bench).
+
+### Languages
+
+```
+{{LANGUAGES_HUMAN}}
+```
+
+## Dataset Structure
+
+### Data Instances
+
+A typical data point comprises the `path` to the audio file and its `sentence`.
+Additional fields include `accent`, `age`, `client_id`, `up_votes`, `down_votes`, `gender`, `locale` and `segment`.
+
+```python
+{
+    'client_id': 'd59478fbc1ee646a28a3c652a119379939123784d99131b865a89f8b21c81f69276c48bd574b81267d9d1a77b83b43e6d475a6cfc79c232ddbca946ae9c7afc5',
+    'path': 'et/clips/common_voice_et_18318995.mp3',
+    'audio': {
+        'path': 'et/clips/common_voice_et_18318995.mp3',
+        'array': array([-0.00048828, -0.00018311, -0.00137329, ..., 0.00079346, 0.00091553, 0.00085449], dtype=float32),
+        'sampling_rate': 48000
+    },
+    'sentence': 'Tasub kokku saada inimestega, keda tunned juba ammust ajast saati.',
+    'up_votes': 2,
+    'down_votes': 0,
+    'age': 'twenties',
+    'gender': 'male',
+    'accent': '',
+    'locale': 'et',
+    'segment': ''
+}
+```
+
+### Data Fields
+
+`client_id` (`string`): An id for which client (voice) made the recording
+
+`path` (`string`): The path to the audio file
+
+`audio` (`dict`): A dictionary containing the path to the downloaded audio file, the decoded audio array, and the sampling rate. Note that when accessing the audio column (`dataset[0]["audio"]`), the audio file is automatically decoded and resampled to `dataset.features["audio"].sampling_rate`. Decoding and resampling a large number of audio files can take a significant amount of time, so it is important to query the sample index before the `"audio"` column, *i.e.* `dataset[0]["audio"]` should **always** be preferred over `dataset["audio"][0]`.
+
+`sentence` (`string`): The sentence the user was prompted to speak
+
+`up_votes` (`int64`): How many upvotes the audio file has received from reviewers
+
+`down_votes` (`int64`): How many downvotes the audio file has received from reviewers
+
+`age` (`string`): The age of the speaker (e.g. `teens`, `twenties`, `fifties`)
+
+`gender` (`string`): The gender of the speaker
+
+`accent` (`string`): Accent of the speaker
+
+`locale` (`string`): The locale of the speaker
+
+`segment` (`string`): Usually an empty field
+
+### Data Splits
+
+The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.
+
+The validated data is data that has been validated by reviewers and received upvotes indicating that the data is of high quality.
+
+The invalidated data is data that has been invalidated by reviewers
+and received downvotes indicating that the data is of low quality.
+
+The reported data is data that has been reported, for various reasons.
+
+The other data is data that has not yet been reviewed.
+
+The dev, test and train splits all contain data that has been reviewed and deemed of high quality.
+
+## Data Preprocessing Recommended by Hugging Face
+
+The following are data preprocessing steps advised by the Hugging Face team. They are accompanied by an example code snippet that shows how to put them into practice.
+
+Many examples in this dataset have trailing quotation marks, e.g. _“the cat sat on the mat.“_. These trailing quotation marks do not change the actual meaning of the sentence, and it is nearly impossible to infer whether a sentence is a quotation or not from audio data alone. In these cases, it is advised to strip the quotation marks, leaving: _the cat sat on the mat_.
+
+In addition, the majority of training sentences end in punctuation ( . or ? or ! ), whereas just a small proportion do not. In the dev set, **almost all** sentences end in punctuation. Thus, it is recommended to append a full-stop ( . ) to the end of the small number of training examples that do not end in punctuation.
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("mozilla-foundation/{{DATASET_PATH}}", "en", use_auth_token=True)
+
+def prepare_dataset(batch):
+    """Function to preprocess the dataset with the .map method"""
+    transcription = batch["sentence"]
+
+    if transcription.startswith('"') and transcription.endswith('"'):
+        # we can remove trailing quotation marks as they do not affect the transcription
+        transcription = transcription[1:-1]
+
+    if transcription[-1] not in [".", "?", "!"]:
+        # append a full-stop to sentences that do not end in punctuation
+        transcription = transcription + "."
+
+    batch["sentence"] = transcription
+
+    return batch
+
+ds = ds.map(prepare_dataset, desc="preprocess dataset")
+```
+
+## Dataset Creation
+
+### Curation Rationale
+
+[Needs More Information]
+
+### Source Data
+
+#### Initial Data Collection and Normalization
+
+[Needs More Information]
+
+#### Who are the source language producers?
+
+[Needs More Information]
+
+### Annotations
+
+#### Annotation process
+
+[Needs More Information]
+
+#### Who are the annotators?
+
+[Needs More Information]
+
+### Personal and Sensitive Information
+
+The dataset consists of people who have donated their voice online. You agree to not attempt to determine the identity of speakers in the Common Voice dataset.
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+The dataset consists of people who have donated their voice online. You agree to not attempt to determine the identity of speakers in the Common Voice dataset.
+
+### Discussion of Biases
+
+[More Information Needed]
+
+### Other Known Limitations
+
+[More Information Needed]
+
+## Additional Information
+
+### Dataset Curators
+
+[More Information Needed]
+
+### Licensing Information
+
+Public Domain, [CC-0](https://creativecommons.org/share-your-work/public-domain/cc0/)
+
+### Citation Information
+
+```
+@inproceedings{commonvoice:2020,
+  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
+  title = {Common Voice: A Massively-Multilingual Speech Corpus},
+  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
+  pages = {4211--4215},
+  year = 2020
+}
+```
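One point in the template worth reinforcing is the `audio`-column access pattern: row-first indexing decodes a single clip, while column-first indexing decodes every clip. A minimal sketch of the recommended usage, assuming access to the gated `common_voice_11_0` repo:

```python
from datasets import Audio, load_dataset

ds = load_dataset("mozilla-foundation/common_voice_11_0", "et", split="test", use_auth_token=True)

# Row-first: decodes only this one clip.
print(ds[0]["audio"]["sampling_rate"])  # 48000

# Optional: resample lazily on access by casting the audio feature.
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
print(ds[0]["audio"]["sampling_rate"])  # 16000
```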
dataset_script.py
ADDED
@@ -0,0 +1,260 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Common Voice Dataset"""
+
+
+import csv
+import os
+import urllib.parse
+
+import datasets
+import requests
+from datasets.utils.py_utils import size_str
+from huggingface_hub import HfApi, HfFolder
+
+from .languages import LANGUAGES
+from .release_stats import STATS
+
+_CITATION = """\
+@inproceedings{commonvoice:2020,
+  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
+  title = {Common Voice: A Massively-Multilingual Speech Corpus},
+  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
+  pages = {4211--4215},
+  year = 2020
+}
+"""
+
+_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"
+
+_LICENSE = "https://creativecommons.org/publicdomain/zero/1.0/"
+
+_API_URL = "https://commonvoice.mozilla.org/api/v1"
+
+
+class CommonVoiceConfig(datasets.BuilderConfig):
+    """BuilderConfig for CommonVoice."""
+
+    def __init__(self, name, version, **kwargs):
+        self.language = kwargs.pop("language", None)
+        self.release_date = kwargs.pop("release_date", None)
+        self.num_clips = kwargs.pop("num_clips", None)
+        self.num_speakers = kwargs.pop("num_speakers", None)
+        self.validated_hr = kwargs.pop("validated_hr", None)
+        self.total_hr = kwargs.pop("total_hr", None)
+        self.size_bytes = kwargs.pop("size_bytes", None)
+        self.size_human = size_str(self.size_bytes)
+        description = (
+            f"Common Voice speech to text dataset in {self.language} released on {self.release_date}. "
+            f"The dataset comprises {self.validated_hr} hours of validated transcribed speech data "
+            f"out of {self.total_hr} hours in total from {self.num_speakers} speakers. "
+            f"The dataset contains {self.num_clips} audio clips and has a size of {self.size_human}."
+        )
+        super(CommonVoiceConfig, self).__init__(
+            name=name,
+            version=datasets.Version(version),
+            description=description,
+            **kwargs,
+        )
+
+
+class CommonVoice(datasets.GeneratorBasedBuilder):
+    DEFAULT_CONFIG_NAME = "en"
+    DEFAULT_WRITER_BATCH_SIZE = 1000
+
+    BUILDER_CONFIGS = [
+        CommonVoiceConfig(
+            name=lang,
+            version=STATS["version"],
+            language=LANGUAGES[lang],
+            release_date=STATS["date"],
+            num_clips=lang_stats["clips"],
+            num_speakers=lang_stats["users"],
+            validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
+            total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
+            size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
+        )
+        for lang, lang_stats in STATS["locales"].items()
+    ]
+
+    def _info(self):
+        total_languages = len(STATS["locales"])
+        total_valid_hours = STATS["totalValidHrs"]
+        description = (
+            "Common Voice is Mozilla's initiative to help teach machines how real people speak. "
+            f"The dataset currently consists of {total_valid_hours} validated hours of speech "
+            f"in {total_languages} languages, but more voices and languages are always being added."
+        )
+        features = datasets.Features(
+            {
+                "client_id": datasets.Value("string"),
+                "path": datasets.Value("string"),
+                "audio": datasets.features.Audio(sampling_rate=48_000),
+                "sentence": datasets.Value("string"),
+                "up_votes": datasets.Value("int64"),
+                "down_votes": datasets.Value("int64"),
+                "age": datasets.Value("string"),
+                "gender": datasets.Value("string"),
+                "accent": datasets.Value("string"),
+                "locale": datasets.Value("string"),
+                "segment": datasets.Value("string"),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            description=description,
+            features=features,
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+            version=self.config.version,
+            # task_templates=[
+            #     AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
+            # ],
+        )
+
+    def _get_bundle_url(self, locale, url_template):
+        # path = encodeURIComponent(path)
+        path = url_template.replace("{locale}", locale)
+        path = urllib.parse.quote(path.encode("utf-8"), safe="~()*!.'")
+        # use_cdn = self.config.size_bytes < 20 * 1024 * 1024 * 1024
+        # response = requests.get(f"{_API_URL}/bucket/dataset/{path}/{use_cdn}", timeout=10.0).json()
+        response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0).json()
+        return response["url"]
+
+    def _log_download(self, locale, bundle_version, auth_token):
+        if isinstance(auth_token, bool):
+            auth_token = HfFolder().get_token()
+        whoami = HfApi().whoami(auth_token)
+        email = whoami["email"] if "email" in whoami else ""
+        payload = {"email": email, "locale": locale, "dataset": bundle_version}
+        requests.post(f"{_API_URL}/{locale}/downloaders", json=payload).json()
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        hf_auth_token = dl_manager.download_config.use_auth_token
+        if hf_auth_token is None:
+            raise ConnectionError(
+                "Please set use_auth_token=True or use_auth_token='<TOKEN>' to download this dataset"
+            )
+
+        bundle_url_template = STATS["bundleURLTemplate"]
+        bundle_version = bundle_url_template.split("/")[0]
+        dl_manager.download_config.ignore_url_params = True
+
+        self._log_download(self.config.name, bundle_version, hf_auth_token)
+        archive_path = dl_manager.download(self._get_bundle_url(self.config.name, bundle_url_template))
+        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else None
+
+        if self.config.version < datasets.Version("5.0.0"):
+            path_to_data = ""
+        else:
+            path_to_data = "/".join([bundle_version, self.config.name])
+        path_to_clips = "/".join([path_to_data, "clips"]) if path_to_data else "clips"
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "local_extracted_archive": local_extracted_archive,
+                    "archive_iterator": dl_manager.iter_archive(archive_path),
+                    "metadata_filepath": "/".join([path_to_data, "train.tsv"]) if path_to_data else "train.tsv",
+                    "path_to_clips": path_to_clips,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "local_extracted_archive": local_extracted_archive,
+                    "archive_iterator": dl_manager.iter_archive(archive_path),
+                    "metadata_filepath": "/".join([path_to_data, "test.tsv"]) if path_to_data else "test.tsv",
+                    "path_to_clips": path_to_clips,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "local_extracted_archive": local_extracted_archive,
+                    "archive_iterator": dl_manager.iter_archive(archive_path),
+                    "metadata_filepath": "/".join([path_to_data, "dev.tsv"]) if path_to_data else "dev.tsv",
+                    "path_to_clips": path_to_clips,
+                },
+            ),
+            datasets.SplitGenerator(
+                name="other",
+                gen_kwargs={
+                    "local_extracted_archive": local_extracted_archive,
+                    "archive_iterator": dl_manager.iter_archive(archive_path),
+                    "metadata_filepath": "/".join([path_to_data, "other.tsv"]) if path_to_data else "other.tsv",
+                    "path_to_clips": path_to_clips,
+                },
+            ),
+            datasets.SplitGenerator(
+                name="invalidated",
+                gen_kwargs={
+                    "local_extracted_archive": local_extracted_archive,
+                    "archive_iterator": dl_manager.iter_archive(archive_path),
+                    "metadata_filepath": "/".join([path_to_data, "invalidated.tsv"])
+                    if path_to_data
+                    else "invalidated.tsv",
+                    "path_to_clips": path_to_clips,
+                },
+            ),
+        ]
+
+    def _generate_examples(
+        self,
+        local_extracted_archive,
+        archive_iterator,
+        metadata_filepath,
+        path_to_clips,
+    ):
+        """Yields examples."""
+        data_fields = list(self._info().features.keys())
+        metadata = {}
+        metadata_found = False
+        for path, f in archive_iterator:
+            if path == metadata_filepath:
+                metadata_found = True
+                lines = (line.decode("utf-8") for line in f)
+                reader = csv.DictReader(lines, delimiter="\t", quoting=csv.QUOTE_NONE)
+                for row in reader:
+                    # set absolute path for mp3 audio file
+                    if not row["path"].endswith(".mp3"):
+                        row["path"] += ".mp3"
+                    row["path"] = os.path.join(path_to_clips, row["path"])
+                    # accent -> accents in CV 8.0
+                    if "accents" in row:
+                        row["accent"] = row["accents"]
+                        del row["accents"]
+                    # if data is incomplete, fill with empty values
+                    for field in data_fields:
+                        if field not in row:
+                            row[field] = ""
+                    metadata[row["path"]] = row
+            elif path.startswith(path_to_clips):
+                assert metadata_found, "Found audio clips before the metadata TSV file."
+                if not metadata:
+                    break
+                if path in metadata:
+                    result = dict(metadata[path])
+                    # set the audio feature and the path to the extracted file
+                    path = os.path.join(local_extracted_archive, path) if local_extracted_archive else path
+                    result["audio"] = {"path": path, "bytes": f.read()}
+                    # set path to None if the audio file doesn't exist locally (i.e. in streaming mode)
+                    result["path"] = path if local_extracted_archive else None
+
+                    yield path, result
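Since `_generate_examples` reads the metadata TSV out of the archive before any clips and never requires an extracted copy (`local_extracted_archive` may be `None`), the script also supports streaming. A minimal usage sketch, assuming access to the gated repo:

```python
from datasets import load_dataset

# In streaming mode, audio bytes are read straight out of the tar via
# iter_archive, and example["path"] is None because nothing is extracted.
ds = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "et",
    split="test",
    streaming=True,
    use_auth_token=True,
)
sample = next(iter(ds))
print(sample["sentence"])
```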
generate_datasets.py
ADDED
@@ -0,0 +1,135 @@
+import json
+import os
+import shutil
+
+import requests
+
+RELEASE_STATS_URL = "https://commonvoice.mozilla.org/dist/releases/{}.json"
+VERSIONS = [
+    {"semver": "1.0.0", "name": "common_voice_1_0", "release": "cv-corpus-1"},
+    {"semver": "2.0.0", "name": "common_voice_2_0", "release": "cv-corpus-2"},
+    {"semver": "3.0.0", "name": "common_voice_3_0", "release": "cv-corpus-3"},
+    {
+        "semver": "4.0.0",
+        "name": "common_voice_4_0",
+        "release": "cv-corpus-4-2019-12-10",
+    },
+    {
+        "semver": "5.0.0",
+        "name": "common_voice_5_0",
+        "release": "cv-corpus-5-2020-06-22",
+    },
+    {
+        "semver": "5.1.0",
+        "name": "common_voice_5_1",
+        "release": "cv-corpus-5.1-2020-06-22",
+    },
+    {
+        "semver": "6.0.0",
+        "name": "common_voice_6_0",
+        "release": "cv-corpus-6.0-2020-12-11",
+    },
+    {
+        "semver": "6.1.0",
+        "name": "common_voice_6_1",
+        "release": "cv-corpus-6.1-2020-12-11",
+    },
+    {
+        "semver": "7.0.0",
+        "name": "common_voice_7_0",
+        "release": "cv-corpus-7.0-2021-07-21",
+    },
+    {
+        "semver": "8.0.0",
+        "name": "common_voice_8_0",
+        "release": "cv-corpus-8.0-2022-01-19",
+    },
+    {
+        "semver": "9.0.0",
+        "name": "common_voice_9_0",
+        "release": "cv-corpus-9.0-2022-04-27",
+    },
+    {
+        "semver": "10.0.0",
+        "name": "common_voice_10_0",
+        "release": "cv-corpus-10.0-2022-07-04",
+    },
+    {
+        "semver": "11.0.0",
+        "name": "common_voice_11_0",
+        "release": "cv-corpus-11.0-2022-09-21",
+    },
+]
+
+
+def num_to_size(num: int):
+    if num < 1000:
+        return "n<1K"
+    elif num < 10_000:
+        return "1K<n<10K"
+    elif num < 100_000:
+        return "10K<n<100K"
+    elif num < 1_000_000:
+        return "100K<n<1M"
+    elif num < 10_000_000:
+        return "1M<n<10M"
+    elif num < 100_000_000:
+        return "10M<n<100M"
+    elif num < 1_000_000_000:
+        return "100M<n<1B"
+
+
+def get_language_names():
+    # source: https://github.com/common-voice/common-voice/blob/release-v1.71.0/web/locales/en/messages.ftl
+    languages = {}
+    with open("languages.ftl") as fin:
+        for line in fin:
+            lang_code, lang_name = line.strip().split(" = ")
+            languages[lang_code] = lang_name
+
+    return languages
+
+
+def main():
+    language_names = get_language_names()
+
+    for version in VERSIONS:
+        stats_url = RELEASE_STATS_URL.format(version["release"])
+        release_stats = requests.get(stats_url).text
+        release_stats = json.loads(release_stats)
+        release_stats["version"] = version["semver"]
+
+        dataset_path = version["name"]
+        os.makedirs(dataset_path, exist_ok=True)
+        with open(f"{dataset_path}/release_stats.py", "w") as fout:
+            fout.write("STATS = " + str(release_stats))
+
+        with open("README.template.md", "r") as fin:
+            readme = fin.read()
+        readme = readme.replace("{{NAME}}", release_stats["name"])
+        readme = readme.replace("{{DATASET_PATH}}", version["name"])
+
+        locales = sorted(release_stats["locales"].keys())
+        languages = [f"- {loc}" for loc in locales]
+        readme = readme.replace("{{LANGUAGES}}", "\n".join(languages))
+
+        sizes = [f"  {loc}:\n  - {num_to_size(release_stats['locales'][loc]['clips'])}" for loc in locales]
+        readme = readme.replace("{{SIZES}}", "\n".join(sizes))
+
+        languages_human = sorted([language_names[loc] for loc in locales])
+        readme = readme.replace("{{LANGUAGES_HUMAN}}", ", ".join(languages_human))
+
+        readme = readme.replace("{{TOTAL_HRS}}", str(release_stats["totalHrs"]))
+        readme = readme.replace("{{VAL_HRS}}", str(release_stats["totalValidHrs"]))
+        readme = readme.replace("{{NUM_LANGS}}", str(len(locales)))
+
+        with open(f"{dataset_path}/README.md", "w") as fout:
+            fout.write(readme)
+        with open(f"{dataset_path}/languages.py", "w") as fout:
+            fout.write("LANGUAGES = " + str(language_names))
+
+        shutil.copy("dataset_script.py", f"{dataset_path}/{dataset_path}.py")
+
+
+if __name__ == "__main__":
+    main()
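As a quick sanity check on the `{{SIZES}}` substitution, `num_to_size` maps a locale's clip count onto the card's size-category buckets; the counts below are invented for illustration:

```python
# Invented clip counts, for illustration only.
assert num_to_size(950) == "n<1K"
assert num_to_size(18_500) == "10K<n<100K"
assert num_to_size(2_300_000) == "1M<n<10M"
```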
languages.ftl
ADDED
@@ -0,0 +1,181 @@
+ab = Abkhaz
+ace = Acehnese
+ady = Adyghe
+af = Afrikaans
+am = Amharic
+an = Aragonese
+ar = Arabic
+arn = Mapudungun
+as = Assamese
+ast = Asturian
+az = Azerbaijani
+ba = Bashkir
+bas = Basaa
+be = Belarusian
+bg = Bulgarian
+bn = Bengali
+br = Breton
+bs = Bosnian
+bxr = Buryat
+ca = Catalan
+cak = Kaqchikel
+ckb = Central Kurdish
+cnh = Hakha Chin
+co = Corsican
+cs = Czech
+cv = Chuvash
+cy = Welsh
+da = Danish
+de = German
+dsb = Sorbian, Lower
+dv = Dhivehi
+dyu = Dioula
+el = Greek
+en = English
+eo = Esperanto
+es = Spanish
+et = Estonian
+eu = Basque
+fa = Persian
+ff = Fulah
+fi = Finnish
+fo = Faroese
+fr = French
+fy-NL = Frisian
+ga-IE = Irish
+gl = Galician
+gn = Guarani
+gom = Goan Konkani
+ha = Hausa
+he = Hebrew
+hi = Hindi
+hil = Hiligaynon
+hr = Croatian
+hsb = Sorbian, Upper
+ht = Haitian
+hu = Hungarian
+hy-AM = Armenian
+hyw = Armenian Western
+ia = Interlingua
+id = Indonesian
+ie = Interlingue
+ig = Igbo
+is = Icelandic
+it = Italian
+izh = Izhorian
+ja = Japanese
+jbo = Lojban
+ka = Georgian
+kaa = Karakalpak
+kab = Kabyle
+kbd = Kabardian
+ki = Kikuyu
+kk = Kazakh
+km = Khmer
+kmr = Kurmanji Kurdish
+kn = Kannada
+knn = Konkani (Devanagari)
+ko = Korean
+kpv = Komi-Zyrian
+kw = Cornish
+ky = Kyrgyz
+lb = Luxembourgish
+lg = Luganda
+lij = Ligurian
+ln = Lingala
+lo = Lao
+lt = Lithuanian
+lv = Latvian
+mai = Maithili
+mdf = Moksha
+mg = Malagasy
+mhr = Meadow Mari
+mk = Macedonian
+ml = Malayalam
+mn = Mongolian
+mni = Meetei Lon
+mos = Mossi
+mr = Marathi
+mrj = Hill Mari
+ms = Malay
+mt = Maltese
+my = Burmese
+myv = Erzya
+nan-tw = Taiwanese (Minnan)
+nb-NO = Norwegian Bokmål
+nd = IsiNdebele (North)
+ne-NP = Nepali
+nia = Nias
+nl = Dutch
+nn-NO = Norwegian Nynorsk
+nr = IsiNdebele (South)
+nso = Northern Sotho
+nyn = Runyankole
+oc = Occitan
+om = Afaan Ormoo
+or = Odia
+pa-IN = Punjabi
+pap-AW = Papiamento (Aruba)
+pl = Polish
+ps = Pashto
+pt = Portuguese
+quc = K'iche'
+quy = Quechua Chanka
+rm-sursilv = Romansh Sursilvan
+rm-vallader = Romansh Vallader
+ro = Romanian
+ru = Russian
+rw = Kinyarwanda
+sah = Sakha
+sat = Santali (Ol Chiki)
+sc = Sardinian
+scn = Sicilian
+sdh = Southern Kurdish
+shi = Shilha
+si = Sinhala
+sk = Slovak
+skr = Saraiki
+sl = Slovenian
+snk = Soninke
+so = Somali
+sq = Albanian
+sr = Serbian
+ss = Siswati
+st = Southern Sotho
+sv-SE = Swedish
+sw = Swahili
+syr = Syriac
+ta = Tamil
+te = Telugu
+tg = Tajik
+th = Thai
+ti = Tigrinya
+tig = Tigre
+tk = Turkmen
+tl = Tagalog
+tn = Setswana
+tok = Toki Pona
+tr = Turkish
+ts = Xitsonga
+tt = Tatar
+tw = Twi
+ty = Tahitian
+uby = Ubykh
+udm = Udmurt
+ug = Uyghur
+uk = Ukrainian
+ur = Urdu
+uz = Uzbek
+ve = Tshivenda
+vec = Venetian
+vi = Vietnamese
+vot = Votic
+xh = Xhosa
+yi = Yiddish
+yo = Yoruba
+yue = Cantonese
+zgh = Tamazight
+zh-CN = Chinese (China)
+zh-HK = Chinese (Hong Kong)
+zh-TW = Chinese (Taiwan)
+zu = Zulu
publish.py
ADDED
@@ -0,0 +1,3 @@
+from huggingface_hub import create_repo
+
+create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset")
test.py
ADDED
@@ -0,0 +1,5 @@
+from datasets import load_dataset
+
+dataset = load_dataset("./common_voice_11_0", "et", split="test", use_auth_token=True)
+print(dataset)
+print(dataset[100])