add CV 10
Browse files- .gitattributes +0 -0
- .gitignore +0 -0
- README.template.md +35 -3
- dataset_script.py +3 -3
- generate_datasets.py +5 -0
- languages.ftl +13 -1
- publish.py +3 -0
- test.py +1 -1
.gitattributes
CHANGED
File without changes
|
.gitignore
CHANGED
File without changes
|
README.template.md
CHANGED
@@ -4,9 +4,9 @@ annotations_creators:
|
|
4 |
- crowdsourced
|
5 |
language_creators:
|
6 |
- crowdsourced
|
7 |
-
|
8 |
{{LANGUAGES}}
|
9 |
-
|
10 |
- cc0-1.0
|
11 |
multilinguality:
|
12 |
- multilingual
|
@@ -68,7 +68,7 @@ Take a look at the [Languages](https://commonvoice.mozilla.org/en/languages) pag
|
|
68 |
### Supported Tasks and Leaderboards
|
69 |
|
70 |
The results for models trained on the Common Voice datasets are available via the
|
71 |
-
[
|
72 |
|
73 |
### Languages
|
74 |
|
@@ -142,6 +142,38 @@ The other data is data that has not yet been reviewed.
|
|
142 |
|
143 |
The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
## Dataset Creation
|
146 |
|
147 |
### Curation Rationale
|
|
|
4 |
- crowdsourced
|
5 |
language_creators:
|
6 |
- crowdsourced
|
7 |
+
language_bcp47:
|
8 |
{{LANGUAGES}}
|
9 |
+
license:
|
10 |
- cc0-1.0
|
11 |
multilinguality:
|
12 |
- multilingual
|
|
|
68 |
### Supported Tasks and Leaderboards
|
69 |
|
70 |
The results for models trained on the Common Voice datasets are available via the
|
71 |
+
[🤗 Speech Bench](https://huggingface.co/spaces/huggingface/hf-speech-bench)
|
72 |
|
73 |
### Languages
|
74 |
|
|
|
142 |
|
143 |
The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
|
144 |
|
145 |
+
## Data Preprocessing Recommended by Hugging Face
|
146 |
+
|
147 |
+
The following are data preprocessing steps advised by the Hugging Face team. They are accompanied by an example code snippet that shows how to put them into practice.
|
148 |
+
|
149 |
+
Many examples in this dataset have trailing quotation marks, e.g. _“the cat sat on the mat.”_. These trailing quotation marks do not change the actual meaning of the sentence, and it is nearly impossible to infer whether a sentence is a quotation or not a quotation from audio data alone. In these cases, it is advised to strip the quotation marks, leaving: _the cat sat on the mat_.
|
150 |
+
|
151 |
+
In addition, the majority of training sentences end in punctuation ( . or ? or ! ), whereas just a small proportion do not. In the dev set, **almost all** sentences end in punctuation. Thus, it is recommended to append a full-stop ( . ) to the end of the small number of training examples that do not end in punctuation.
|
152 |
+
|
153 |
+
```python
|
154 |
+
from datasets import load_dataset
|
155 |
+
|
156 |
+
ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
|
157 |
+
|
158 |
+
def prepare_dataset(batch):
|
159 |
+
"""Function to preprocess the dataset with the .map method"""
|
160 |
+
transcription = batch["sentence"]
|
161 |
+
|
162 |
+
if transcription.startswith('"') and transcription.endswith('"'):
|
163 |
+
# we can remove trailing quotation marks as they do not affect the transcription
|
164 |
+
transcription = transcription[1:-1]
|
165 |
+
|
166 |
+
if transcription[-1] not in [".", "?", "!"]:
|
167 |
+
# append a full-stop to sentences that do not end in punctuation
|
168 |
+
transcription = transcription + "."
|
169 |
+
|
170 |
+
batch["sentence"] = transcription
|
171 |
+
|
172 |
+
return batch
|
173 |
+
|
174 |
+
ds = ds.map(prepare_dataset, desc="preprocess dataset")
|
175 |
+
```
|
176 |
+
|
177 |
## Dataset Creation
|
178 |
|
179 |
### Curation Rationale
|
dataset_script.py
CHANGED
@@ -82,9 +82,9 @@ class CommonVoice(datasets.GeneratorBasedBuilder):
|
|
82 |
release_date=STATS["date"],
|
83 |
num_clips=lang_stats["clips"],
|
84 |
num_speakers=lang_stats["users"],
|
85 |
-
validated_hr=float(lang_stats["validHrs"]),
|
86 |
-
total_hr=float(lang_stats["totalHrs"]),
|
87 |
-
size_bytes=int(lang_stats["size"]),
|
88 |
)
|
89 |
for lang, lang_stats in STATS["locales"].items()
|
90 |
]
|
|
|
82 |
release_date=STATS["date"],
|
83 |
num_clips=lang_stats["clips"],
|
84 |
num_speakers=lang_stats["users"],
|
85 |
+
validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
|
86 |
+
total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
|
87 |
+
size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
|
88 |
)
|
89 |
for lang, lang_stats in STATS["locales"].items()
|
90 |
]
|
generate_datasets.py
CHANGED
@@ -49,6 +49,11 @@ VERSIONS = [
|
|
49 |
"name": "common_voice_9_0",
|
50 |
"release": "cv-corpus-9.0-2022-04-27",
|
51 |
},
|
|
|
|
|
|
|
|
|
|
|
52 |
]
|
53 |
|
54 |
|
|
|
49 |
"name": "common_voice_9_0",
|
50 |
"release": "cv-corpus-9.0-2022-04-27",
|
51 |
},
|
52 |
+
{
|
53 |
+
"semver": "10.0.0",
|
54 |
+
"name": "common_voice_10_0",
|
55 |
+
"release": "cv-corpus-10.0-2022-07-04",
|
56 |
+
},
|
57 |
]
|
58 |
|
59 |
|
languages.ftl
CHANGED
@@ -29,6 +29,7 @@ da = Danish
|
|
29 |
de = German
|
30 |
dsb = Sorbian, Lower
|
31 |
dv = Dhivehi
|
|
|
32 |
el = Greek
|
33 |
en = English
|
34 |
eo = Esperanto
|
@@ -97,12 +98,16 @@ my = Burmese
|
|
97 |
myv = Erzya
|
98 |
nan-tw = Taiwanese (Minnan)
|
99 |
nb-NO = Norwegian Bokmål
|
|
|
100 |
ne-NP = Nepali
|
101 |
nia = Nias
|
102 |
nl = Dutch
|
103 |
nn-NO = Norwegian Nynorsk
|
|
|
|
|
104 |
nyn = Runyankole
|
105 |
oc = Occitan
|
|
|
106 |
or = Odia
|
107 |
pa-IN = Punjabi
|
108 |
pap-AW = Papiamento (Aruba)
|
@@ -128,6 +133,8 @@ sl = Slovenian
|
|
128 |
so = Somali
|
129 |
sq = Albanian
|
130 |
sr = Serbian
|
|
|
|
|
131 |
sv-SE = Swedish
|
132 |
sw = Swahili
|
133 |
syr = Syriac
|
@@ -139,8 +146,10 @@ ti = Tigrinya
|
|
139 |
tig = Tigre
|
140 |
tk = Turkmen
|
141 |
tl = Tagalog
|
|
|
142 |
tok = Toki Pona
|
143 |
tr = Turkish
|
|
|
144 |
tt = Tatar
|
145 |
tw = Twi
|
146 |
ty = Tahitian
|
@@ -150,12 +159,15 @@ ug = Uyghur
|
|
150 |
uk = Ukrainian
|
151 |
ur = Urdu
|
152 |
uz = Uzbek
|
|
|
153 |
vec = Venetian
|
154 |
vi = Vietnamese
|
155 |
vot = Votic
|
|
|
156 |
yi = Yiddish
|
157 |
yo = Yoruba
|
158 |
yue = Cantonese
|
159 |
zh-CN = Chinese (China)
|
160 |
zh-HK = Chinese (Hong Kong)
|
161 |
-
zh-TW = Chinese (Taiwan)
|
|
|
|
29 |
de = German
|
30 |
dsb = Sorbian, Lower
|
31 |
dv = Dhivehi
|
32 |
+
dyu = Dioula
|
33 |
el = Greek
|
34 |
en = English
|
35 |
eo = Esperanto
|
|
|
98 |
myv = Erzya
|
99 |
nan-tw = Taiwanese (Minnan)
|
100 |
nb-NO = Norwegian Bokmål
|
101 |
+
nd = IsiNdebele (North)
|
102 |
ne-NP = Nepali
|
103 |
nia = Nias
|
104 |
nl = Dutch
|
105 |
nn-NO = Norwegian Nynorsk
|
106 |
+
nr = IsiNdebele (South)
|
107 |
+
nso = Northern Sotho
|
108 |
nyn = Runyankole
|
109 |
oc = Occitan
|
110 |
+
om = Afaan Ormoo
|
111 |
or = Odia
|
112 |
pa-IN = Punjabi
|
113 |
pap-AW = Papiamento (Aruba)
|
|
|
133 |
so = Somali
|
134 |
sq = Albanian
|
135 |
sr = Serbian
|
136 |
+
ss = Siswati
|
137 |
+
st = Southern Sotho
|
138 |
sv-SE = Swedish
|
139 |
sw = Swahili
|
140 |
syr = Syriac
|
|
|
146 |
tig = Tigre
|
147 |
tk = Turkmen
|
148 |
tl = Tagalog
|
149 |
+
tn = Setswana
|
150 |
tok = Toki Pona
|
151 |
tr = Turkish
|
152 |
+
ts = Xitsonga
|
153 |
tt = Tatar
|
154 |
tw = Twi
|
155 |
ty = Tahitian
|
|
|
159 |
uk = Ukrainian
|
160 |
ur = Urdu
|
161 |
uz = Uzbek
|
162 |
+
ve = Tshivenda
|
163 |
vec = Venetian
|
164 |
vi = Vietnamese
|
165 |
vot = Votic
|
166 |
+
xh = Xhosa
|
167 |
yi = Yiddish
|
168 |
yo = Yoruba
|
169 |
yue = Cantonese
|
170 |
zh-CN = Chinese (China)
|
171 |
zh-HK = Chinese (Hong Kong)
|
172 |
+
zh-TW = Chinese (Taiwan)
|
173 |
+
zu = Zulu
|
publish.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import create_repo
|
2 |
+
|
3 |
+
create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset")
|
test.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from datasets import load_dataset
|
2 |
|
3 |
-
dataset = load_dataset("
|
4 |
print(dataset)
|
5 |
print(dataset[100])
|
|
|
1 |
from datasets import load_dataset
|
2 |
|
3 |
+
dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", use_auth_token=True)
|
4 |
print(dataset)
|
5 |
print(dataset[100])
|