|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Common Voice Dataset""" |
|
|
|
|
|
import os |
|
|
|
import datasets |
|
|
|
|
|
# URL template of the per-language archive of the Common Voice 6.1
# (2020-12-11) release; the `{}` placeholder is filled with the language id.
_DATA_URL = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/{}.tar.gz"

# BibTeX entry reported as the dataset citation.
_CITATION = """\
@inproceedings{commonvoice:2020,
author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
title = {Common Voice: A Massively-Multilingual Speech Corpus},
booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {4211--4215},
year = 2020
}
"""

# Human-readable summary reported as the dataset description.
_DESCRIPTION = """\
Common Voice is Mozilla's initiative to help teach machines how real people speak.
The dataset currently consists of 7,335 validated hours of speech in 60 languages, but we’re always adding more voices and languages.
"""

# Landing page of the dataset.
_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"

# Link to the license text (the value is a URL, not a license identifier).
_LICENSE = "https://github.com/common-voice/common-voice/blob/main/LICENSE"
|
|
|
# Per-language metadata of the Common Voice 6.1 release, keyed by the language
# id that is also used as the config name and as the archive name in _DATA_URL.
#
# Each entry holds:
#   Language            display name of the language
#   Date                snapshot date of the release
#   Size                human-readable size of the archive
#   Version             release tag, "<lang_id>_<overall hours>h_<date>"
#   Validated_Hr_Total  validated hours of speech
#   Overall_Hr_Total    total recorded hours of speech
#   Number_Of_Voice     number of distinct voices
#
# NOTE(review): the "cnh" and "cs" entries used to be verbatim copies of "ab";
# the values below are the 6.1 release figures — confirm against the official
# Common Voice dataset page when touching them.
_LANGUAGES = {
    "ab": {
        "Language": "Abkhaz",
        "Date": "2020-12-11",
        "Size": "39 MB",
        "Version": "ab_1h_2020-12-11",
        "Validated_Hr_Total": 0.05,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 14,
    },
    "ar": {
        "Language": "Arabic",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "ar_77h_2020-12-11",
        "Validated_Hr_Total": 49,
        "Overall_Hr_Total": 77,
        "Number_Of_Voice": 672,
    },
    "as": {
        "Language": "Assamese",
        "Date": "2020-12-11",
        "Size": "21 MB",
        "Version": "as_0.78h_2020-12-11",
        "Validated_Hr_Total": 0.74,
        "Overall_Hr_Total": 0.78,
        "Number_Of_Voice": 17,
    },
    "br": {
        "Language": "Breton",
        "Date": "2020-12-11",
        "Size": "444 MB",
        "Version": "br_16h_2020-12-11",
        "Validated_Hr_Total": 7,
        "Overall_Hr_Total": 16,
        "Number_Of_Voice": 157,
    },
    "ca": {
        "Language": "Catalan",
        "Date": "2020-12-11",
        "Size": "19 GB",
        "Version": "ca_748h_2020-12-11",
        "Validated_Hr_Total": 623,
        "Overall_Hr_Total": 748,
        "Number_Of_Voice": 5376,
    },
    "cnh": {
        "Language": "Hakha Chin",
        "Date": "2020-12-11",
        "Size": "161 MB",
        "Version": "cnh_5h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 5,
        "Number_Of_Voice": 297,
    },
    "cs": {
        "Language": "Czech",
        "Date": "2020-12-11",
        "Size": "1 GB",
        "Version": "cs_45h_2020-12-11",
        "Validated_Hr_Total": 36,
        "Overall_Hr_Total": 45,
        "Number_Of_Voice": 353,
    },
    "cv": {
        "Language": "Chuvash",
        "Date": "2020-12-11",
        "Size": "419 MB",
        "Version": "cv_16h_2020-12-11",
        "Validated_Hr_Total": 4,
        "Overall_Hr_Total": 16,
        "Number_Of_Voice": 92,
    },
    "cy": {
        "Language": "Welsh",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "cy_124h_2020-12-11",
        "Validated_Hr_Total": 95,
        "Overall_Hr_Total": 124,
        "Number_Of_Voice": 1382,
    },
    "de": {
        "Language": "German",
        "Date": "2020-12-11",
        "Size": "22 GB",
        "Version": "de_836h_2020-12-11",
        "Validated_Hr_Total": 777,
        "Overall_Hr_Total": 836,
        "Number_Of_Voice": 12659,
    },
    "dv": {
        "Language": "Dhivehi",
        "Date": "2020-12-11",
        "Size": "515 MB",
        "Version": "dv_19h_2020-12-11",
        "Validated_Hr_Total": 18,
        "Overall_Hr_Total": 19,
        "Number_Of_Voice": 167,
    },
    "el": {
        "Language": "Greek",
        "Date": "2020-12-11",
        "Size": "364 MB",
        "Version": "el_13h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 13,
        "Number_Of_Voice": 118,
    },
    "en": {
        "Language": "English",
        "Date": "2020-12-11",
        "Size": "56 GB",
        "Version": "en_2181h_2020-12-11",
        "Validated_Hr_Total": 1686,
        "Overall_Hr_Total": 2181,
        "Number_Of_Voice": 66173,
    },
    "eo": {
        "Language": "Esperanto",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "eo_102h_2020-12-11",
        "Validated_Hr_Total": 90,
        "Overall_Hr_Total": 102,
        "Number_Of_Voice": 574,
    },
    "es": {
        "Language": "Spanish",
        "Date": "2020-12-11",
        "Size": "15 GB",
        "Version": "es_579h_2020-12-11",
        "Validated_Hr_Total": 324,
        "Overall_Hr_Total": 579,
        "Number_Of_Voice": 19484,
    },
    "et": {
        "Language": "Estonian",
        "Date": "2020-12-11",
        "Size": "732 MB",
        "Version": "et_27h_2020-12-11",
        "Validated_Hr_Total": 19,
        "Overall_Hr_Total": 27,
        "Number_Of_Voice": 543,
    },
    "eu": {
        "Language": "Basque",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "eu_131h_2020-12-11",
        "Validated_Hr_Total": 89,
        "Overall_Hr_Total": 131,
        "Number_Of_Voice": 1028,
    },
    "fa": {
        "Language": "Persian",
        "Date": "2020-12-11",
        "Size": "8 GB",
        "Version": "fa_321h_2020-12-11",
        "Validated_Hr_Total": 282,
        "Overall_Hr_Total": 321,
        "Number_Of_Voice": 3655,
    },
    "fi": {
        "Language": "Finnish",
        "Date": "2020-12-11",
        "Size": "48 MB",
        "Version": "fi_1h_2020-12-11",
        "Validated_Hr_Total": 1,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 27,
    },
    "fr": {
        "Language": "French",
        "Date": "2020-12-11",
        "Size": "18 GB",
        "Version": "fr_682h_2020-12-11",
        "Validated_Hr_Total": 623,
        "Overall_Hr_Total": 682,
        "Number_Of_Voice": 12953,
    },
    "fy-NL": {
        "Language": "Frisian",
        "Date": "2020-12-11",
        "Size": "1 GB",
        "Version": "fy-NL_46h_2020-12-11",
        "Validated_Hr_Total": 14,
        "Overall_Hr_Total": 46,
        "Number_Of_Voice": 467,
    },
    "ga-IE": {
        "Language": "Irish",
        "Date": "2020-12-11",
        "Size": "149 MB",
        "Version": "ga-IE_5h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 5,
        "Number_Of_Voice": 101,
    },
    "hi": {
        "Language": "Hindi",
        "Date": "2020-12-11",
        "Size": "20 MB",
        "Version": "hi_0.8h_2020-12-11",
        "Validated_Hr_Total": 0.54,
        "Overall_Hr_Total": 0.8,
        "Number_Of_Voice": 31,
    },
    "hsb": {
        "Language": "Sorbian, Upper",
        "Date": "2020-12-11",
        "Size": "76 MB",
        "Version": "hsb_2h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 2,
        "Number_Of_Voice": 19,
    },
    "hu": {
        "Language": "Hungarian",
        "Date": "2020-12-11",
        "Size": "232 MB",
        "Version": "hu_8h_2020-12-11",
        "Validated_Hr_Total": 8,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 47,
    },
    "ia": {
        "Language": "Interlingua",
        "Date": "2020-12-11",
        "Size": "216 MB",
        "Version": "ia_8h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 36,
    },
    "id": {
        "Language": "Indonesian",
        "Date": "2020-12-11",
        "Size": "454 MB",
        "Version": "id_17h_2020-12-11",
        "Validated_Hr_Total": 9,
        "Overall_Hr_Total": 17,
        "Number_Of_Voice": 219,
    },
    "it": {
        "Language": "Italian",
        "Date": "2020-12-11",
        "Size": "5 GB",
        "Version": "it_199h_2020-12-11",
        "Validated_Hr_Total": 158,
        "Overall_Hr_Total": 199,
        "Number_Of_Voice": 5729,
    },
    "ja": {
        "Language": "Japanese",
        "Date": "2020-12-11",
        "Size": "146 MB",
        "Version": "ja_5h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 5,
        "Number_Of_Voice": 235,
    },
    "ka": {
        "Language": "Georgian",
        "Date": "2020-12-11",
        "Size": "99 MB",
        "Version": "ka_3h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 3,
        "Number_Of_Voice": 44,
    },
    "kab": {
        "Language": "Kabyle",
        "Date": "2020-12-11",
        "Size": "16 GB",
        "Version": "kab_622h_2020-12-11",
        "Validated_Hr_Total": 525,
        "Overall_Hr_Total": 622,
        "Number_Of_Voice": 1309,
    },
    "ky": {
        "Language": "Kyrgyz",
        "Date": "2020-12-11",
        "Size": "553 MB",
        "Version": "ky_22h_2020-12-11",
        "Validated_Hr_Total": 11,
        "Overall_Hr_Total": 22,
        "Number_Of_Voice": 134,
    },
    "lg": {
        "Language": "Luganda",
        "Date": "2020-12-11",
        "Size": "199 MB",
        "Version": "lg_8h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 76,
    },
    "lt": {
        "Language": "Lithuanian",
        "Date": "2020-12-11",
        "Size": "129 MB",
        "Version": "lt_4h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 4,
        "Number_Of_Voice": 30,
    },
    "lv": {
        "Language": "Latvian",
        "Date": "2020-12-11",
        "Size": "199 MB",
        "Version": "lv_7h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 99,
    },
    "mn": {
        "Language": "Mongolian",
        "Date": "2020-12-11",
        "Size": "464 MB",
        "Version": "mn_17h_2020-12-11",
        "Validated_Hr_Total": 11,
        "Overall_Hr_Total": 17,
        "Number_Of_Voice": 376,
    },
    "mt": {
        "Language": "Maltese",
        "Date": "2020-12-11",
        "Size": "405 MB",
        "Version": "mt_15h_2020-12-11",
        "Validated_Hr_Total": 7,
        "Overall_Hr_Total": 15,
        "Number_Of_Voice": 171,
    },
    "nl": {
        "Language": "Dutch",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "nl_63h_2020-12-11",
        "Validated_Hr_Total": 59,
        "Overall_Hr_Total": 63,
        "Number_Of_Voice": 1012,
    },
    "or": {
        "Language": "Odia",
        "Date": "2020-12-11",
        "Size": "190 MB",
        "Version": "or_7h_2020-12-11",
        "Validated_Hr_Total": 0.87,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 34,
    },
    "pa-IN": {
        "Language": "Punjabi",
        "Date": "2020-12-11",
        "Size": "67 MB",
        "Version": "pa-IN_2h_2020-12-11",
        "Validated_Hr_Total": 0.5,
        "Overall_Hr_Total": 2,
        "Number_Of_Voice": 26,
    },
    "pl": {
        "Language": "Polish",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "pl_129h_2020-12-11",
        "Validated_Hr_Total": 108,
        "Overall_Hr_Total": 129,
        "Number_Of_Voice": 2647,
    },
    "pt": {
        "Language": "Portuguese",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "pt_63h_2020-12-11",
        "Validated_Hr_Total": 50,
        "Overall_Hr_Total": 63,
        "Number_Of_Voice": 1120,
    },
    "rm-sursilv": {
        "Language": "Romansh Sursilvan",
        "Date": "2020-12-11",
        "Size": "263 MB",
        "Version": "rm-sursilv_9h_2020-12-11",
        "Validated_Hr_Total": 5,
        "Overall_Hr_Total": 9,
        "Number_Of_Voice": 78,
    },
    "rm-vallader": {
        "Language": "Romansh Vallader",
        "Date": "2020-12-11",
        "Size": "103 MB",
        "Version": "rm-vallader_3h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 3,
        "Number_Of_Voice": 39,
    },
    "ro": {
        "Language": "Romanian",
        "Date": "2020-12-11",
        "Size": "250 MB",
        "Version": "ro_9h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 9,
        "Number_Of_Voice": 130,
    },
    "ru": {
        "Language": "Russian",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "ru_130h_2020-12-11",
        "Validated_Hr_Total": 111,
        "Overall_Hr_Total": 130,
        "Number_Of_Voice": 1412,
    },
    "rw": {
        "Language": "Kinyarwanda",
        "Date": "2020-12-11",
        "Size": "40 GB",
        "Version": "rw_1510h_2020-12-11",
        "Validated_Hr_Total": 1183,
        "Overall_Hr_Total": 1510,
        "Number_Of_Voice": 410,
    },
    "sah": {
        "Language": "Sakha",
        "Date": "2020-12-11",
        "Size": "173 MB",
        "Version": "sah_6h_2020-12-11",
        "Validated_Hr_Total": 4,
        "Overall_Hr_Total": 6,
        "Number_Of_Voice": 42,
    },
    "sl": {
        "Language": "Slovenian",
        "Date": "2020-12-11",
        "Size": "212 MB",
        "Version": "sl_7h_2020-12-11",
        "Validated_Hr_Total": 5,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 82,
    },
    "sv-SE": {
        "Language": "Swedish",
        "Date": "2020-12-11",
        "Size": "402 MB",
        "Version": "sv-SE_15h_2020-12-11",
        "Validated_Hr_Total": 12,
        "Overall_Hr_Total": 15,
        "Number_Of_Voice": 222,
    },
    "ta": {
        "Language": "Tamil",
        "Date": "2020-12-11",
        "Size": "648 MB",
        "Version": "ta_24h_2020-12-11",
        "Validated_Hr_Total": 14,
        "Overall_Hr_Total": 24,
        "Number_Of_Voice": 266,
    },
    "th": {
        "Language": "Thai",
        "Date": "2020-12-11",
        "Size": "325 MB",
        "Version": "th_12h_2020-12-11",
        "Validated_Hr_Total": 8,
        "Overall_Hr_Total": 12,
        "Number_Of_Voice": 182,
    },
    "tr": {
        "Language": "Turkish",
        "Date": "2020-12-11",
        "Size": "592 MB",
        "Version": "tr_22h_2020-12-11",
        "Validated_Hr_Total": 20,
        "Overall_Hr_Total": 22,
        "Number_Of_Voice": 678,
    },
    "tt": {
        "Language": "Tatar",
        "Date": "2020-12-11",
        "Size": "741 MB",
        "Version": "tt_28h_2020-12-11",
        "Validated_Hr_Total": 26,
        "Overall_Hr_Total": 28,
        "Number_Of_Voice": 185,
    },
    "uk": {
        "Language": "Ukrainian",
        "Date": "2020-12-11",
        "Size": "1 GB",
        "Version": "uk_43h_2020-12-11",
        "Validated_Hr_Total": 30,
        "Overall_Hr_Total": 43,
        "Number_Of_Voice": 459,
    },
    "vi": {
        "Language": "Vietnamese",
        "Date": "2020-12-11",
        "Size": "50 MB",
        "Version": "vi_1h_2020-12-11",
        "Validated_Hr_Total": 0.74,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 62,
    },
    "vot": {
        "Language": "Votic",
        "Date": "2020-12-11",
        "Size": "7 MB",
        "Version": "vot_0.28h_2020-12-11",
        "Validated_Hr_Total": 0,
        "Overall_Hr_Total": 0.28,
        "Number_Of_Voice": 3,
    },
    "zh-CN": {
        "Language": "Chinese (China)",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "zh-CN_78h_2020-12-11",
        "Validated_Hr_Total": 56,
        "Overall_Hr_Total": 78,
        "Number_Of_Voice": 3501,
    },
    "zh-HK": {
        "Language": "Chinese (Hong Kong)",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "zh-HK_100h_2020-12-11",
        "Validated_Hr_Total": 50,
        "Overall_Hr_Total": 100,
        "Number_Of_Voice": 2536,
    },
    "zh-TW": {
        "Language": "Chinese (Taiwan)",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "zh-TW_78h_2020-12-11",
        "Validated_Hr_Total": 55,
        "Overall_Hr_Total": 78,
        "Number_Of_Voice": 1444,
    },
}
|
|
|
|
|
class CommonVoiceConfig(datasets.BuilderConfig):
    """BuilderConfig for one language subset of Common Voice."""

    def __init__(self, name, sub_version, **kwargs):
        """Store the per-language metadata and build the config description.

        Args:
            name: `string`, name of the config.
            sub_version: `string`, release tag of the language subset; it is
                embedded in the generated description.
            **kwargs: the optional metadata keys `language`, `date`, `size`,
                `val_hrs`, `total_hrs` and `num_of_voice` are popped and
                stored as attributes (each defaults to None when absent);
                every remaining keyword argument is forwarded to super.
        """
        self.sub_version = sub_version
        # Pop the metadata keys so that only arguments the parent class is
        # meant to receive remain in kwargs.
        self.language = kwargs.pop("language", None)
        self.date_of_snapshot = kwargs.pop("date", None)
        self.size = kwargs.pop("size", None)
        self.validated_hr_total = kwargs.pop("val_hrs", None)
        self.total_hr_total = kwargs.pop("total_hrs", None)
        self.num_of_voice = kwargs.pop("num_of_voice", None)
        description = (
            f"Common Voice speech to text dataset in {self.language} version {self.sub_version} "
            f"of {self.date_of_snapshot}. The dataset comprises {self.validated_hr_total} hours "
            f"of validated transcribed speech data from {self.num_of_voice} speakers. "
            f"The dataset has a size of {self.size}"
        )
        # NOTE(review): the version is "7.0.0" although the download URL of
        # this script points at corpus release 6.1 — confirm this is intended
        # before changing it.
        super().__init__(name=name, version=datasets.Version("7.0.0", ""), description=description, **kwargs)
|
|
|
|
|
class CommonVoice(datasets.GeneratorBasedBuilder):
    """Dataset builder for Common Voice with one config per entry of `_LANGUAGES`."""

    BUILDER_CONFIGS = [
        CommonVoiceConfig(
            name=lang_id,
            language=meta["Language"],
            sub_version=meta["Version"],
            date=meta["Date"],
            size=meta["Size"],
            val_hrs=meta["Validated_Hr_Total"],
            total_hrs=meta["Overall_Hr_Total"],
            num_of_voice=meta["Number_Of_Voice"],
        )
        for lang_id, meta in _LANGUAGES.items()
    ]

    def _info(self):
        """Return the DatasetInfo: feature schema plus description, homepage, license and citation."""
        # The key order matters: _generate_examples compares it with the TSV
        # header and maps column values to features by position.
        features = datasets.Features(
            {
                "client_id": datasets.Value("string"),
                "path": datasets.Value("string"),
                "sentence": datasets.Value("string"),
                "up_votes": datasets.Value("int64"),
                "down_votes": datasets.Value("int64"),
                "age": datasets.Value("string"),
                "gender": datasets.Value("string"),
                "accent": datasets.Value("string"),
                "locale": datasets.Value("string"),
                "segment": datasets.Value("string"),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one SplitGenerator per TSV file of the selected language.

        When `self.config.data_dir` is set, the data is read from
        `<data_dir>/<config name>`; otherwise the language archive is
        downloaded and extracted through `dl_manager`.
        """
        if self.config.data_dir:
            abs_path_to_data = os.path.join(self.config.data_dir, self.config.name)
        else:
            dl_path = dl_manager.download_and_extract(_DATA_URL.format(self.config.name))
            abs_path_to_data = os.path.join(dl_path, "cv-corpus-6.1-2020-12-11", self.config.name)
        abs_path_to_clips = os.path.join(abs_path_to_data, "clips")

        # (split name, TSV file name), in the order the splits are returned.
        split_files = (
            (datasets.Split.TRAIN, "train.tsv"),
            (datasets.Split.TEST, "test.tsv"),
            (datasets.Split.VALIDATION, "dev.tsv"),
            ("other", "other.tsv"),
            ("invalidated", "invalidated.tsv"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(abs_path_to_data, tsv_name),
                    "path_to_clips": abs_path_to_clips,
                },
            )
            for split_name, tsv_name in split_files
        ]

    def _generate_examples(self, filepath, path_to_clips):
        """Yield `(id, example)` pairs read from a tab-separated split file.

        Args:
            filepath: path of the TSV file; its header must list exactly the
                feature names of `_info()`, in the same order.
            path_to_clips: directory that is prepended to every `path` value.

        Yields:
            The zero-based index of the data line in the file and a dict
            mapping each feature name to the raw string value of its column.

        Raises:
            AssertionError: if the header (or an empty file) does not match
                the expected column names.
        """
        data_fields = list(self._info().features.keys())
        path_idx = data_fields.index("path")
        num_fields = len(data_fields)

        with open(filepath, encoding="utf-8") as f:
            # Stream the file instead of loading every line into memory.
            column_names = f.readline().strip().split("\t")
            if column_names != data_fields:
                # Raised explicitly (rather than via `assert`) so that the
                # check is not stripped when Python runs with -O; the
                # exception type is kept for backward compatibility.
                raise AssertionError(
                    f"The file should have {data_fields} as column names, but has {column_names}"
                )

            for id_, line in enumerate(f):
                stripped = line.strip()
                if not stripped:
                    # Blank line (e.g. trailing newline at end of file): it
                    # has no path column; skip it but keep the ids of the
                    # remaining lines unchanged.
                    continue
                field_values = stripped.split("\t")

                # Turn the clip file name into a path inside the clips folder.
                field_values[path_idx] = os.path.join(path_to_clips, field_values[path_idx])

                # strip() drops trailing empty columns; pad the row back to
                # full width. The filler is the literal two-character string
                # '' (kept as-is for backward compatibility).
                missing = num_fields - len(field_values)
                if missing > 0:
                    field_values += missing * ["''"]

                yield id_, dict(zip(data_fields, field_values))
|
|