"""Hugging Face `datasets` loading script for the ASA speech dataset."""

import csv
import os

import datasets


logger = datasets.logging.get_logger(__name__)

_CITATION = """\
ASA
}
"""

# Names of the individual builder configs; an extra "all" config is added
# on top of these when the builder configs are created.
_ALL_CONFIGS = sorted(["id"])

# NOTE: the original file assigned _DESCRIPTION twice; only the final value
# was ever observable, so the dead first assignment is dropped.
_DESCRIPTION = "ASA"

_HOMEPAGE_URL = "https://huggingface.co/PokeABear00/usu_asa_asr"

_DATA_URL = "ASA.zip"


class ASAConfig(datasets.BuilderConfig):
    """BuilderConfig for the ASA dataset."""

    def __init__(self, name, description, homepage, data_url):
        """Create a config.

        Args:
            name: Config name.
            description: Human-readable description of the config.
            homepage: URL stored on the config and reported in the dataset info.
            data_url: Location of the data archive handed to the download manager.
        """
        # Pass the constructor arguments themselves: reading `self.name` /
        # `self.description` before the base initializer has run would pick
        # up whatever defaults the base class happens to expose, not the
        # values given by the caller.
        super().__init__(
            name=name,
            version=datasets.Version("1.0.0", ""),
            description=description,
        )
        self.name = name
        self.description = description
        self.homepage = homepage
        self.data_url = data_url


def _build_config(name):
    """Return an `ASAConfig` called *name* that uses the module-level defaults."""
    return ASAConfig(
        name=name,
        description=_DESCRIPTION,
        homepage=_HOMEPAGE_URL,
        data_url=_DATA_URL,
    )


class ASA(datasets.GeneratorBasedBuilder):
    """Dataset builder yielding audio files paired with their transcriptions."""

    DEFAULT_WRITER_BATCH_SIZE = 1000
    BUILDER_CONFIGS = [_build_config(name) for name in _ALL_CONFIGS + ["all"]]

    def _info(self):
        """Describe the features and metadata of the dataset."""
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "transcription": datasets.Value("string"),
            }
        )
        # `task_templates` used to be passed explicitly as None; leaving it
        # out is equivalent and avoids naming a keyword that newer
        # `datasets` releases may not accept.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=("audio", "transcription"),
            homepage=self.config.homepage,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the data and declare the single TRAIN split.

        Args:
            dl_manager: Download manager supplied by `datasets`.

        Returns:
            A one-element list holding the TRAIN `SplitGenerator`.
        """
        # The "all" config expands to every individual config name.
        langs = _ALL_CONFIGS if self.config.name == "all" else [self.config.name]

        archive_path = dl_manager.download_and_extract(self.config.data_url)
        audio_path = dl_manager.extract(
            os.path.join(archive_path, "usu_asa_asr", "audio.zip")
        )
        csv_root = dl_manager.extract(
            os.path.join(archive_path, "usu_asa_asr", "data.csv")
        )
        # NOTE(review): this treats the extracted "data.csv" location as a
        # directory containing one "<config>.csv" per config. Behavior is
        # kept as-is; confirm against the actual archive layout.
        text_paths = {lang: os.path.join(csv_root, f"{lang}.csv") for lang in langs}

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "audio_path": audio_path,
                    "text_paths": text_paths,
                },
            )
        ]

    def _generate_examples(self, audio_path, text_paths):
        """Yield `(key, example)` pairs read from the transcription CSV files.

        Each CSV is expected to start with a header row followed by rows of
        exactly two columns: a "/"-separated audio path relative to
        *audio_path*, and the transcription.

        Args:
            audio_path: Directory that the relative audio paths are joined to.
            text_paths: Mapping whose values are paths to the CSV files.

        Yields:
            Tuples of a running integer key and a dict with the keys
            "path", "audio" and "transcription".

        Raises:
            ValueError: If a non-empty row does not have exactly two columns.
        """
        key = 0
        for text_path in text_paths.values():
            # newline="" lets the csv module handle line endings itself, as
            # recommended by the csv documentation.
            with open(text_path, encoding="utf-8", newline="") as csv_file:
                csv_reader = csv.reader(
                    csv_file, delimiter=",", skipinitialspace=True
                )
                # Skip the header. The default keeps an empty file from
                # raising StopIteration, which inside a generator would
                # surface as RuntimeError.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        # csv.reader yields [] for blank lines.
                        continue
                    file_path, transcription = row
                    file_path = os.path.join(audio_path, *file_path.split("/"))
                    yield key, {
                        "path": file_path,
                        "audio": file_path,
                        "transcription": transcription,
                    }
                    key += 1