File size: 3,283 Bytes
6b934c8 cd5dfd8 6b934c8 cd5dfd8 6b934c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import csv
import os
import datasets
# Module-level logger obtained through the `datasets` logging helpers.
logger = datasets.logging.get_logger(__name__)
# NOTE(review): placeholder text, not a complete BibTeX entry (note the stray
# closing brace) -- replace with the real citation once it is known.
_CITATION = """\
ASA
}
"""

# Names of the individual builder configurations, kept in sorted order.
_ALL_CONFIGS = sorted([
    "id",
])

# Dataset description shared by the builder configs and the dataset info.
_DESCRIPTION = "ASA"

# Homepage reported for the dataset.
_HOMEPAGE_URL = "https://huggingface.co/PokeABear00/usu_asa_asr"

# Archive handed to the download manager by the builder.
_DATA_URL = "ASA.zip"
class ASAConfig(datasets.BuilderConfig):
    """BuilderConfig for the ASA dataset.

    Args:
        name: Configuration name, forwarded to ``datasets.BuilderConfig``.
        description: Description text, forwarded to ``datasets.BuilderConfig``.
        homepage: Homepage URL, stored on the config as ``homepage``.
        data_url: Location of the data archive, stored as ``data_url``.
    """

    def __init__(self, name, description, homepage, data_url):
        # Forward the constructor arguments themselves. Reading ``self.name``
        # and ``self.description`` at this point would hand the base class
        # whatever it exposes before initialisation (its defaults) instead of
        # the values the caller asked for.
        super().__init__(
            name=name,
            version=datasets.Version("1.0.0", ""),
            description=description,
        )
        self.name = name
        self.description = description
        self.homepage = homepage
        self.data_url = data_url
def _build_config(name):
    """Create the ``ASAConfig`` for ``name`` using the module-level defaults.

    Args:
        name: Name of the builder configuration to create.

    Returns:
        An ``ASAConfig`` carrying the shared description, homepage and data URL.
    """
    settings = {
        "name": name,
        "description": _DESCRIPTION,
        "homepage": _HOMEPAGE_URL,
        "data_url": _DATA_URL,
    }
    return ASAConfig(**settings)
class ASA(datasets.GeneratorBasedBuilder):
    """Dataset builder that pairs audio files with their transcriptions.

    One builder configuration is created for every entry of ``_ALL_CONFIGS``
    plus an additional ``"all"`` configuration that covers every entry.
    """

    DEFAULT_WRITER_BATCH_SIZE = 1000
    BUILDER_CONFIGS = [_build_config(name) for name in _ALL_CONFIGS + ["all"]]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the example schema."""
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "transcription": datasets.Value("string"),
            }
        )
        # ``task_templates`` used to be passed explicitly as ``None``; that is
        # the default, so it is simply omitted here.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=("audio", "transcription"),
            homepage=self.config.homepage,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data archive and declare the single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch ``self.config.data_url``
                and to extract the files nested inside it.

        Returns:
            A one-element list with the ``TRAIN`` split generator. Its
            ``gen_kwargs`` hold the extracted audio location (``audio_path``)
            and a mapping of config name to CSV path (``text_paths``).
        """
        # The "all" configuration expands to every known configuration name.
        langs = _ALL_CONFIGS if self.config.name == "all" else [self.config.name]

        archive_path = dl_manager.download_and_extract(self.config.data_url)
        data_root = os.path.join(archive_path, "usu_asa_asr")
        audio_path = dl_manager.extract(os.path.join(data_root, "audio.zip"))
        # NOTE(review): ``data.csv`` is handed to ``extract`` and the result is
        # then treated as a directory holding ``<config>.csv`` -- confirm this
        # matches the actual layout of the archive.
        text_root = dl_manager.extract(os.path.join(data_root, "data.csv"))
        text_paths = {
            lang: os.path.join(text_root, f"{lang}.csv") for lang in langs
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "audio_path": audio_path,
                    "text_paths": text_paths,
                },
            )
        ]

    def _generate_examples(self, audio_path, text_paths):
        """Yield ``(key, example)`` pairs read from the transcription CSV files.

        Args:
            audio_path: Directory that the file paths listed in the CSV files
                are relative to.
            text_paths: Mapping of configuration name to CSV file path. The
                first row of each CSV is skipped as a header; every other row
                is expected to hold a ``/``-separated file path and a
                transcription.

        Yields:
            Tuples of a running integer key (shared across all CSV files) and
            a dict with the ``path``, ``audio`` and ``transcription`` entries.

        Raises:
            ValueError: If a non-blank row does not have exactly two columns.
        """
        key = 0
        for text_path in text_paths.values():
            # newline="" is what the csv module expects so that it can handle
            # line endings (including ones inside quoted fields) itself.
            with open(text_path, encoding="utf-8", newline="") as csv_file:
                csv_reader = csv.reader(
                    csv_file, delimiter=",", skipinitialspace=True
                )
                # Skip the header row. The default keeps an empty file from
                # raising StopIteration, which inside a generator would be
                # turned into a RuntimeError.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to emit.
                        continue
                    file_path, transcription = row
                    # Paths in the CSV use "/" separators; rebuild them with
                    # the local separator underneath the audio directory.
                    file_path = os.path.join(audio_path, *file_path.split("/"))
                    yield key, {
                        "path": file_path,
                        "audio": file_path,
                        "transcription": transcription,
                    }
                    key += 1
|