usu_asa_asr / usu_asa_asr.py
PokeABear00's picture
Update usu_asa_asr.py
cd5dfd8
raw
history blame contribute delete
No virus
3.28 kB
import csv
import os
import datasets
# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)
# Citation text reported in the dataset info.
# NOTE(review): the lone "}" looks like a leftover from a BibTeX template —
# confirm and replace with a real citation entry.
_CITATION = """\
ASA
}
"""

# Names of the individual builder configurations; the builder adds an extra
# "all" configuration on top of these.
_ALL_CONFIGS = sorted([
    "id",
])

# Description shared by every builder config and by the dataset info.
_DESCRIPTION = "ASA"

# Homepage URL stored on every builder config.
_HOMEPAGE_URL = "https://huggingface.co/PokeABear00/usu_asa_asr"

# Archive handed to the download manager.
# NOTE(review): presumably resolved relative to the dataset repository — confirm.
_DATA_URL = "ASA.zip"
class ASAConfig(datasets.BuilderConfig):
    """BuilderConfig for the ASA speech dataset.

    Args:
        name: Name of the configuration.
        description: Human-readable description of the configuration.
        homepage: URL of the dataset homepage.
        data_url: Location of the data archive to download.
    """

    def __init__(self, name, description, homepage, data_url):
        # Forward the caller's arguments themselves. Reading `self.name` /
        # `self.description` at this point would happen before this instance
        # has been given those values, so the base initializer would never
        # see (or validate) what the caller actually asked for.
        super().__init__(
            name=name,
            version=datasets.Version("1.0.0", ""),
            description=description,
        )
        self.homepage = homepage
        self.data_url = data_url
def _build_config(name):
    """Create the ``ASAConfig`` registered under ``name``.

    Every configuration shares the module-level description, homepage and
    data URL; only the name differs between them.
    """
    shared_settings = {
        "description": _DESCRIPTION,
        "homepage": _HOMEPAGE_URL,
        "data_url": _DATA_URL,
    }
    return ASAConfig(name=name, **shared_settings)
class ASA(datasets.GeneratorBasedBuilder):
    """Dataset builder that yields audio/transcription pairs for ASA.

    One builder config exists per entry of ``_ALL_CONFIGS``, plus an ``"all"``
    config that covers every entry.
    """

    DEFAULT_WRITER_BATCH_SIZE = 1000
    BUILDER_CONFIGS = [_build_config(name) for name in _ALL_CONFIGS + ["all"]]

    def _info(self):
        """Return the dataset metadata (features, supervised keys, homepage, citation)."""
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "transcription": datasets.Value("string"),
            }
        )
        # `task_templates` was only ever passed as None, so it is left out.
        # NOTE(review): None is presumably the default, and recent `datasets`
        # releases reportedly no longer accept the argument — confirm against
        # the pinned version.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=("audio", "transcription"),
            homepage=self.config.homepage,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the data, then describe the single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch and extract the archive.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``; its
            ``gen_kwargs`` carry the extracted audio path and a mapping from
            config name to transcript CSV path.
        """
        # The "all" config covers every individual config name.
        if self.config.name == "all":
            langs = _ALL_CONFIGS
        else:
            langs = [self.config.name]

        archive_path = dl_manager.download_and_extract(self.config.data_url)
        dataset_root = os.path.join(archive_path, "usu_asa_asr")
        audio_path = dl_manager.extract(os.path.join(dataset_root, "audio.zip"))
        text_root = dl_manager.extract(os.path.join(dataset_root, "data.csv"))
        # NOTE(review): transcripts are looked up as "<config>.csv" *inside*
        # the path returned for "data.csv" — confirm this matches the actual
        # layout of the archive.
        text_paths = {
            lang: os.path.join(text_root, f"{lang}.csv") for lang in langs
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "audio_path": audio_path,
                    "text_paths": text_paths,
                },
            )
        ]

    def _generate_examples(self, audio_path, text_paths):
        """Yield ``(key, example)`` pairs read from the transcript CSV files.

        Args:
            audio_path: Directory that the relative audio paths in the CSV
                files are joined onto.
            text_paths: Mapping whose values are paths to CSV files with two
                columns per row: relative audio path and transcription.

        Yields:
            ``(key, example)`` where ``key`` is a running integer across all
            files and ``example`` holds ``"path"``, ``"audio"`` (both the
            audio file path) and ``"transcription"``.

        Raises:
            ValueError: If a non-blank row does not have exactly two columns.
        """
        key = 0
        for text_path in text_paths.values():
            # newline="" lets the csv module handle line endings itself.
            with open(text_path, encoding="utf-8", newline="") as csv_file:
                csv_reader = csv.reader(
                    csv_file, delimiter=",", skipinitialspace=True
                )
                # Skip the header row; the default keeps an empty file from
                # raising StopIteration inside this generator.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        # Blank lines come back as empty rows; skip them.
                        continue
                    file_path, transcription = row
                    # CSV paths use "/" separators; rebuild them for this OS.
                    file_path = os.path.join(audio_path, *file_path.split("/"))
                    yield key, {
                        "path": file_path,
                        "audio": file_path,
                        "transcription": transcription,
                    }
                    key += 1