File size: 3,283 Bytes
6b934c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd5dfd8
6b934c8
 
cd5dfd8
6b934c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import csv
import os
import datasets

# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)

# Citation text attached to the dataset info.
# NOTE(review): this is placeholder text and it carries a stray "}" line --
# presumably left over from a BibTeX template; replace with the real entry.
_CITATION = """\
ASA
}
"""

# Configuration names offered by the builder, kept in sorted order.
_ALL_CONFIGS = sorted([
    "id",
])

# Short description used for every builder config and for the dataset info.
_DESCRIPTION = "ASA"

# Homepage reported in the dataset info.
_HOMEPAGE_URL = "https://huggingface.co/PokeABear00/usu_asa_asr"

# Archive handed to the download manager.
# NOTE(review): presumably resolved relative to the dataset repository -- confirm.
_DATA_URL = "ASA.zip"

class ASAConfig(datasets.BuilderConfig):
    """BuilderConfig for the ASA speech dataset.

    Extends the base config with the dataset homepage and the location of
    the data archive.
    """

    def __init__(
        self, name, description, homepage, data_url
    ):
        """Create a configuration.

        Args:
            name: Configuration name.
            description: Human-readable description of the configuration.
            homepage: URL of the dataset homepage.
            data_url: Location of the archive handed to the download manager.
        """
        # Forward the actual arguments. Reading `self.name` /
        # `self.description` here, before the base initializer has run, would
        # only pick up whatever the base class exposes at class level instead
        # of the values requested by the caller.
        super().__init__(
            name=name,
            version=datasets.Version("1.0.0", ""),
            description=description,
        )
        self.homepage = homepage
        self.data_url = data_url

def _build_config(name):
    """Return an ``ASAConfig`` called *name* that uses the module-level defaults."""
    shared_settings = {
        "description": _DESCRIPTION,
        "homepage": _HOMEPAGE_URL,
        "data_url": _DATA_URL,
    }
    return ASAConfig(name=name, **shared_settings)

class ASA(datasets.GeneratorBasedBuilder):
    """Dataset builder yielding audio file paths paired with transcriptions."""

    DEFAULT_WRITER_BATCH_SIZE = 1000
    # One config per entry of _ALL_CONFIGS plus an aggregate "all" config.
    BUILDER_CONFIGS = [_build_config(name) for name in _ALL_CONFIGS + ["all"]]

    def _info(self):
        """Return the dataset metadata: features, supervised keys, homepage, citation."""
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "transcription": datasets.Value("string"),
            }
        )

        # `task_templates` is deliberately not passed: the only value ever
        # supplied was None, and omitting the keyword keeps this call working
        # on library versions that no longer accept it.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=("audio", "transcription"),
            homepage=self.config.homepage,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data and declare a single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch and extract the data.

        Returns:
            A one-element list with the TRAIN ``SplitGenerator``; its
            ``gen_kwargs`` carry the extracted audio location and a mapping
            from config name to CSV path.
        """
        # The "all" config covers every known config name.
        config_names = (
            _ALL_CONFIGS
            if self.config.name == "all"
            else [self.config.name]
        )

        archive_path = dl_manager.download_and_extract(self.config.data_url)
        audio_path = dl_manager.extract(
            os.path.join(archive_path, "usu_asa_asr", "audio.zip")
        )
        # NOTE(review): "data.csv" is passed to extract() and the result is
        # then treated as a directory containing "<config>.csv" -- confirm
        # this matches the real archive layout.
        text_root = dl_manager.extract(
            os.path.join(archive_path, "usu_asa_asr", "data.csv")
        )

        text_paths = {
            config_name: os.path.join(text_root, f"{config_name}.csv")
            for config_name in config_names
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "audio_path": audio_path,
                    "text_paths": text_paths,
                },
            )
        ]

    def _generate_examples(self, audio_path, text_paths):
        """Yield ``(key, example)`` pairs read from the per-config CSV files.

        Each CSV is read with its first row skipped as a header; every
        remaining non-empty row must hold exactly two columns: a "/"-separated
        audio path relative to *audio_path*, and the transcription.

        Args:
            audio_path: Directory the relative audio paths are joined onto.
            text_paths: Mapping from config name to CSV file path.

        Yields:
            Tuples of a running integer key and a dict with the keys
            ``"path"``, ``"audio"`` and ``"transcription"``.

        Raises:
            ValueError: If a non-empty row does not have exactly two columns.
        """
        key = 0
        for text_path in text_paths.values():
            # newline="" lets the csv module handle line endings itself.
            with open(text_path, encoding="utf-8", newline="") as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True)
                # Skip the header; the default keeps an empty file from
                # raising StopIteration inside this generator.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        # Blank lines come back as empty rows; ignore them.
                        continue
                    relative_path, transcription = row

                    file_path = os.path.join(audio_path, *relative_path.split("/"))
                    yield key, {
                        "path": file_path,
                        "audio": file_path,
                        "transcription": transcription,
                    }
                    key += 1