File size: 2,384 Bytes
b7f4dbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Lint as: python3
"""CREMA-D dataset."""

import os
from typing import Union

import datasets
import pandas as pd

# Human-readable summary of the corpus; passed as `description` to
# `datasets.DatasetInfo` in `Crema._info`.
_DESCRIPTION = """\
    CREMA-D is a data set of 7,442 original clips from 91 actors.
    These clips were from 48 male and 43 female actors between the ages of 20 and 74
    coming from a variety of races and ethnicities (African America, Asian,
    Caucasian, Hispanic, and Unspecified). Actors spoke from a selection of 12
    sentences. The sentences were presented using one of six different emotions
    (Anger, Disgust, Fear, Happy, Neutral, and Sad) and four different emotion
    levels (Low, Medium, High, and Unspecified).
"""

# Project homepage; passed as `homepage` to `datasets.DatasetInfo` in `Crema._info`.
_HOMEPAGE = "https://github.com/CheyneyComputerScience/CREMA-D"

# Maps a split name to the sub-directory (relative to the extracted data
# directory) that `Crema._generate_examples` lists for that split.
DATA_DIR = {"train": "AudioWAV"}


class Crema(datasets.GeneratorBasedBuilder):
    """Dataset builder for the CREMA-D emotional speech clips.

    Every example is a ``{"file": <path to clip>, "label": <label>}`` pair of
    strings. Clips are read from the ``DATA_DIR`` sub-directory of the
    location supplied as ``data_dir`` in the builder configuration.
    """

    # Class attribute consumed by the `datasets` builder base class.
    # NOTE(review): presumably the number of examples buffered per write;
    # confirm against the `datasets` documentation.
    DEFAULT_WRITER_BATCH_SIZE = 256
    BUILDER_CONFIGS = [datasets.BuilderConfig(name="clean", description="Train Set.")]

    def _info(self):
        """Return the dataset metadata: description, feature schema, homepage."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"file": datasets.Value("string"), "label": datasets.Value("string")}
            ),
            supervised_keys=("file", "label"),
            homepage=_HOMEPAGE,
        )

    def _split_generators(
        self, dl_manager: datasets.utils.download_manager.DownloadManager
    ):
        """Return the split generators for the selected configuration.

        Args:
            dl_manager: Download manager used to extract ``config.data_dir``.

        Returns:
            A list with a single ``train`` split generator.

        Raises:
            ValueError: If the configuration name is not ``"clean"``.
        """
        # Fail with an explicit message instead of an UnboundLocalError when
        # an unsupported configuration name is requested.
        if self.config.name != "clean":
            raise ValueError(
                "Unsupported configuration {!r}; expected 'clean'.".format(
                    self.config.name
                )
            )

        data_dir = dl_manager.extract(self.config.data_dir)
        return [
            datasets.SplitGenerator(
                name="train", gen_kwargs={"files": data_dir, "name": "train"}
            )
        ]

    def _generate_examples(self, files: Union[str, os.PathLike], name: str):
        """Yield ``(key, example)`` pairs from an extracted CREMA-D directory.

        The label is the second-to-last underscore-separated field of the
        file name, e.g. ``ANG`` for ``1001_DFA_ANG_XX.wav``.

        Args:
            files: Root of the extracted data directory.
            name: Split name; selects the audio sub-directory via ``DATA_DIR``.

        Yields:
            Tuples of a running integer key (starting at 0) and a dict with
            the clip path under ``"file"`` and its label under ``"label"``.

        Raises:
            FileNotFoundError: If the audio sub-directory does not exist.
            IndexError: If a ``.wav`` file name contains no underscore.
        """
        audio_dir = os.path.join(files, DATA_DIR[name])
        if not os.path.exists(audio_dir):
            raise FileNotFoundError(
                "CREMA-D audio directory not found: {}".format(audio_dir)
            )

        key = 0
        # os.listdir returns entries in arbitrary order; sort so that keys and
        # example order are reproducible across runs and machines.
        for filename in sorted(os.listdir(audio_dir)):
            # Skip stray non-audio entries (e.g. ".DS_Store") that would
            # otherwise get a bogus label or crash the label extraction.
            if not filename.lower().endswith(".wav"):
                continue
            yield key, {
                "file": os.path.join(audio_dir, filename),
                "label": filename.split("_")[-2],
            }
            key += 1