None committed on
Commit
3b17866
1 Parent(s): fa2d879

Create SimpleDataset.py

Browse files

Simplest possible audio dataset for ASR?

Files changed (1) hide show
  1. SimpleDataset.py +120 -0
SimpleDataset.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Lint as: python3
2
+ """Simple, minimal ASR dataset template."""
3
+
4
+
5
+ import csv
6
+ import os
7
+
8
+ import datasets
9
+ from datasets.tasks import AutomaticSpeechRecognition
10
+
11
+
12
+ _CITATION = ""
13
+
14
+ _DESCRIPTION = """\
15
+ This is a private dataset
16
+ """
17
+
18
+ _URL = "https://localhost"
19
+ _DL_URL = "http://localhost:8000/data_simple.tgz"
20
+
21
+
22
class SimpleTplConfig(datasets.BuilderConfig):
    """BuilderConfig for the simple ASR dataset template."""

    def __init__(self, name, **kwargs):
        """Create a config with a fixed version ("1.1.0") and description.

        Args:
            name: `string`, name of the configuration.
            **kwargs: keyword arguments forwarded to super. `version` and
                `description` are already supplied here, so passing either
                one again raises a TypeError.
        """
        # Hard-coded value; it is not read anywhere else in the visible code.
        self.num_of_voice = 100

        description = "Simple Dataset."
        super().__init__(
            name=name,
            version=datasets.Version("1.1.0", ""),
            description=description,
            **kwargs,
        )
40
+
41
class SimpleTpl(datasets.GeneratorBasedBuilder):
    """Simple Speech dataset.

    Downloads a single archive and exposes train / validation / test splits.
    Each split is described by a two-column CSV file (audio path, sentence)
    whose first row is a header.
    """

    VERSION = datasets.Version("1.1.0")

    DEFAULT_WRITER_BATCH_SIZE = 1000
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            version=VERSION,
            description="The simple dataset",
        )
    ]

    def _info(self):
        """Return the dataset metadata: features, homepage, citation and task template."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Audio(sampling_rate=16000),
                    "path": datasets.Value("string"),
                    "sentence": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(
                    audio_file_path_column="path",
                    transcription_column="sentence",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the three splits.

        Args:
            dl_manager: download manager; only `download_and_extract` is used.

        Returns:
            A list of `datasets.SplitGenerator`, one each for TRAIN,
            VALIDATION and TEST. All of them share the same audio folder and
            differ only in the CSV file they read.
        """
        root_path = dl_manager.download_and_extract(_DL_URL)
        # The archive is expected to contain a top-level "data_simple" folder.
        root_path = os.path.join(root_path, "data_simple")
        wav_path = os.path.join(root_path, "audio")
        train_csv = os.path.join(root_path, "train.csv")
        valid_csv = os.path.join(root_path, "valid.csv")
        test_csv = os.path.join(root_path, "test.csv")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"wav_path": wav_path, "csv_path": train_csv},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"wav_path": wav_path, "csv_path": valid_csv},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"wav_path": wav_path, "csv_path": test_csv},
            ),
        ]

    def _generate_examples(self, wav_path, csv_path):
        """Generate examples from one split's CSV file.

        Args:
            wav_path: folder holding the extracted audio files.
            csv_path: CSV file of the split; the first row is skipped as a
                header and every other non-blank row must have two columns.

        Yields:
            `(key, example)` pairs where `key` is the audio file path and
            `example` has the keys "path", "audio" and "sentence".

        Raises:
            ValueError: if a non-blank row does not have exactly two columns.
                Quoting is disabled, so a sentence that contains a comma
                triggers this.
        """
        # newline="" is what the csv module recommends for files it reads.
        with open(csv_path, encoding="utf-8", newline="") as csv_file:
            csv_reader = csv.reader(
                csv_file,
                delimiter=",",
                quotechar=None,
                skipinitialspace=True,
            )

            # Skip the header row.
            next(csv_reader, None)

            for row in csv_reader:
                if not row:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                if len(row) != 2:
                    raise ValueError(
                        f"{csv_path}:{csv_reader.line_num}: expected 2 columns "
                        f"(path, sentence), got {len(row)}"
                    )

                audio_file, sentence = row
                # Resolve the CSV entry against the audio folder instead of
                # overwriting the `wav_path` argument. os.path.join returns
                # `audio_file` unchanged when it is already absolute.
                # NOTE(review): assumes relative CSV paths are relative to
                # the "audio" folder -- confirm against the archive layout.
                audio_file = os.path.join(wav_path, audio_file)

                example = {
                    "path": audio_file,
                    "audio": audio_file,
                    "sentence": sentence,
                }

                yield audio_file, example