Zaid committed on
Commit
3e1fdc5
1 Parent(s): 1f6c681

Create dialects_speech_corpus.py

Files changed (1)
  1. dialects_speech_corpus.py +108 -0
dialects_speech_corpus.py ADDED
"""Arabic Speech Corpus"""

from __future__ import absolute_import, division, print_function

import os

import soundfile as sf

import datasets

_CITATION = """
"""

_DESCRIPTION = """\
Egyptian Arabic dialect speech corpus (MGB-3). The audio can be decoded with
soundfile, for example:

```python
import soundfile as sf

def map_to_array(batch):
    speech_array, _ = sf.read(batch["file"])
    batch["speech"] = speech_array
    return batch

dataset = dataset.map(map_to_array, remove_columns=["file"])
```
"""

_URL = "mgb3.zip"

# Audio files flagged as corrupt; segments that point at them are skipped.
corrupt_files = [
    "familyKids_02_first_12min.wav", "sports_04_first_12min.wav",
    "cooking_05_first_12min.wav", "moviesDrama_07_first_12min.wav",
    "science_06_first_12min.wav", "comedy_09_first_12min.wav",
    "cultural_08_first_12min.wav", "familyKids_11_first_12min.wav",
    "science_10_first_12min.wav",
]


class EgyptianSpeechCorpusConfig(datasets.BuilderConfig):
    """BuilderConfig for EgyptianSpeechCorpus."""

    def __init__(self, **kwargs):
        """
        Args:
          data_dir: `string`, the path to the folder containing the files in the
            downloaded archive
          citation: `string`, citation for the data set
          url: `string`, url for information about the data set
          **kwargs: keyword arguments forwarded to super.
        """
        super(EgyptianSpeechCorpusConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)


def map_to_array(batch):
    """Decode only the labelled segment of a wav file (start/stop are frame offsets)."""
    start, stop = map(int, batch["segment"].split("_"))
    speech_array, _ = sf.read(batch["file"], start=start, stop=stop)
    batch["speech"] = speech_array
    return batch


class EgyptianSpeechCorpus(datasets.GeneratorBasedBuilder):
    """EgyptianSpeechCorpus dataset."""

    BUILDER_CONFIGS = [
        EgyptianSpeechCorpusConfig(name="clean", description="'Clean' speech."),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "segment": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        # The archive is expected to be extracted locally already; downloads
        # are not handled here.
        self.archive_path = "/content/mgb3"
        return [
            datasets.SplitGenerator(name="train", gen_kwargs={"archive_path": os.path.join(self.archive_path, "adapt")}),
            datasets.SplitGenerator(name="dev", gen_kwargs={"archive_path": os.path.join(self.archive_path, "dev")}),
            datasets.SplitGenerator(name="test", gen_kwargs={"archive_path": os.path.join(self.archive_path, "test")}),
        ]

    def _generate_examples(self, archive_path):
        """Generate examples from an MGB-3 archive_path."""
        text_dir = os.path.join(archive_path, "Alaa")
        # The wav files are shared across splits and live at the corpus root.
        wav_dir = os.path.join(self.archive_path, "wav")
        available_wavs = set(os.listdir(wav_dir))

        segments_file = os.path.join(text_dir, "text_noverlap")

        with open(segments_file, "r", encoding="utf-8") as f:
            for _id, line in enumerate(f):
                # Each line is "<segment_id> <transcription>"; the segment id
                # encodes the wav file name in its first four "_"-separated
                # fields and the start/stop frame offsets in the next two.
                segment = line.split(" ")[0]
                text = " ".join(line.split(" ")[1:])
                wav_file = "_".join(segment.split("_")[:4]) + ".wav"
                start, stop = segment.split("_")[4:6]
                wav_path = os.path.join(wav_dir, wav_file)
                if (wav_file in corrupt_files) or (wav_file not in available_wavs):
                    continue
                example = {
                    "file": wav_path,
                    "text": text,
                    "segment": "_".join([start, stop]),
                }
                yield str(_id), example
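A minimal usage sketch, assuming the script above is saved locally as dialects_speech_corpus.py, the MGB-3 archive has already been extracted to /content/mgb3 (as the hard-coded paths expect), and a datasets version that still accepts local loading scripts; the decode_segment helper below mirrors the script's own map_to_array:

```python
import soundfile as sf
from datasets import load_dataset

# Load the "clean" config defined by the script; "train" maps to the "adapt" folder.
dataset = load_dataset("dialects_speech_corpus.py", "clean", split="train")

def decode_segment(batch):
    # "segment" stores "<start>_<stop>" frame offsets into the 12-minute wav file.
    start, stop = map(int, batch["segment"].split("_"))
    speech_array, sampling_rate = sf.read(batch["file"], start=start, stop=stop)
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    return batch

dataset = dataset.map(decode_segment)
print(dataset[0]["text"], len(dataset[0]["speech"]))
```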