jimregan committed on
Commit
17dc65f
1 Parent(s): 48042db

add timit prep notebook

Browse files
Files changed (1) hide show
  1. timit-fairseq.ipynb +292 -0
timit-fairseq.ipynb ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 73,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from datasets import load_dataset, concatenate_datasets\n",
10
+ "import soundfile as sf"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "PAD = \"<pad>\"\n",
20
+ "UNK = \"<unk>\"\n",
21
+ "SIL = \"<sil>\"\n",
22
+ "SPN = \"<spn>\""
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 16,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "VOCAB_ITEMS =\"\"\"\n",
32
+ "AA\n",
33
+ "AE\n",
34
+ "AH\n",
35
+ "AO\n",
36
+ "AW\n",
37
+ "AX\n",
38
+ "AY\n",
39
+ "EH\n",
40
+ "ER\n",
41
+ "EY\n",
42
+ "IH\n",
43
+ "IY\n",
44
+ "OW\n",
45
+ "OY\n",
46
+ "UH\n",
47
+ "UW\n",
48
+ "UX\n",
49
+ "B\n",
50
+ "CH\n",
51
+ "D\n",
52
+ "DH\n",
53
+ "DX\n",
54
+ "EL\n",
55
+ "EM\n",
56
+ "EN\n",
57
+ "F\n",
58
+ "G\n",
59
+ "HH\n",
60
+ "JH\n",
61
+ "K\n",
62
+ "L\n",
63
+ "M\n",
64
+ "N\n",
65
+ "NG\n",
66
+ "NX\n",
67
+ "P\n",
68
+ "Q\n",
69
+ "R\n",
70
+ "S\n",
71
+ "SH\n",
72
+ "T\n",
73
+ "TH\n",
74
+ "V\n",
75
+ "W\n",
76
+ "WH\n",
77
+ "Y\n",
78
+ "Z\n",
79
+ "ZH\n",
80
+ " \n",
81
+ ".\n",
82
+ ",\n",
83
+ "?\n",
84
+ "!\n",
85
+ "\"\"\""
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 17,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "_VOCAB_SPLIT = VOCAB_ITEMS.split(\"\\n\")[1:-1]"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 18,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "VOCAB = {e[1]:e[0] for e in enumerate(_VOCAB_SPLIT)}"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 68,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "TIMIT_MAPPING = {\n",
113
+ " 'ax': 'AH',\n",
114
+ " 'ax-h': 'AH',\n",
115
+ " 'axr': 'ER',\n",
116
+ " 'dx': 'T',\n",
117
+ " 'el': ['AH', 'L'],\n",
118
+ " 'em': ['AH', 'M'],\n",
119
+ " 'en': ['AH', 'N'],\n",
120
+ " 'eng': ['IH', 'NG'],\n",
121
+ " 'hv': 'HH',\n",
122
+ " 'ix': 'IH',\n",
123
+ " 'nx': ['N', 'T'],\n",
124
+ " 'pau': '<sil>',\n",
125
+ " 'epi': '<sil>',\n",
126
+ " 'ux': 'UW'\n",
127
+ "}\n",
128
+ "TIMIT_IGNORE = ['bcl', 'dcl', 'gcl', 'kcl', 'pcl', 'tcl']\n",
129
+ "TIMIT_DISCARD = ['dx', 'nx', 'q']"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 66,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "def map_timit_to_cmudict(timit):\n",
139
+ " output = []\n",
140
+ "\n",
141
+ " start = 1 if timit[0] == \"h#\" else 0\n",
142
+ " end = -1 if timit[-1] == \"h#\" else None\n",
143
+ " timit = timit[start:end]\n",
144
+ "\n",
145
+ " for phone in timit:\n",
146
+ " if phone in TIMIT_MAPPING:\n",
147
+ " if type(TIMIT_MAPPING[phone]) == list:\n",
148
+ " output += TIMIT_MAPPING[phone]\n",
149
+ " else:\n",
150
+ " output.append(TIMIT_MAPPING[phone])\n",
151
+ " elif phone in TIMIT_IGNORE:\n",
152
+ " pass\n",
153
+ " else:\n",
154
+ " if not phone.upper() in VOCAB:\n",
155
+ " print(\"Invalid phone\", phone.upper())\n",
156
+ " output.append(phone.upper())\n",
157
+ " return output"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "timit = load_dataset('timit_asr')"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 75,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "def is_discardable(batch):\n",
176
+ " for phoneme in batch[\"phonetic_detail\"][\"utterance\"]:\n",
177
+ " if phoneme in TIMIT_DISCARD:\n",
178
+ " return False\n",
179
+ " return True"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "timit_filt = timit[\"train\"].filter(lambda eg: is_discardable(eg))"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "timit_filt2 = timit[\"test\"].filter(lambda eg: is_discardable(eg))"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 78,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "timit = concatenate_datasets([timit_filt, timit_filt2])"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": 54,
212
+ "metadata": {},
213
+ "outputs": [],
214
+ "source": [
215
+ "MAX_TOKENS = 1120000"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 58,
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "manifest_path = \"manifest.tsv\"\n",
225
+ "transcript_path = \"transcript\""
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 70,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "BASE = timit[0][\"file\"].split(\"/data/\")[0] + \"/data/\""
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 82,
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": [
243
+ "resplit = timit.train_test_split(test_size=0.1)"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 86,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "for split in [\"train\", \"test\"]:\n",
253
+ " fsplit = split\n",
254
+ " if fsplit == \"test\":\n",
255
+ " fsplit = \"valid\"\n",
256
+ " with open(f\"{fsplit}.tsv\", \"w\") as manifest, open(f\"{fsplit}.ltr\", \"w\") as transcript:\n",
257
+ " manifest.write(BASE + \"\\n\")\n",
258
+ " for item in resplit[split]:\n",
259
+ " frames, sr = sf.read(item[\"file\"])\n",
260
+ " manifest.write(f\"{item['file'].replace(BASE, '')}\\t{len(frames)}\\n\")\n",
261
+ " utt = item['phonetic_detail']['utterance']\n",
262
+ " mapped = map_timit_to_cmudict(utt)\n",
263
+ " transcript.write(f\"{' '.join(mapped)}\\n\")\n"
264
+ ]
265
+ }
266
+ ],
267
+ "metadata": {
268
+ "interpreter": {
269
+ "hash": "279d017b1d681737e71f35b98eaa9087df824225149f0ac59acfe151b4fa281b"
270
+ },
271
+ "kernelspec": {
272
+ "display_name": "Python 3.8.12 ('psst')",
273
+ "language": "python",
274
+ "name": "python3"
275
+ },
276
+ "language_info": {
277
+ "codemirror_mode": {
278
+ "name": "ipython",
279
+ "version": 3
280
+ },
281
+ "file_extension": ".py",
282
+ "mimetype": "text/x-python",
283
+ "name": "python",
284
+ "nbconvert_exporter": "python",
285
+ "pygments_lexer": "ipython3",
286
+ "version": "3.8.12"
287
+ },
288
+ "orig_nbformat": 4
289
+ },
290
+ "nbformat": 4,
291
+ "nbformat_minor": 2
292
+ }