m3hrdadfi committed on
Commit 0b06806
1 Parent(s): c239b93

Add scripts for later job ft

notes/{data_preparation.ipynb → data_preparation_ft.ipynb} RENAMED
File without changes
notes/data_preparation_pt.ipynb ADDED
@@ -0,0 +1,626 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']"
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sys.path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if \"../src\" not in sys.path:\n",
+ " sys.path.insert(0, \"../src\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from normalizer import normalizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n",
+ "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n",
+ "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n",
+ "این کتاب بهترین در نوع شتر آسان‌تر هست \n",
+ "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n"
+ ]
+ }
+ ],
+ "source": [
+ "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n",
+ "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+ "\n",
+ "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n",
+ "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+ "\n",
+ "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n",
+ "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+ "\n",
+ "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n",
+ "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+ "\n",
+ "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n",
+ "print(normalizer({\"sentence\": input_text}, return_dict=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !mkdir -p /home/m3hrdadfi/code/data\n",
+ "# %cd /home/m3hrdadfi/code/data\n",
+ "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
+ "# %cd /home/m3hrdadfi/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import os\n",
+ "\n",
+ "# lang = \"fa\"\n",
+ "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n",
+ "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
+ "# print(abs_path_to_data)\n",
+ "# print(save_path)\n",
+ "# print()\n",
+ "# !ls {save_path}\n",
+ "# !ls {abs_path_to_data}/*.tsv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def normalizer_without_batch(text, pruning=False):\n",
+ " # Normalize a single sentence; with pruning=True, drop sentences of three words or fewer\n",
+ " try:\n",
+ " batch = {\n",
+ " \"sentence\": text\n",
+ " }\n",
+ " text = normalizer(batch, return_dict=False)\n",
+ " \n",
+ " if pruning:\n",
+ " if not len(text.split()) > 3:\n",
+ " text = None\n",
+ " \n",
+ " except Exception:\n",
+ " print(text)\n",
+ " text = None\n",
+ " \n",
+ " return text"
+ ]
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
+ "\n",
+ "# print(f\"Step 0: {len(test_df)}\")\n",
+ "\n",
+ "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
+ "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+ "# test_df = test_df.dropna(subset=[\"status\"])\n",
+ "# test_df = test_df.drop(\"status\", 1)\n",
+ "# print(f\"Step 1: {len(test_df)}\")\n",
+ "\n",
+ "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n",
+ "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+ "# test_df = test_df.dropna(subset=[\"sentence\"])\n",
+ "# print(f\"Step 2: {len(test_df)}\")\n",
+ "\n",
+ "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+ "# test_df = test_df.drop_duplicates(subset=\"path\")\n",
+ "# print(f\"Step 3: {len(test_df)}\")\n",
+ "\n",
+ "# test_df = test_df.reset_index(drop=True)\n",
+ "# test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# _train_df = pd.concat([\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+ "# ])\n",
+ "# print(len(_train_df))\n",
+ "\n",
+ "# train_df = pd.concat([\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
+ "# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
+ "# ])\n",
+ "# print(f\"Step 0: {len(train_df)}\")\n",
+ "\n",
+ "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
+ "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+ "# train_df = train_df.dropna(subset=[\"status\"])\n",
+ "# train_df = train_df.drop(\"status\", 1)\n",
+ "# print(f\"Step 1: {len(train_df)}\")\n",
+ "\n",
+ "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n",
+ "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n",
+ "# train_df = train_df.dropna(subset=[\"sentence\"])\n",
+ "# print(f\"Step 2: {len(train_df)}\")\n",
+ "\n",
+ "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+ "# train_df = train_df.drop_duplicates(subset=\"path\")\n",
+ "# print(f\"Step 3: {len(train_df)}\")\n",
+ "\n",
+ "# train_df = train_df.sample(frac=1)\n",
+ "# train_df = train_df.reset_index(drop=True)\n",
+ "# train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from tqdm import tqdm\n",
+ "\n",
+ "# testset_indices = []\n",
+ "\n",
+ "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
+ "# _id = row[\"path\"]\n",
+ "# finder = train_df[train_df[\"path\"] == _id]\n",
+ "# if len(finder) > 0:\n",
+ "# testset_indices.extend(list(finder.index))\n",
+ "\n",
+ "# testset_indices = list(set(testset_indices))\n",
+ "# print(f\"Found #{len(testset_indices)} test data\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print(len(train_df))\n",
+ "# train_df = train_df.drop(testset_indices)\n",
+ "# print(len(train_df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import pandas as pd\n",
+ "\n",
+ "# df = pd.concat([train_df, test_df], axis=0)\n",
+ "# # df = validated_df.copy()\n",
+ "# print(df.info())\n",
+ "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+ "# # df = df.dropna(subset=[\"sentence\"])\n",
+ "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n",
+ "# df = df.reset_index(drop=True)\n",
+ "# print(df.info())\n",
+ "# df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import torchaudio\n",
+ "# import librosa\n",
+ "# import IPython.display as ipd\n",
+ "# import numpy as np\n",
+ "\n",
+ "# def load_audio(path):\n",
+ "# speech, sr = torchaudio.load(path)\n",
+ "# speech = speech[0].numpy().squeeze() \n",
+ "# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
+ 
+ "# print(speech.shape, sr)\n",
+ 
+ "# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n",
+ "# text = \" \".join(df[\"sentence\"].values.tolist())\n",
+ "# vocab = list(sorted(set(text)))\n",
+ "\n",
+ "# for v in main_vocab:\n",
+ "# if v not in vocab:\n",
+ "# print(\"v\", v)\n",
+ "\n",
+ "# print(len(main_vocab), len(vocab))\n",
+ "# print(len(vocab), vocab)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import numpy as np\n",
+ "\n",
+ "\n",
+ "# idx = np.random.randint(0, len(df))\n",
+ "# # idx = 6140\n",
+ "# sample = df.iloc[idx]\n",
+ "# ipd.display(sample)\n",
+ "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+ "# print()\n",
+ "# print(sample[\"prev_sentence\"])\n",
+ "# print(sample[\"sentence\"])\n",
+ "# print()\n",
+ "# load_audio(sample[\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_train_df = train_df.copy()\n",
+ "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
+ "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+ "# print(new_train_df.info())\n",
+ "# new_train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_test_df = test_df.copy()\n",
+ "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
+ "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+ "# print(new_test_df.info())\n",
+ "# new_test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import shutil\n",
+ "# from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !mkdir -p {save_path}/clips\n",
+ "# !mkdir -p {save_path}/augs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
+ "# shutil.copy(row[\"_path\"], row[\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
+ "# shutil.copy(row[\"_path\"], row[\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # aug_train_df = new_train_df.copy()\n",
+ "# aug_train_df = new_train_df.sample(frac=0.1)\n",
+ "# aug_train_df = aug_train_df.reset_index(drop=True)\n",
+ "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
+ "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
+ "# print(aug_train_df.info())\n",
+ "# aug_train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print(aug_train_df.iloc[0][\"_path\"])\n",
+ "# print(aug_train_df.iloc[0][\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # augmentation\n",
+ "\n",
+ "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
+ "# import numpy as np\n",
+ "# import soundfile as sf\n",
+ "\n",
+ "# augment = Compose([\n",
+ "# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+ "# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
+ "# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
+ "# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+ "# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
+ "# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
+ "# ])\n",
+ "\n",
+ "# def augmented_speech_file_to_array_fn(in_path, out_path):\n",
+ "# speech_array, sampling_rate = torchaudio.load(in_path)\n",
+ "# speech_array = speech_array.squeeze().numpy()\n",
+ "# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
+ "# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
+ "# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
+ "# !ls"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
+ "# new_train_aug_df = new_train_df.copy()\n",
+ "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
+ "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
+ "# print(new_train_aug_df.info())\n",
+ "# new_train_aug_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+ "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+ "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_train_df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_test_df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import pandas as pd\n",
+ "\n",
+ "# import os\n",
+ "# from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
+ "# print(train_df.info())\n",
+ "# train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
+ "# print(test_df.info())\n",
+ "# test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# non_existed_train = []\n",
+ "\n",
+ "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
+ "# if not os.path.exists(row[\"path\"]):\n",
+ "# non_existed_train.append(index)\n",
+ "# break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import numpy as np\n",
+ "\n",
+ "\n",
+ "# idx = np.random.randint(0, len(train_df))\n",
+ "# # idx = 6140\n",
+ "# sample = train_df.iloc[idx]\n",
+ "# ipd.display(sample)\n",
+ "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+ "# print()\n",
+ "# print(sample[\"prev_sentence\"])\n",
+ "# print(sample[\"sentence\"])\n",
+ "# print()\n",
+ "# load_audio(sample[\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_df_half = train_df.copy()\n",
+ "# print(train_df_half.shape)\n",
+ "# train_df_half = train_df_half.dropna()\n",
+ "# print(train_df_half.shape)\n",
+ "# train_df_half = train_df_half.drop_duplicates()\n",
+ "# print(train_df_half.shape)\n",
+ "\n",
+ "# train_df_half = train_df_half.sample(frac=0.5)\n",
+ "# train_df_half = train_df_half.reset_index(drop=True)\n",
+ "# print(train_df_half.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "transformers",
+ "name": "transformers"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.4"
+ },
+ "orig_nbformat": 2
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
notes/fa.tar.gz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9f3c53202d7d12dfe973604737fc11b0a50c9c94b85c4cae70fcc693fe2babb4
- size 7020110
src/fine-tuning/__init__.py ADDED
File without changes
src/{dictionary.py → fine-tuning/dictionary.py} RENAMED
File without changes
src/{normalizer.py → fine-tuning/normalizer.py} RENAMED
File without changes