pgwi committed on
Commit
ace3439
1 Parent(s): f8851f9

Upload 2 files

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  data/cv-corpus-15.0-2023-09-08/pt/times.txt filter=lfs diff=lfs merge=lfs -text
37
  data/cv-corpus-15.0-2023-09-08/pt/validated.tsv filter=lfs diff=lfs merge=lfs -text
38
  data/cv-corpus-15.0-2023-09-08/tr/validated.tsv filter=lfs diff=lfs merge=lfs -text
 
 
36
  data/cv-corpus-15.0-2023-09-08/pt/times.txt filter=lfs diff=lfs merge=lfs -text
37
  data/cv-corpus-15.0-2023-09-08/pt/validated.tsv filter=lfs diff=lfs merge=lfs -text
38
  data/cv-corpus-15.0-2023-09-08/tr/validated.tsv filter=lfs diff=lfs merge=lfs -text
39
+ titanet_finetune_tr.nemo filter=lfs diff=lfs merge=lfs -text
fine_tune_tianet_tr.ipynb ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "colab": {},
8
+ "colab_type": "code",
9
+ "id": "vnrUh3vuDSRN"
10
+ },
11
+ "outputs": [
12
+ {
13
+ "name": "stdout",
14
+ "output_type": "stream",
15
+ "text": [
16
+ "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import os\n",
23
+ "# prepare the train, dev, test dataset for Turkish language\n",
24
+ "tr_duration_df = pd.read_csv('data/tr/clip_durations.tsv', sep='\\t')\n",
25
+ "tr_train_df = pd.read_csv('data/tr/train.tsv', sep='\\t')\n",
26
+ "tr_dev_df = pd.read_csv('data/tr/dev.tsv', sep='\\t')\n",
27
+ "tr_test_df = pd.read_csv('data/tr/test.tsv', sep='\\t')\n",
28
+ "\n",
29
+ "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
30
+ "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
31
+ "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 2,
37
+ "metadata": {},
38
+ "outputs": [
39
+ {
40
+ "name": "stderr",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "<ipython-input-2-d0e6b5d0e689>:5: FutureWarning: The default value of regex will change from True to False in a future version.\n",
44
+ " merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
45
+ "<ipython-input-2-d0e6b5d0e689>:6: FutureWarning: The default value of regex will change from True to False in a future version.\n",
46
+ " merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
47
+ "<ipython-input-2-d0e6b5d0e689>:7: FutureWarning: The default value of regex will change from True to False in a future version.\n",
48
+ " merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n"
49
+ ]
50
+ }
51
+ ],
52
+ "source": [
53
+ "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
54
+ "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
55
+ "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n",
56
+ "\n",
57
+ "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
58
+ "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
59
+ "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
60
+ "\n",
61
+ "merged_tr_train_df['duration'] = merged_tr_train_df['duration'].apply(lambda x: x / 1000)\n",
62
+ "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'].apply(lambda x: x / 1000)\n",
63
+ "merged_tr_test_df['duration'] = merged_tr_test_df['duration'].apply(lambda x: x / 1000)\n",
64
+ "\n",
65
+ "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n",
66
+ "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n",
67
+ "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]\n",
68
+ "\n"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 3,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "all_data = pd.concat([merged_tr_train_df, merged_tr_dev_df, merged_tr_test_df])"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 4,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "unique_labels = all_data[\"label\"].unique()\n",
87
+ "train_rows = []\n",
88
+ "dev_rows = []\n",
89
+ "test_rows = []\n",
90
+ "for val in unique_labels:\n",
91
+ " subset = all_data[all_data['label'] == val].sample(frac=1).reset_index(drop=True) # Shuffle rows for the value\n",
92
+ " n = len(subset)\n",
93
+ " \n",
94
+ " train_end = int(0.8 * n)\n",
95
+ " dev_end = train_end + int(0.1 * n)\n",
96
+ " \n",
97
+ " train_rows.append(subset.iloc[:train_end])\n",
98
+ " dev_rows.append(subset.iloc[train_end:dev_end])\n",
99
+ " test_rows.append(subset.iloc[dev_end:])\n",
100
+ " \n",
101
+ "# Concatenate the per-label splits into the final train, dev and test DataFrames\n",
102
+ "train_df = pd.concat(train_rows, ignore_index=True)\n",
103
+ "dev_df = pd.concat(dev_rows, ignore_index=True)\n",
104
+ "test_df = pd.concat(test_rows, ignore_index=True)\n",
105
+ "test_df = test_df[test_df['label'].isin(train_df['label'].unique())]\n"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 5,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "train_df.to_json('data/tr/train.json', orient='records', lines=True)\n",
115
+ "dev_df.to_json('data/tr/dev.json', orient='records', lines=True)\n",
116
+ "test_df.to_json('data/tr/test.json', orient='records', lines=True)\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": null,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "devices: 1\n",
129
+ "accelerator: cpu\n",
130
+ "max_epochs: 10\n",
131
+ "max_steps: -1\n",
132
+ "num_nodes: 1\n",
133
+ "accumulate_grad_batches: 1\n",
134
+ "enable_checkpointing: false\n",
135
+ "logger: false\n",
136
+ "log_every_n_steps: 1\n",
137
+ "val_check_interval: 1.0\n",
138
+ "\n"
139
+ ]
140
+ },
141
+ {
142
+ "name": "stderr",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "GPU available: False, used: False\n",
146
+ "TPU available: False, using: 0 TPU cores\n",
147
+ "IPU available: False, using: 0 IPUs\n",
148
+ "HPU available: False, using: 0 HPUs\n",
149
+ "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n"
150
+ ]
151
+ },
152
+ {
153
+ "name": "stdout",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "[NeMo I 2023-09-29 17:44:57 exp_manager:381] Experiments will be logged at /v3io/users/User/en_tr_titanet_large/tb/TitaNet-Finetune/2023-09-29_17-44-57\n",
157
+ "[NeMo I 2023-09-29 17:44:57 exp_manager:815] TensorboardLogger has been set up\n",
158
+ "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
159
+ "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.\n",
160
+ "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n"
161
+ ]
162
+ },
163
+ {
164
+ "name": "stderr",
165
+ "output_type": "stream",
166
+ "text": [
167
+ "[NeMo W 2023-09-29 17:44:58 label_models:187] Total number of 1328 found in all the manifest files.\n"
168
+ ]
169
+ },
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
175
+ "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.\n",
176
+ "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n",
177
+ "[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
178
+ "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 4651 items, total duration of 4.47 hours.\n",
179
+ "[NeMo I 2023-09-29 17:44:59 collections:304] # 4651 files loaded accounting to # 482 labels\n",
180
+ "[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
181
+ "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 6198 items, total duration of 6.29 hours.\n",
182
+ "[NeMo I 2023-09-29 17:44:59 collections:304] # 6198 files loaded accounting to # 1328 labels\n",
183
+ "[NeMo I 2023-09-29 17:44:59 features:289] PADDING: 16\n",
184
+ "[NeMo I 2023-09-29 17:44:59 cloud:58] Found existing object /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n",
185
+ "[NeMo I 2023-09-29 17:44:59 cloud:64] Re-using file from: /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n",
186
+ "[NeMo I 2023-09-29 17:44:59 common:913] Instantiating model from pre-trained checkpoint\n"
187
+ ]
188
+ },
189
+ {
190
+ "name": "stderr",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "[NeMo W 2023-09-29 17:45:00 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
194
+ " Train config : \n",
195
+ " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n",
196
+ " sample_rate: 16000\n",
197
+ " labels: null\n",
198
+ " batch_size: 64\n",
199
+ " shuffle: true\n",
200
+ " is_tarred: false\n",
201
+ " tarred_audio_filepaths: null\n",
202
+ " tarred_shard_strategy: scatter\n",
203
+ " augmentor:\n",
204
+ " noise:\n",
205
+ " manifest_path: /manifests/noise/rir_noise_manifest.json\n",
206
+ " prob: 0.5\n",
207
+ " min_snr_db: 0\n",
208
+ " max_snr_db: 15\n",
209
+ " speed:\n",
210
+ " prob: 0.5\n",
211
+ " sr: 16000\n",
212
+ " resample_type: kaiser_fast\n",
213
+ " min_speed_rate: 0.95\n",
214
+ " max_speed_rate: 1.05\n",
215
+ " num_workers: 15\n",
216
+ " pin_memory: true\n",
217
+ " \n",
218
+ "[NeMo W 2023-09-29 17:45:00 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
219
+ " Validation config : \n",
220
+ " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n",
221
+ " sample_rate: 16000\n",
222
+ " labels: null\n",
223
+ " batch_size: 128\n",
224
+ " shuffle: false\n",
225
+ " num_workers: 15\n",
226
+ " pin_memory: true\n",
227
+ " \n"
228
+ ]
229
+ },
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "[NeMo I 2023-09-29 17:45:00 features:289] PADDING: 16\n",
235
+ "[NeMo I 2023-09-29 17:45:00 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n",
236
+ "[NeMo I 2023-09-29 17:45:00 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n",
237
+ "[NeMo I 2023-09-29 17:45:00 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n",
238
+ "[NeMo I 2023-09-29 17:45:00 modelPT:1156] Make sure that this is what you wanted!\n",
239
+ "[NeMo I 2023-09-29 17:45:01 modelPT:735] Optimizer config = AdamW (\n",
240
+ " Parameter Group 0\n",
241
+ " amsgrad: False\n",
242
+ " betas: (0.9, 0.999)\n",
243
+ " capturable: False\n",
244
+ " eps: 1e-08\n",
245
+ " foreach: None\n",
246
+ " lr: 0.0001\n",
247
+ " maximize: False\n",
248
+ " weight_decay: 0.0002\n",
249
+ " \n",
250
+ " Parameter Group 1\n",
251
+ " amsgrad: False\n",
252
+ " betas: (0.9, 0.999)\n",
253
+ " capturable: False\n",
254
+ " eps: 1e-08\n",
255
+ " foreach: None\n",
256
+ " lr: 0.001\n",
257
+ " maximize: False\n",
258
+ " weight_decay: 0.0002\n",
259
+ " )\n",
260
+ "[NeMo I 2023-09-29 17:45:01 lr_scheduler:910] Scheduler \"<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fe14b339850>\" \n",
261
+ " will be used during training (effective maximum steps = 41560) - \n",
262
+ " Parameters : \n",
263
+ " (warmup_ratio: 0.1\n",
264
+ " min_lr: 0.0\n",
265
+ " max_steps: 41560\n",
266
+ " )\n"
267
+ ]
268
+ },
269
+ {
270
+ "name": "stderr",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "\n",
274
+ " | Name | Type | Params\n",
275
+ "----------------------------------------------------------------------\n",
276
+ "0 | loss | AngularSoftmaxLoss | 0 \n",
277
+ "1 | eval_loss | AngularSoftmaxLoss | 0 \n",
278
+ "2 | _accuracy | TopKClassificationAccuracy | 0 \n",
279
+ "3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n",
280
+ "4 | encoder | ConvASREncoder | 19.4 M\n",
281
+ "5 | decoder | SpeakerDecoder | 3.0 M \n",
282
+ "6 | _macro_accuracy | MulticlassAccuracy | 0 \n",
283
+ "----------------------------------------------------------------------\n",
284
+ "22.4 M Trainable params\n",
285
+ "0 Non-trainable params\n",
286
+ "22.4 M Total params\n",
287
+ "89.509 Total estimated model params size (MB)\n"
288
+ ]
289
+ },
290
+ {
291
+ "data": {
292
+ "application/vnd.jupyter.widget-view+json": {
293
+ "model_id": "",
294
+ "version_major": 2,
295
+ "version_minor": 0
296
+ },
297
+ "text/plain": [
298
+ "Sanity Checking: 0it [00:00, ?it/s]"
299
+ ]
300
+ },
301
+ "metadata": {},
302
+ "output_type": "display_data"
303
+ },
304
+ {
305
+ "name": "stderr",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "[NeMo W 2023-09-29 17:45:01 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
309
+ " rank_zero_warn(\n",
310
+ " \n",
311
+ "[NeMo W 2023-09-29 17:45:22 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
312
+ " rank_zero_warn(\n",
313
+ " \n"
314
+ ]
315
+ },
316
+ {
317
+ "data": {
318
+ "application/vnd.jupyter.widget-view+json": {
319
+ "model_id": "45d1cf72025742e884ba3ff4a6b8e7eb",
320
+ "version_major": 2,
321
+ "version_minor": 0
322
+ },
323
+ "text/plain": [
324
+ "Training: 0it [00:00, ?it/s]"
325
+ ]
326
+ },
327
+ "metadata": {},
328
+ "output_type": "display_data"
329
+ },
330
+ {
331
+ "name": "stderr",
332
+ "output_type": "stream",
333
+ "text": [
334
+ "[NeMo W 2023-09-29 17:45:40 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:212: UserWarning: You called `self.log('global_step', ...)` in your `training_step` but the value needs to be floating point. Converting it to torch.float32.\n",
335
+ " warning_cache.warn(\n",
336
+ " \n"
337
+ ]
338
+ },
339
+ {
340
+ "data": {
341
+ "application/vnd.jupyter.widget-view+json": {
342
+ "model_id": "",
343
+ "version_major": 2,
344
+ "version_minor": 0
345
+ },
346
+ "text/plain": [
347
+ "Validation: 0it [00:00, ?it/s]"
348
+ ]
349
+ },
350
+ "metadata": {},
351
+ "output_type": "display_data"
352
+ },
353
+ {
354
+ "data": {
355
+ "application/vnd.jupyter.widget-view+json": {
356
+ "model_id": "",
357
+ "version_major": 2,
358
+ "version_minor": 0
359
+ },
360
+ "text/plain": [
361
+ "Validation: 0it [00:00, ?it/s]"
362
+ ]
363
+ },
364
+ "metadata": {},
365
+ "output_type": "display_data"
366
+ },
367
+ {
368
+ "data": {
369
+ "application/vnd.jupyter.widget-view+json": {
370
+ "model_id": "",
371
+ "version_major": 2,
372
+ "version_minor": 0
373
+ },
374
+ "text/plain": [
375
+ "Validation: 0it [00:00, ?it/s]"
376
+ ]
377
+ },
378
+ "metadata": {},
379
+ "output_type": "display_data"
380
+ },
381
+ {
382
+ "data": {
383
+ "application/vnd.jupyter.widget-view+json": {
384
+ "model_id": "",
385
+ "version_major": 2,
386
+ "version_minor": 0
387
+ },
388
+ "text/plain": [
389
+ "Validation: 0it [00:00, ?it/s]"
390
+ ]
391
+ },
392
+ "metadata": {},
393
+ "output_type": "display_data"
394
+ },
395
+ {
396
+ "data": {
397
+ "application/vnd.jupyter.widget-view+json": {
398
+ "model_id": "",
399
+ "version_major": 2,
400
+ "version_minor": 0
401
+ },
402
+ "text/plain": [
403
+ "Validation: 0it [00:00, ?it/s]"
404
+ ]
405
+ },
406
+ "metadata": {},
407
+ "output_type": "display_data"
408
+ },
409
+ {
410
+ "data": {
411
+ "application/vnd.jupyter.widget-view+json": {
412
+ "model_id": "",
413
+ "version_major": 2,
414
+ "version_minor": 0
415
+ },
416
+ "text/plain": [
417
+ "Validation: 0it [00:00, ?it/s]"
418
+ ]
419
+ },
420
+ "metadata": {},
421
+ "output_type": "display_data"
422
+ },
423
+ {
424
+ "data": {
425
+ "application/vnd.jupyter.widget-view+json": {
426
+ "model_id": "",
427
+ "version_major": 2,
428
+ "version_minor": 0
429
+ },
430
+ "text/plain": [
431
+ "Validation: 0it [00:00, ?it/s]"
432
+ ]
433
+ },
434
+ "metadata": {},
435
+ "output_type": "display_data"
436
+ },
437
+ {
438
+ "data": {
439
+ "application/vnd.jupyter.widget-view+json": {
440
+ "model_id": "f692ed8064c443afb82ad1e965778fd2",
441
+ "version_major": 2,
442
+ "version_minor": 0
443
+ },
444
+ "text/plain": [
445
+ "Validation: 0it [00:00, ?it/s]"
446
+ ]
447
+ },
448
+ "metadata": {},
449
+ "output_type": "display_data"
450
+ }
451
+ ],
452
+ "source": [
453
+ "# Fine-tune the model with Turkish language\n",
454
+ "\n",
455
+ "import torch\n",
456
+ "import pytorch_lightning as pl\n",
457
+ "import nemo\n",
458
+ "import nemo.collections.asr as nemo_asr\n",
459
+ "from omegaconf import OmegaConf\n",
460
+ "from nemo.utils.exp_manager import exp_manager\n",
461
+ "\n",
462
+ "# Fine-tune the model with Turkish language\n",
463
+ "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n",
464
+ "## set up the trainer\n",
465
+ "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n",
466
+ "\n",
467
+ "tr_trainer_config = OmegaConf.create(dict(\n",
468
+ " devices=1,\n",
469
+ " accelerator=accelerator,\n",
470
+ " #num_sanity_val_steps=0,\n",
471
+ " max_epochs=10,\n",
472
+ " max_steps=-1, # computed at runtime if not set\n",
473
+ " num_nodes=1,\n",
474
+ " \n",
475
+ " accumulate_grad_batches=1,\n",
476
+ " enable_checkpointing=False, # Provided by exp_manager\n",
477
+ " logger=False, # Provided by exp_manager\n",
478
+ " log_every_n_steps=1, # Interval of logging.\n",
479
+ " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n",
480
+ "))\n",
481
+ "print(OmegaConf.to_yaml(tr_trainer_config))\n",
482
+ "\n",
483
+ "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n",
484
+ "\n",
485
+ "\n",
486
+ "# Set up the NeMo experiment manager for logging and monitoring purposes\n",
487
+ "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n",
488
+ "\n",
489
+ "\n",
490
+ "# set up the manifest file for Turkish language\n",
491
+ "tr_config.model.train_ds.manifest_filepath = 'data/tr/train.json'\n",
492
+ "tr_config.model.validation_ds.manifest_filepath = 'data/tr/dev.json'\n",
493
+ "tr_config.model.test_ds.manifest_filepath = 'data/tr/test.json'\n",
494
+ "tr_config.model.decoder.num_classes = train_df['label'].nunique()\n",
495
+ "\n",
496
+ "\n",
497
+ "# set up the model for Turkish language and train the model\n",
498
+ "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n",
499
+ "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n",
500
+ "tr_trainer_finetune.fit(speaker_model)\n",
501
+ "#tr_trainer_finetune.test(speaker_model)\n",
502
+ "\n",
503
+ "# Save the model after fine-tuning with Turkish language\n",
504
+ "\n",
505
+ "speaker_model.save_to('titanet_finetune_tr.nemo')"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": null,
511
+ "metadata": {},
512
+ "outputs": [],
513
+ "source": []
514
+ }
515
+ ],
516
+ "metadata": {
517
+ "accelerator": "GPU",
518
+ "colab": {
519
+ "collapsed_sections": [],
520
+ "name": "Speaker_Recogniton_Verification.ipynb",
521
+ "provenance": [],
522
+ "toc_visible": true
523
+ },
524
+ "kernelspec": {
525
+ "display_name": "transcribe",
526
+ "language": "python",
527
+ "name": "conda-env-.conda-transcribe-py"
528
+ },
529
+ "language_info": {
530
+ "codemirror_mode": {
531
+ "name": "ipython",
532
+ "version": 3
533
+ },
534
+ "file_extension": ".py",
535
+ "mimetype": "text/x-python",
536
+ "name": "python",
537
+ "nbconvert_exporter": "python",
538
+ "pygments_lexer": "ipython3",
539
+ "version": "3.9.16"
540
+ }
541
+ },
542
+ "nbformat": 4,
543
+ "nbformat_minor": 4
544
+ }
titanet_finetune_tr.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79e9798aa0ad30e888db59cc61efe5f506d8936226af4734be63674382d8d6b
3
+ size 90009600