{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "vnrUh3vuDSRN" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "# prepare the train, dev, test dataset for Turkish language\n", "tr_duration_df = pd.read_csv('data/tr/clip_durations.tsv', sep='\\t')\n", "tr_train_df = pd.read_csv('data/tr/train.tsv', sep='\\t')\n", "tr_dev_df = pd.read_csv('data/tr/dev.tsv', sep='\\t')\n", "tr_test_df = pd.read_csv('data/tr/test.tsv', sep='\\t')\n", "\n", "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":5: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":6: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":7: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n" ] } ], "source": [ "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n", "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n", "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join('/User/en_tr_titanet_large/data/tr/clips', x))\n", "\n", "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "\n", "merged_tr_train_df['duration'] = merged_tr_train_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_test_df['duration'] = merged_tr_test_df['duration'].apply(lambda x: x / 1000)\n", "\n", "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]\n", "\n" ] }, { "cell_type": "code", 
"execution_count": 3, "metadata": {}, "outputs": [], "source": [ "all_data = pd.concat([merged_tr_train_df, merged_tr_dev_df, merged_tr_test_df])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "unique_labels = all_data[\"label\"].unique()\n", "train_rows = []\n", "dev_rows = []\n", "test_rows = []\n", "for val in unique_labels:\n", " subset = all_data[all_data['label'] == val].sample(frac=1).reset_index(drop=True) # Shuffle rows for the value\n", " n = len(subset)\n", " \n", " train_end = int(0.8 * n)\n", " dev_end = train_end + int(0.1 * n)\n", " \n", " train_rows.append(subset.iloc[:train_end])\n", " dev_rows.append(subset.iloc[train_end:dev_end])\n", " test_rows.append(subset.iloc[dev_end:])\n", " \n", "# Create the train_df first\n", "train_df = pd.concat(train_rows, ignore_index=True)\n", "dev_df = pd.concat(dev_rows, ignore_index=True)\n", "test_df = pd.concat(test_rows, ignore_index=True)\n", "test_df = test_df[test_df['label'].isin(train_df['label'].unique())]\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "train_df.to_json('data/tr/train.json', orient='records', lines=True)\n", "dev_df.to_json('data/tr/dev.json', orient='records', lines=True)\n", "test_df.to_json('data/tr/test.json', orient='records', lines=True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "devices: 1\n", "accelerator: cpu\n", "max_epochs: 10\n", "max_steps: -1\n", "num_nodes: 1\n", "accumulate_grad_batches: 1\n", "enable_checkpointing: false\n", "logger: false\n", "log_every_n_steps: 1\n", "val_check_interval: 1.0\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "GPU available: False, used: False\n", "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-29 17:44:57 exp_manager:381] Experiments will be logged at /v3io/users/User/en_tr_titanet_large/tb/TitaNet-Finetune/2023-09-29_17-44-57\n", "[NeMo I 2023-09-29 17:44:57 exp_manager:815] TensorboardLogger has been set up\n", "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.\n", "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-29 17:44:58 label_models:187] Total number of 1328 found in all the manifest files.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.\n", "[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels\n", "[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 4651 items, total duration of 4.47 hours.\n", "[NeMo I 2023-09-29 17:44:59 collections:304] # 4651 files loaded accounting to # 482 labels\n", "[NeMo I 2023-09-29 
17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 6198 items, total duration of 6.29 hours.\n", "[NeMo I 2023-09-29 17:44:59 collections:304] # 6198 files loaded accounting to # 1328 labels\n", "[NeMo I 2023-09-29 17:44:59 features:289] PADDING: 16\n", "[NeMo I 2023-09-29 17:44:59 cloud:58] Found existing object /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n", "[NeMo I 2023-09-29 17:44:59 cloud:64] Re-using file from: /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n", "[NeMo I 2023-09-29 17:44:59 common:913] Instantiating model from pre-trained checkpoint\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-29 17:45:00 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", " Train config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 64\n", " shuffle: true\n", " is_tarred: false\n", " tarred_audio_filepaths: null\n", " tarred_shard_strategy: scatter\n", " augmentor:\n", " noise:\n", " manifest_path: /manifests/noise/rir_noise_manifest.json\n", " prob: 0.5\n", " min_snr_db: 0\n", " max_snr_db: 15\n", " speed:\n", " prob: 0.5\n", " sr: 16000\n", " resample_type: kaiser_fast\n", " min_speed_rate: 0.95\n", " max_speed_rate: 1.05\n", " num_workers: 15\n", " pin_memory: true\n", " \n", "[NeMo W 2023-09-29 17:45:00 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", " Validation config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 128\n", " shuffle: false\n", " num_workers: 15\n", " pin_memory: true\n", " \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-29 17:45:00 features:289] PADDING: 16\n", "[NeMo I 2023-09-29 17:45:00 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n", "[NeMo I 2023-09-29 17:45:00 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n", "[NeMo I 2023-09-29 17:45:00 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n", "[NeMo I 2023-09-29 17:45:00 modelPT:1156] Make sure that this is what you wanted!\n", "[NeMo I 2023-09-29 17:45:01 modelPT:735] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.0001\n", " maximize: False\n", " weight_decay: 0.0002\n", " \n", " Parameter Group 1\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.001\n", " maximize: False\n", " weight_decay: 0.0002\n", " )\n", "[NeMo I 2023-09-29 17:45:01 lr_scheduler:910] Scheduler \"\" \n", " will be used during training (effective maximum steps = 41560) - \n", " Parameters : \n", " (warmup_ratio: 0.1\n", " min_lr: 0.0\n", " max_steps: 41560\n", " )\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " | Name | Type | Params\n", "----------------------------------------------------------------------\n", "0 | loss | AngularSoftmaxLoss | 0 \n", "1 | eval_loss | AngularSoftmaxLoss | 0 \n", "2 | _accuracy | TopKClassificationAccuracy | 0 \n", "3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n", "4 | encoder | ConvASREncoder | 19.4 M\n", "5 | decoder | SpeakerDecoder | 3.0 M \n", "6 | _macro_accuracy | MulticlassAccuracy | 0 \n", "----------------------------------------------------------------------\n", "22.4 M Trainable params\n", "0 Non-trainable params\n", "22.4 M Total params\n", "89.509 Total estimated model params size (MB)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Sanity Checking: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-29 17:45:01 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", " rank_zero_warn(\n", " \n", "[NeMo W 2023-09-29 17:45:22 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. 
Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", " rank_zero_warn(\n", " \n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "45d1cf72025742e884ba3ff4a6b8e7eb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Training: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-29 17:45:40 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:212: UserWarning: You called `self.log('global_step', ...)` in your `training_step` but the value needs to be floating point. Converting it to torch.float32.\n", " warning_cache.warn(\n", " \n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f692ed8064c443afb82ad1e965778fd2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Validation: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Fine-tune the pretrained TitaNet-Large speaker model on the Turkish data\n", "\n", "import torch\n", "import pytorch_lightning as pl\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from omegaconf import OmegaConf\n", "from nemo.utils.exp_manager import exp_manager\n", "\n", "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "\n", "# Set up the trainer\n", "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "\n", "tr_trainer_config = OmegaConf.create(dict(\n", "    devices=1,\n", "    accelerator=accelerator,\n", "    max_epochs=10,\n", "    max_steps=-1,  # computed at runtime if not set\n", "    num_nodes=1,\n", "    accumulate_grad_batches=1,\n", "    enable_checkpointing=False,  # provided by exp_manager\n", "    logger=False,  # provided by exp_manager\n", "    log_every_n_steps=1,  # Interval of 
logging.\n", "    val_check_interval=1.0,  # set to 0.25 to validate four times per epoch, or an int for a step interval\n", "))\n", "print(OmegaConf.to_yaml(tr_trainer_config))\n", "\n", "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n", "\n", "# Set up the NeMo experiment manager for logging and checkpointing\n", "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n", "\n", "# Point the config at the Turkish manifests and size the decoder head to the\n", "# number of speakers in the training split\n", "tr_config.model.train_ds.manifest_filepath = 'data/tr/train.json'\n", "tr_config.model.validation_ds.manifest_filepath = 'data/tr/dev.json'\n", "tr_config.model.test_ds.manifest_filepath = 'data/tr/test.json'\n", "tr_config.model.decoder.num_classes = train_df['label'].nunique()\n", "\n", "# Build the model, restore the pretrained weights (all but the final\n", "# classification layer), and fine-tune\n", "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n", "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n", "tr_trainer_finetune.fit(speaker_model)\n", "# Optionally evaluate on the test manifest:\n", "# tr_trainer_finetune.test(speaker_model)\n", "\n", "# Save the fine-tuned model\n", "speaker_model.save_to('titanet_finetune_tr.nemo')" ] },
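{ "cell_type": "markdown", "metadata": {}, "source": [ "A short usage sketch: restore the fine-tuned checkpoint and compare two clips. This assumes NeMo's `EncDecSpeakerLabelModel.get_embedding` and `verify_speakers` helpers; the two clip filenames are hypothetical placeholders." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Restore the fine-tuned checkpoint.\n", "restored_model = nemo_asr.models.EncDecSpeakerLabelModel.restore_from('titanet_finetune_tr.nemo')\n", "\n", "# Speaker embedding for a single (hypothetical) clip.\n", "emb = restored_model.get_embedding('data/tr/clips/sample_a.wav')\n", "print(emb.shape)\n", "\n", "# Cosine-similarity verification: do two clips come from the same speaker?\n", "print(restored_model.verify_speakers('data/tr/clips/sample_a.wav', 'data/tr/clips/sample_b.wav'))" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "Speaker_Recogniton_Verification.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "transcribe", "language": "python", "name": "conda-env-.conda-transcribe-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 4 }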