In [1]:
import pandas as pd
import os
# Load the Turkish train/dev/test split tables together with the per-clip
# duration table, then reduce each split to the columns path, duration, label.
tr_duration_df = pd.read_csv('data/tr/clip_durations.tsv', sep='\t')
tr_train_df = pd.read_csv('data/tr/train.tsv', sep='\t')
tr_dev_df = pd.read_csv('data/tr/dev.tsv', sep='\t')
tr_test_df = pd.read_csv('data/tr/test.tsv', sep='\t')


def _attach_clip_duration(split_df, duration_df):
    """Left-join a split table onto the duration table and keep three columns.

    Rows are matched on ``split_df['path'] == duration_df['clip']``. The result
    holds 'path', 'duration' (renamed from 'duration[ms]') and 'label'
    (renamed from 'client_id'). Because the join is a left join, every row of
    ``split_df`` is kept even when no duration entry matches.
    """
    joined = pd.merge(split_df, duration_df, left_on='path', right_on='clip', how='left')
    selected = joined[['path', 'duration[ms]', 'client_id']]
    return selected.rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})


merged_tr_train_df = _attach_clip_duration(tr_train_df, tr_duration_df)
merged_tr_dev_df = _attach_clip_duration(tr_dev_df, tr_duration_df)
merged_tr_test_df = _attach_clip_duration(tr_test_df, tr_duration_df)

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Convert each merged split table into a manifest-style frame with the columns
# audio_filepath, duration and label.
#
# NOTE: this cell overwrites merged_tr_*_df with frames that no longer contain
# the 'path' column, so re-running it without re-running the merge cell first
# raises a KeyError.
TR_CLIPS_DIR = '/User/en_tr_titanet_large/data/tr/clips'


def _to_manifest_frame(split_df, clips_dir=TR_CLIPS_DIR):
    """Return a new frame with 'audio_filepath', 'duration' and 'label'.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame with the columns 'path', 'duration' and 'label'.
    clips_dir : str
        Directory that is prepended to every entry of 'path'.

    Returns
    -------
    pandas.DataFrame
        Same index as ``split_df``. 'audio_filepath' is ``clips_dir`` joined
        with 'path', with the literal text ".mp3" replaced by ".wav";
        'duration' is the input duration divided by 1000; 'label' is copied
        unchanged.
    """
    audio_filepath = (
        split_df['path']
        .apply(lambda clip_name: os.path.join(clips_dir, clip_name))
        # regex=False replaces the literal text ".mp3". When the pattern is
        # treated as a regular expression, "." matches any character, so a
        # name containing e.g. "_mp3" would be rewritten as well.
        .str.replace('.mp3', '.wav', regex=False)
    )
    return pd.DataFrame({
        'audio_filepath': audio_filepath,
        # Vectorised division; the source column comes from 'duration[ms]'.
        'duration': split_df['duration'] / 1000,
        'label': split_df['label'],
    })


merged_tr_train_df = _to_manifest_frame(merged_tr_train_df)
merged_tr_dev_df = _to_manifest_frame(merged_tr_dev_df)
merged_tr_test_df = _to_manifest_frame(merged_tr_test_df)



 merged_tr_train_df["audio_filepath"] = merged_tr_train_df["audio_filepath"].str.replace(".mp3", ".wav")
 merged_tr_dev_df["audio_filepath"] = merged_tr_dev_df["audio_filepath"].str.replace(".mp3", ".wav")
 merged_tr_test_df["audio_filepath"] = merged_tr_test_df["audio_filepath"].str.replace(".mp3", ".wav")


In [3]:
# Pool the three prepared split frames into one table. The original row
# indices of each frame are kept (no ignore_index).
tr_split_frames = [merged_tr_train_df, merged_tr_dev_df, merged_tr_test_df]
all_data = pd.concat(tr_split_frames)

In [4]:
# Re-split the pooled data per label: for each label, the first 80% of its
# (shuffled) rows go to train, the next 10% to dev and the remainder to test.
SPLIT_SEED = 42  # fixed seed so that re-running the cell reproduces the same split

unique_labels = all_data["label"].unique()  # kept for any later cell that reads it

# Shuffle the whole table once with a fixed seed. groupby preserves the order
# of rows within each group, so every label's rows arrive below already in
# random order.
shuffled_data = all_data.sample(frac=1, random_state=SPLIT_SEED)

train_rows = []
dev_rows = []
test_rows = []
# A single groupby pass replaces one boolean mask over the full table per label.
for _, subset in shuffled_data.groupby('label', sort=False):
    subset = subset.reset_index(drop=True)
    n = len(subset)

    # int() truncates, so small groups may contribute nothing to train/dev;
    # whatever is left over always lands in the test slice.
    train_end = int(0.8 * n)
    dev_end = train_end + int(0.1 * n)

    train_rows.append(subset.iloc[:train_end])
    dev_rows.append(subset.iloc[train_end:dev_end])
    test_rows.append(subset.iloc[dev_end:])

# Assemble the three split frames from the per-label slices.
train_df = pd.concat(train_rows, ignore_index=True)
dev_df = pd.concat(dev_rows, ignore_index=True)
test_df = pd.concat(test_rows, ignore_index=True)
# Keep only test rows whose label also occurs in train (a label with a single
# row, for example, ends up in the test slice only).
test_df = test_df[test_df['label'].isin(train_df['label'].unique())]


In [5]:
# Write each split as a JSON-lines manifest (one record per line).
for manifest_path, split_frame in (
    ('data/tr/train.json', train_df),
    ('data/tr/dev.json', dev_df),
    ('data/tr/test.json', test_df),
):
    split_frame.to_json(manifest_path, orient='records', lines=True)


In [None]:
# Imports for fine-tuning the model with Turkish language

import torch
import pytorch_lightning as pl
import nemo
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
from nemo.utils.exp_manager import exp_manager

# Fine-tune the model on the Turkish manifests.
# Step 1: load the fine-tuning configuration.
tr_config = OmegaConf.load("conf/titanet-finetune.yaml")

# Step 2: build the trainer. Use the GPU when torch reports one, else the CPU.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'

tr_trainer_config = OmegaConf.create({
    'devices': 1,
    'accelerator': accelerator,
    # 'num_sanity_val_steps': 0,
    'max_epochs': 10,
    'max_steps': -1,  # computed at runtime if not set
    'num_nodes': 1,
    'accumulate_grad_batches': 1,
    'enable_checkpointing': False,  # provided by exp_manager
    'logger': False,  # provided by exp_manager
    'log_every_n_steps': 1,  # logging interval
    'val_check_interval': 1.0,  # 0.25 would check 4 times per epoch; an int means a number of iterations
})
print(OmegaConf.to_yaml(tr_trainer_config))

tr_trainer_finetune = pl.Trainer(**tr_trainer_config)

# Step 3: hand the trainer to the NeMo experiment manager for logging and monitoring.
exp_manager_cfg = tr_config.get("exp_manager", None)
log_dir_finetune = exp_manager(tr_trainer_finetune, exp_manager_cfg)

# Step 4: point the model config at the Turkish manifests and size the
# decoder to the number of distinct labels in the training frame.
tr_config.model.train_ds.manifest_filepath = 'data/tr/train.json'
tr_config.model.validation_ds.manifest_filepath = 'data/tr/dev.json'
tr_config.model.test_ds.manifest_filepath = 'data/tr/test.json'
tr_config.model.decoder.num_classes = train_df['label'].nunique()

# Step 5: build the speaker-label model, initialise it from the pretrained
# checkpoint referenced by the config, and train it.
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(
    cfg=tr_config.model,
    trainer=tr_trainer_finetune,
)
speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)
tr_trainer_finetune.fit(speaker_model)
#tr_trainer_finetune.test(speaker_model)

# Step 6: save the fine-tuned model.
speaker_model.save_to('titanet_finetune_tr.nemo')

devices: 1
accelerator: cpu
max_epochs: 10
max_steps: -1
num_nodes: 1
accumulate_grad_batches: 1
enable_checkpointing: false
logger: false
log_every_n_steps: 1
val_check_interval: 1.0



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


[NeMo I 2023-09-29 17:44:57 exp_manager:381] Experiments will be logged at /v3io/users/User/en_tr_titanet_large/tb/TitaNet-Finetune/2023-09-29_17-44-57
[NeMo I 2023-09-29 17:44:57 exp_manager:815] TensorboardLogger has been set up
[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.
[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.
[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels


[NeMo W 2023-09-29 17:44:58 label_models:187] Total number of 1328 found in all the manifest files.


[NeMo I 2023-09-29 17:44:58 collections:301] Filtered duration for loading collection is 0.00 hours.
[NeMo I 2023-09-29 17:44:58 collections:302] Dataset loaded with 41559 items, total duration of 41.01 hours.
[NeMo I 2023-09-29 17:44:58 collections:304] # 41559 files loaded accounting to # 1328 labels
[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.
[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 4651 items, total duration of 4.47 hours.
[NeMo I 2023-09-29 17:44:59 collections:304] # 4651 files loaded accounting to # 482 labels
[NeMo I 2023-09-29 17:44:59 collections:301] Filtered duration for loading collection is 0.00 hours.
[NeMo I 2023-09-29 17:44:59 collections:302] Dataset loaded with 6198 items, total duration of 6.29 hours.
[NeMo I 2023-09-29 17:44:59 collections:304] # 6198 files loaded accounting to # 1328 labels
[NeMo I 2023-09-29 17:44:59 features:289] PADDING: 16
[NeMo I 2023-09-29 17:44:59 cloud:58] Fou

[NeMo W 2023-09-29 17:45:00 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
 Train config : 
 manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
 sample_rate: 16000
 labels: null
 batch_size: 64
 shuffle: true
 is_tarred: false
 tarred_audio_filepaths: null
 tarred_shard_strategy: scatter
 augmentor:
 noise:
 manifest_path: /manifests/noise/rir_noise_manifest.json
 prob: 0.5
 min_snr_db: 0
 max_snr_db: 15
 speed:
 prob: 0.5
 sr: 16000
 resample_type: kaiser_fast
 min_speed_rate: 0.95
 max_speed_rate: 1.05
 num_workers: 15
 pin_memory: true
 
[NeMo W 2023-09-29 17:45:00 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
 Validation config : 
 manifest_

[NeMo I 2023-09-29 17:45:00 features:289] PADDING: 16
[NeMo I 2023-09-29 17:45:00 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2023-09-29 17:45:00 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`
[NeMo I 2023-09-29 17:45:00 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']
[NeMo I 2023-09-29 17:45:00 modelPT:1156] Make sure that this is what you wanted!
[NeMo I 2023-09-29 17:45:01 modelPT:735] Optimizer config = AdamW (
 Parameter Group 0
 amsgrad: False
 betas: (0.9, 0.999)
 capturable: False
 eps: 1e-08
 foreach: None
 lr: 0.0001
 maximize: False
 weight_decay: 0.0002
 
 Parameter Group 1
 amsgrad: False
 betas: (0.9, 0.999)
 capturable: False
 eps: 1e-08
 foreach: None
 lr: 0.001
 maximiz


 | Name | Type | Params
----------------------------------------------------------------------
0 | loss | AngularSoftmaxLoss | 0 
1 | eval_loss | AngularSoftmaxLoss | 0 
2 | _accuracy | TopKClassificationAccuracy | 0 
3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 
4 | encoder | ConvASREncoder | 19.4 M
5 | decoder | SpeakerDecoder | 3.0 M 
6 | _macro_accuracy | MulticlassAccuracy | 0 
----------------------------------------------------------------------
22.4 M Trainable params
0 Non-trainable params
22.4 M Total params
89.509 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

 rank_zero_warn(
 
 rank_zero_warn(
 


Training: 0it [00:00, ?it/s]

 


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]