diff --git a/convtasnet1/CKPT.yaml b/convtasnet1/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..beda85d565a9a3004bb30d20f697ccade805baa9 --- /dev/null +++ b/convtasnet1/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -0.42996728845946536 +unixtime: 1631299321.582555 diff --git a/convtasnet1/brain.ckpt b/convtasnet1/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/convtasnet1/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/convtasnet1/counter.ckpt b/convtasnet1/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..55180c0973199684271f33f576cee2c35cbab893 --- /dev/null +++ b/convtasnet1/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b +size 1 diff --git a/convtasnet1/decoder.ckpt b/convtasnet1/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5c33e337a2fe38cebc6c4f34c1ffcb31915ab216 --- /dev/null +++ b/convtasnet1/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623030a5a4317eabc555ec09254d6a05e5f3811933c429f90f06f903a22b808c +size 17272 diff --git a/convtasnet1/encoder.ckpt b/convtasnet1/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c561f29d4f6404d181b4521850bfad7c30f51848 --- /dev/null +++ b/convtasnet1/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4314241647c3b6adece727c3ff8066f674c516ad0a531efaa0bbd89eb4786050 +size 17272 diff --git a/convtasnet1/hyperparams.yaml b/convtasnet1/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1a795efcbe8b46ea6e8a91107da25e8b8eb396c --- /dev/null +++ b/convtasnet1/hyperparams.yaml @@ -0,0 +1,168 @@ +# Generated 2021-09-15 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/convtasnet-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. 
/yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: convtasnet-whamr +output_folder: results/convtasnet-whamr/3 +train_log: results/convtasnet-whamr/3/train_log.txt +save_folder: results/convtasnet-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/convtasnet-whamr/3/save/whamr_tr.csv +valid_data: results/convtasnet-whamr/3/save/whamr_cv.csv +test_data: results/convtasnet-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: false # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. +rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id001 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +MaskNet: &id003 !new:speechbrain.lobes.models.conv_tasnet.MaskNet + + N: 256 + B: 256 + H: 512 + P: 3 + X: 6 + R: 4 + C: 2 + norm_type: gLN + causal: false + mask_nonlinear: relu + +Decoder: &id002 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id005 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id004 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id001 + decoder: *id002 + masknet: *id003 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/convtasnet-whamr/3/save + recoverables: + encoder: *id001 + decoder: *id002 + masknet: *id003 + counter: *id004 + lr_scheduler: *id005 +train_logger: 
!new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/convtasnet-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/convtasnet1/lr_scheduler.ckpt b/convtasnet1/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6da39f304442c42dc64cc537deb988c7bd5593ff --- /dev/null +++ b/convtasnet1/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b633ee5d0a19696bcf1025be87e8e7ec9b783ad2b9adfa2077e057d18accaea6 +size 495 diff --git a/convtasnet1/masknet.ckpt b/convtasnet1/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..94c351d88cdb634787e5e905c16003d5356b0027 --- /dev/null +++ b/convtasnet1/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8606649b45db4841e79ef35ec71bb7e0f79b1e5101fe690ef083dbb7c0c21c +size 26404523 diff --git a/convtasnet1/optimizer.ckpt b/convtasnet1/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..de2c211f7b6599d45fa00eaa27a786fe6d4adda5 --- /dev/null +++ b/convtasnet1/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec7c786af8ea382d342b793fa3518fe337f72e114077b4755cfc0c521b74754 +size 52803531 diff --git a/convtasnet2/CKPT.yaml b/convtasnet2/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a141b6f84574e625ae6eae42bd30432ef379e8f --- /dev/null +++ b/convtasnet2/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -4.564481391887362 +unixtime: 1631590081.3795788 diff --git a/convtasnet2/brain.ckpt b/convtasnet2/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/convtasnet2/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/convtasnet2/counter.ckpt b/convtasnet2/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..bbeebd177e3dfb198e078dc888c07b74d74fd6f9 --- /dev/null +++ b/convtasnet2/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:031b4af5197ec30a926f48cf40e11a7dbc470048a21e4003b7a3c07c5dab1baa +size 2 diff --git a/convtasnet2/decoder.ckpt b/convtasnet2/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8f682340191b0733b6473c78dd44768cb801b446 --- /dev/null +++ b/convtasnet2/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:680ae89a64815fb9ea7a9bf2e19682841772253084ba5db0fe30db96a62e16f1 +size 17272 diff --git a/convtasnet2/encoder.ckpt b/convtasnet2/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f6b70bf48e35d0e25cd6c46169f2e5893399865e --- /dev/null +++ b/convtasnet2/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5830174e7975d20ec1a82e6eb90521ad2359f58d69245864c6299d7fe89b94e3 +size 17272 diff --git a/convtasnet2/hyperparams.yaml b/convtasnet2/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1a795efcbe8b46ea6e8a91107da25e8b8eb396c --- /dev/null +++ b/convtasnet2/hyperparams.yaml @@ -0,0 +1,168 @@ +# Generated 2021-09-15 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/convtasnet-whamr.yaml +# yamllint 
disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. /yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: convtasnet-whamr +output_folder: results/convtasnet-whamr/3 +train_log: results/convtasnet-whamr/3/train_log.txt +save_folder: results/convtasnet-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/convtasnet-whamr/3/save/whamr_tr.csv +valid_data: results/convtasnet-whamr/3/save/whamr_cv.csv +test_data: results/convtasnet-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: false # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. 
+rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id001 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +MaskNet: &id003 !new:speechbrain.lobes.models.conv_tasnet.MaskNet + + N: 256 + B: 256 + H: 512 + P: 3 + X: 6 + R: 4 + C: 2 + norm_type: gLN + causal: false + mask_nonlinear: relu + +Decoder: &id002 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id005 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id004 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id001 + decoder: *id002 + masknet: *id003 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/convtasnet-whamr/3/save + recoverables: + encoder: *id001 + decoder: *id002 + masknet: *id003 + counter: *id004 + lr_scheduler: *id005 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/convtasnet-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/convtasnet2/lr_scheduler.ckpt b/convtasnet2/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c2087e3793c48d8bb8a8c15d87570eb5d30052f8 --- /dev/null +++ b/convtasnet2/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7efd3c42e7652fa5529e1d01e70d8faebd9d5b3939d0d94c719990ab12e5318f +size 943 diff --git a/convtasnet2/masknet.ckpt b/convtasnet2/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..0ac7e1febc268fb13c2673dba6ceefcb04871006 --- /dev/null +++ b/convtasnet2/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43214e59d3f5b23936a9be023790a4920806186c2e814dc8f4dea13f3a5a2df +size 26404523 diff --git a/convtasnet2/optimizer.ckpt b/convtasnet2/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e0953aa18cc84fbf878314076e2408ecb8f154b4 --- /dev/null +++ b/convtasnet2/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e64b2e266108cd74b2ebd7a5039b1d6e39ca53ae03eab20e8dc123402e6c8e +size 52803979 diff --git a/convtasnet3/CKPT.yaml b/convtasnet3/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09f1dc0b5a55c7c29ffcfe1ce38a56b06733205a --- /dev/null +++ b/convtasnet3/CKPT.yaml @@ -0,0 +1,4 @@ +# 
yamllint disable +end-of-epoch: true +si-snr: -5.491251103860138 +unixtime: 1631850893.625292 diff --git a/convtasnet3/brain.ckpt b/convtasnet3/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/convtasnet3/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/convtasnet3/counter.ckpt b/convtasnet3/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..67cabed3774bb68b3407c628a03cf2dd04e6182d --- /dev/null +++ b/convtasnet3/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16dc368a89b428b2485484313ba67a3912ca03f2b2b42429174a4f8b3dc84e44 +size 3 diff --git a/convtasnet3/decoder.ckpt b/convtasnet3/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..67f2287844893ccfad9393ab28f30e71c2a93c5d --- /dev/null +++ b/convtasnet3/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b76309d4da7851d953ad333fe2214c5f52625f09bd786c546f4152789c7710 +size 17272 diff --git a/convtasnet3/encoder.ckpt b/convtasnet3/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..388124303371ba9237bb701b03fb17ef47c0a4ec --- /dev/null +++ b/convtasnet3/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9dcf0cc01b62e9bf3545eeccbc34547c718f6ae626d6f91bb6d8540ce418e15 +size 17272 diff --git a/convtasnet3/hyperparams.yaml b/convtasnet3/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1a795efcbe8b46ea6e8a91107da25e8b8eb396c --- /dev/null +++ b/convtasnet3/hyperparams.yaml @@ -0,0 +1,168 @@ +# Generated 2021-09-15 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/convtasnet-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. 
/yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: convtasnet-whamr +output_folder: results/convtasnet-whamr/3 +train_log: results/convtasnet-whamr/3/train_log.txt +save_folder: results/convtasnet-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/convtasnet-whamr/3/save/whamr_tr.csv +valid_data: results/convtasnet-whamr/3/save/whamr_cv.csv +test_data: results/convtasnet-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: false # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. +rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id001 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +MaskNet: &id003 !new:speechbrain.lobes.models.conv_tasnet.MaskNet + + N: 256 + B: 256 + H: 512 + P: 3 + X: 6 + R: 4 + C: 2 + norm_type: gLN + causal: false + mask_nonlinear: relu + +Decoder: &id002 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id005 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id004 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id001 + decoder: *id002 + masknet: *id003 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/convtasnet-whamr/3/save + recoverables: + encoder: *id001 + decoder: *id002 + masknet: *id003 + counter: *id004 + lr_scheduler: *id005 +train_logger: 
!new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/convtasnet-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/convtasnet3/lr_scheduler.ckpt b/convtasnet3/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6a10f4fd4fcdb7880947a9ae3578f6f3e9f43a0a --- /dev/null +++ b/convtasnet3/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2047f7f167981a8323fedc60c915590115468bd222ce33c19871f495b811f2 +size 1391 diff --git a/convtasnet3/masknet.ckpt b/convtasnet3/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..54cf467432be9a4b32893b6d8f87a3986778666a --- /dev/null +++ b/convtasnet3/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14d10e6f2e7d4eb7770f3938fac171dfb86d46bce7082fbfee400d95dfe5cc5 +size 26404523 diff --git a/convtasnet3/optimizer.ckpt b/convtasnet3/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..bf1e66bc0a5c20557a80080ed0d11b4f134a75fe --- /dev/null +++ b/convtasnet3/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f618f7f980ba67fb00fe0fb3e13dafb0e9ec5111ffb945e94c53fdc7c7aac3 +size 52803979 diff --git a/dprnn1/CKPT.yaml b/dprnn1/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bb10376d7b7f6a30bdfba9cbeb244fdb176b37b --- /dev/null +++ b/dprnn1/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -0.5297146898724144 +unixtime: 1631292299.804519 diff --git a/dprnn1/brain.ckpt b/dprnn1/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/dprnn1/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/dprnn1/counter.ckpt b/dprnn1/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..55180c0973199684271f33f576cee2c35cbab893 --- /dev/null +++ b/dprnn1/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b +size 1 diff --git a/dprnn1/dataloader-TRAIN.ckpt b/dprnn1/dataloader-TRAIN.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b5a49049e5ab6ce5335145f010e83e445d0d95c8 --- /dev/null +++ b/dprnn1/dataloader-TRAIN.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876c9b16254e157d1eb645390dcfae6f29b9d3cd394e73a91de8ee5d0e67ee43 +size 5 diff --git a/dprnn1/decoder.ckpt b/dprnn1/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..15e11348a741d11281b4f318a3ebfc752a45fbc5 --- /dev/null +++ b/dprnn1/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2e73c51dc6b7bdcc3a546af3e2f682dc4d3f7357d1497b5e545401a3cb517b1 +size 17272 diff --git a/dprnn1/encoder.ckpt b/dprnn1/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1d2729e51ed45cac32b10fe1e9e06fbfe307924e --- /dev/null +++ b/dprnn1/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf0dbef1da288e831b4facdfa428de1c3bac0dfefe6c08a127c6e09c60d148b +size 17272 diff --git a/dprnn1/hyperparams.yaml b/dprnn1/hyperparams.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..554c99727af1a87a08277df99b6697cda6509304 --- /dev/null +++ b/dprnn1/hyperparams.yaml @@ -0,0 +1,183 @@ +# Generated 2021-09-18 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/dprnn-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr/ + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. /yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: dprnn-whamr +output_folder: results/dprnn-whamr/3 +train_log: results/dprnn-whamr/3/train_log.txt +save_folder: results/dprnn-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/dprnn-whamr/3/save/whamr_tr.csv +valid_data: results/dprnn-whamr/3/save/whamr_cv.csv +test_data: results/dprnn-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. 
+rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +intra: &id001 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +inter: &id002 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 6 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: true + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/dprnn-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/dprnn-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/dprnn1/lr_scheduler.ckpt b/dprnn1/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..952685d5c0fc6ea1942cc80ebc3e94d3a732f3cd --- /dev/null +++ b/dprnn1/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e124452ee7deeb68cdaa636fbcb85a7654dfda37ac2b38ccebf02d5604de85dc +size 495 diff --git a/dprnn1/masknet.ckpt b/dprnn1/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..087d185690590a14d572b1fc0dbb797180eb844f --- /dev/null +++ b/dprnn1/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:427819af9b78e9700010f6cbe84af6c33cb5b229762018c76c3ba9117c4f8761 +size 58477294 diff --git a/dprnn1/optimizer.ckpt b/dprnn1/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e3a8b4f7ea3809afcdb1ea4a10efad5e2101921f --- /dev/null +++ b/dprnn1/optimizer.ckpt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:52cdd192d0a27ca7a9a065505c30cb0aa025abf4afdc88fbcf006249929514e7 +size 117035321 diff --git a/dprnn2/CKPT.yaml b/dprnn2/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ae4133ead2f66fcf2f1b8b2f6aae423f494a3fd --- /dev/null +++ b/dprnn2/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -5.874772780568888 +unixtime: 1631748976.9005034 diff --git a/dprnn2/brain.ckpt b/dprnn2/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/dprnn2/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/dprnn2/counter.ckpt b/dprnn2/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..976c94291e60762cab92040bfc74983e027b8317 --- /dev/null +++ b/dprnn2/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fca346db656187102ce806ac732e06a62df0dbb2829e511a770556d398e1a6e +size 2 diff --git a/dprnn2/decoder.ckpt b/dprnn2/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2ce1e3a023f950622744bbc99ef0707079690415 --- /dev/null +++ b/dprnn2/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74fac8e4c0330aca71afef77553f5da11fa5a89bff026e9fe8e5fd529ddc3b9a +size 17272 diff --git a/dprnn2/encoder.ckpt b/dprnn2/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8d7224bbf949af77640523c6a1813b26b699cf87 --- /dev/null +++ b/dprnn2/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef9291a6d0d0a74955bf539c81d36a35bdc6900306d5d1c69dca57b0b315e4c +size 17272 diff --git a/dprnn2/hyperparams.yaml b/dprnn2/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..554c99727af1a87a08277df99b6697cda6509304 --- /dev/null +++ b/dprnn2/hyperparams.yaml @@ -0,0 +1,183 @@ +# Generated 2021-09-18 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/dprnn-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr/ + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. 
/yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: dprnn-whamr +output_folder: results/dprnn-whamr/3 +train_log: results/dprnn-whamr/3/train_log.txt +save_folder: results/dprnn-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/dprnn-whamr/3/save/whamr_tr.csv +valid_data: results/dprnn-whamr/3/save/whamr_cv.csv +test_data: results/dprnn-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. +rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +intra: &id001 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +inter: &id002 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 6 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: true + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 
!new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/dprnn-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/dprnn-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/dprnn2/lr_scheduler.ckpt b/dprnn2/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f13d0bc3ba94ddc30a4dc544639231a47916330e --- /dev/null +++ b/dprnn2/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3b7d061e814f1fa9b59ec858d9f1c346671ce3d81c15b3a1a176a371b52037 +size 943 diff --git a/dprnn2/masknet.ckpt b/dprnn2/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..20a4e6b6b1533832d4b0529a3e1fcae9c7da5c22 --- /dev/null +++ b/dprnn2/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dbfce012dc85b68b3cc9f8e1d10b1fc63ef54248b34d562f61bb8c0f7a23030 +size 58477294 diff --git a/dprnn2/optimizer.ckpt b/dprnn2/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..0dba110fb0e02b66d3507d5edfe2dec8dce6fbd6 --- /dev/null +++ b/dprnn2/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7323ab9ed7747b6bb0dd116f2e7f6271d1f6f9d28971644210e5016fa7ee033 +size 117035641 diff --git a/dprnn3/CKPT.yaml b/dprnn3/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ab0460a2c90e166215bb065083d7527e6008da5 --- /dev/null +++ b/dprnn3/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -7.044497849920388 +unixtime: 1632151547.7435942 diff --git a/dprnn3/brain.ckpt b/dprnn3/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/dprnn3/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/dprnn3/counter.ckpt b/dprnn3/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..99a3f28056db1018b1ea94f8cccc4c50b283b4a0 --- /dev/null +++ b/dprnn3/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3346f2bbf6c34bd2dbe28bd1bb657d0e9c37392a1d5ec9929e6a5df4763ddc2d +size 3 diff --git a/dprnn3/decoder.ckpt b/dprnn3/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9453c2f520d25653ed0cc4e7204f44fdeef2b672 --- /dev/null +++ b/dprnn3/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d26e5412c04d918938a2b5a0e310d814fcf1451c3c7e9dda9f52b549f553f64 +size 17272 diff --git a/dprnn3/encoder.ckpt b/dprnn3/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c8561fa74308dec8934a5e328d7421112d1ae0da --- /dev/null +++ b/dprnn3/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eaa9585ba136ef4aa2ea7a4654435da3949856e36b8ff095809193217614bcb +size 17272 diff --git a/dprnn3/hyperparams.yaml b/dprnn3/hyperparams.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..554c99727af1a87a08277df99b6697cda6509304 --- /dev/null +++ b/dprnn3/hyperparams.yaml @@ -0,0 +1,183 @@ +# Generated 2021-09-18 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/dprnn-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr/ + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. /yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: dprnn-whamr +output_folder: results/dprnn-whamr/3 +train_log: results/dprnn-whamr/3/train_log.txt +save_folder: results/dprnn-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/dprnn-whamr/3/save/whamr_tr.csv +valid_data: results/dprnn-whamr/3/save/whamr_cv.csv +test_data: results/dprnn-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. 
+rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +intra: &id001 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +inter: &id002 !new:speechbrain.lobes.models.dual_path.SBRNNBlock + num_layers: 1 + input_size: 256 + hidden_channels: 256 + dropout: 0 + bidirectional: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 6 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: true + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + + + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/dprnn-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/dprnn-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/dprnn3/lr_scheduler.ckpt b/dprnn3/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..a66f2c825795129719733c6911c74751f8044dbd --- /dev/null +++ b/dprnn3/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3018863346f17778814927d610f9a7b054c125d86860441532b5383744896a0 +size 1455 diff --git a/dprnn3/masknet.ckpt b/dprnn3/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..45c48b89952795d6597d3c91c5cdcabcaf045644 --- /dev/null +++ b/dprnn3/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67c82bfea757c580fa2dbffd4df4687a8054652d63857cba920b15d7056a8b4a +size 58477294 diff --git a/dprnn3/optimizer.ckpt b/dprnn3/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..80a22ee6508e278a9301a6c635ad5e162f16528d --- /dev/null +++ b/dprnn3/optimizer.ckpt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e71986eba3cb909993f03ebf556f67bda4d21379b68275e67336ce910c7cf9e5 +size 117035641 diff --git a/sepformer1/CKPT.yaml b/sepformer1/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ad24ec3835562dd451d2ac194037384f0161dad --- /dev/null +++ b/sepformer1/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: 1.3540320373494246 +unixtime: 1631310072.653676 diff --git a/sepformer1/brain.ckpt b/sepformer1/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/sepformer1/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/sepformer1/counter.ckpt b/sepformer1/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..55180c0973199684271f33f576cee2c35cbab893 --- /dev/null +++ b/sepformer1/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b +size 1 diff --git a/sepformer1/decoder.ckpt b/sepformer1/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c74c97c52133754a655a256605b554318b20f393 --- /dev/null +++ b/sepformer1/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:308586da02de3c86af1a597eabf7613d96c12d558bc1ef82274a286ab18281c2 +size 17272 diff --git a/sepformer1/encoder.ckpt b/sepformer1/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8cead37c5a519fcdf95551b2bcbc2ac4ef42ff1f --- /dev/null +++ b/sepformer1/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba8d001c2b3fbd7cd034271448c0b51e6e7e4f33e9c0f6fc2fe80414ce9c42c +size 17272 diff --git a/sepformer1/hyperparams.yaml b/sepformer1/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e743acb34ec628cadb2e97686a1ab3f2a791f300 --- /dev/null +++ b/sepformer1/hyperparams.yaml @@ -0,0 +1,184 @@ +# Generated 2021-09-23 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. 
/yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: sepformer-whamr +output_folder: results/sepformer-whamr/3 +train_log: results/sepformer-whamr/3/train_log.txt +save_folder: results/sepformer-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/sepformer-whamr/3/save/whamr_tr.csv +valid_data: results/sepformer-whamr/3/save/whamr_cv.csv +test_data: results/sepformer-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. +rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 2 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: false + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + 
factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/sepformer-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/sepformer-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref + masknet: !ref + decoder: !ref diff --git a/sepformer1/lr_scheduler.ckpt b/sepformer1/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..ab011147f50325c5f55dca56a9ba3fcabc785662 --- /dev/null +++ b/sepformer1/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83fbdeb1cc11648e00f8885d3a41f36bf6c24d15b24e138f46af357c62a9dfc0 +size 495 diff --git a/sepformer1/masknet.ckpt b/sepformer1/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..3a7eb46e41ed849008f903d7dc57d6baeb76e442 --- /dev/null +++ b/sepformer1/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5772c0ed40c85a2ea12fa9c2bbf6fe5a247bec060d13e63e97db085340bb4c +size 113112646 diff --git a/sepformer1/optimizer.ckpt b/sepformer1/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..739d1627f3198b45ebf44316a0e359c5b22ee4a7 --- /dev/null +++ b/sepformer1/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868cfce7a448f4a3c1045fe069b8da4a335bb2161e96ccd2323a0c4981d30f26 +size 205693881 diff --git a/sepformer2/CKPT.yaml b/sepformer2/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a593e3140e0a412dffb788a9b01e1415b37c18d --- /dev/null +++ b/sepformer2/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -7.248838132530435 +unixtime: 1631926460.3493145 diff --git a/sepformer2/brain.ckpt b/sepformer2/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/sepformer2/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/sepformer2/counter.ckpt b/sepformer2/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..808889747520c813180d488ebe6cd614fca66432 --- /dev/null +++ b/sepformer2/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f74efabef12ea619e30b79bddef89cffa9dda494761681ca862cff2871a85980 +size 2 diff --git a/sepformer2/decoder.ckpt b/sepformer2/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..746504d7d713c4e8b17fe88a2d3745e500856edd --- /dev/null +++ b/sepformer2/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee854d9e42e953b5e1e1e13c946fa07b67b0ed619661b9606744f66d27f4b66 +size 17272 diff --git a/sepformer2/encoder.ckpt b/sepformer2/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6a563b4b1426df689eb2de1b3e018260e0b5e55b --- /dev/null +++ b/sepformer2/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3b6254aa3844cc2ce414b52a9194159420cf0428d729080ff368e1dd0475221d +size 17272 diff --git a/sepformer2/hyperparams.yaml b/sepformer2/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e743acb34ec628cadb2e97686a1ab3f2a791f300 --- /dev/null +++ b/sepformer2/hyperparams.yaml @@ -0,0 +1,184 @@ +# Generated 2021-09-23 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. /yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: sepformer-whamr +output_folder: results/sepformer-whamr/3 +train_log: results/sepformer-whamr/3/train_log.txt +save_folder: results/sepformer-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/sepformer-whamr/3/save/whamr_tr.csv +valid_data: results/sepformer-whamr/3/save/whamr_cv.csv +test_data: results/sepformer-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. 
+rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 2 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: false + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/sepformer-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/sepformer-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref <Encoder> + masknet: !ref <MaskNet> + decoder: !ref <Decoder> diff --git a/sepformer2/lr_scheduler.ckpt b/sepformer2/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..41eb6410bcf764a44095a5e51b8918dec771dfad --- /dev/null +++ b/sepformer2/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa4d3fc197e94be89f041d94a3291b4202f8e2f8c45cc55ebc510894e4b74b48 +size 1135 diff --git a/sepformer2/masknet.ckpt b/sepformer2/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..02469532edb3082b45bca0d7ecf67df36630688e --- /dev/null +++ b/sepformer2/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f97d50ea5109f93a4344ad09de7e37094517b230ea51fedcbed5c774d349f80b +size 113112646 diff --git a/sepformer2/optimizer.ckpt b/sepformer2/optimizer.ckpt new file mode 100644 index
0000000000000000000000000000000000000000..c0eb44c199d3c1b9192cd51708ce3d386833a9ca --- /dev/null +++ b/sepformer2/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0388684a6697139e7b26cdbc6b0a59c6ea1d8cfd3267138858a27724b64216cb +size 205694713 diff --git a/sepformer3/CKPT.yaml b/sepformer3/CKPT.yaml new file mode 100644 index 0000000000000000000000000000000000000000..969452641b4d01190401006d488b603d6b7f701e --- /dev/null +++ b/sepformer3/CKPT.yaml @@ -0,0 +1,4 @@ +# yamllint disable +end-of-epoch: true +si-snr: -8.561470244863644 +unixtime: 1632574491.4719393 diff --git a/sepformer3/brain.ckpt b/sepformer3/brain.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b7c35d9d9c2f9520d4975291c60e1e5e2d16f815 --- /dev/null +++ b/sepformer3/brain.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e24193f36931b7f57932532efbdcf64971f42732383ba6808825f77db258f6 +size 28 diff --git a/sepformer3/counter.ckpt b/sepformer3/counter.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6d82abc5767e7a68f080e47c67a1a4a05edc17b5 --- /dev/null +++ b/sepformer3/counter.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e612bd1f5d132a339575b8dafb7842c64614e56bcf3d5ab65a0bc4b34329407 +size 3 diff --git a/sepformer3/decoder.ckpt b/sepformer3/decoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..d49b4ed3c00853f6e6359867f4f9467e8424d44b --- /dev/null +++ b/sepformer3/decoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe9ba6f6e6e54de10ffc878a7a228e725281301d5a393870d455c0695abd32e +size 17272 diff --git a/sepformer3/encoder.ckpt b/sepformer3/encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..3954f06dd1652174427d1f1015a6c7be972581c0 --- /dev/null +++ b/sepformer3/encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:336c62fbb678eca723dfaf8f05e8d18a35a66f2c125a360e02b7019a0d69fa4f +size 17272 diff --git a/sepformer3/hyperparams.yaml b/sepformer3/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e743acb34ec628cadb2e97686a1ab3f2a791f300 --- /dev/null +++ b/sepformer3/hyperparams.yaml @@ -0,0 +1,184 @@ +# Generated 2021-09-23 from: +# /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml +# yamllint disable +# ################################ +# Model: SepFormer for source separation +# https://arxiv.org/abs/2010.13154 +# +# Dataset : WHAMR! +# ################################ +# Basic parameters +# Seed needs to be set at top of yaml, before objects with parameters are made +# +seed: 3 +__set_seed: !apply:torch.manual_seed [3] + +# Data params + +# the data folder for the wham dataset +# data_folder needs to follow the format: /yourpath/whamr. +# make sure to use the name whamr at your top folder for the dataset! +data_folder: /network/tmp1/subakany/whamr + +# the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used +# e.g. 
/yourpath/wsj0-processed/si_tr_s/ +# you need to convert the original wsj0 to 8k +# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py +base_folder_dm: /network/tmp1/subakany/wsj0-processed/si_tr_s/ + +experiment_name: sepformer-whamr +output_folder: results/sepformer-whamr/3 +train_log: results/sepformer-whamr/3/train_log.txt +save_folder: results/sepformer-whamr/3/save + +# the file names should start with whamr instead of whamorg +train_data: results/sepformer-whamr/3/save/whamr_tr.csv +valid_data: results/sepformer-whamr/3/save/whamr_cv.csv +test_data: results/sepformer-whamr/3/save/whamr_tt.csv +skip_prep: false + +# Experiment params +auto_mix_prec: true # Set it to True for mixed precision +test_only: false +num_spks: 2 # set to 3 for wsj0-3mix +progressbar: true +save_audio: false # Save estimated sources on disk +sample_rate: 8000 + +# Training parameters +N_epochs: 200 +batch_size: 1 +lr: 0.00015 +clip_grad_norm: 5 +loss_upper_lim: 999999 # this is the upper limit for an acceptable loss +# if True, the training sequences are cut to a specified length +limit_training_signal_len: false +# this is the length of sequences if we choose to limit +# the signal length of training sequences +training_signal_len: 32000000 + +# Set it to True to dynamically create mixtures at training time +dynamic_mixing: true + +# Parameters for data augmentation + +# rir_path variable points to the directory of the room impulse responses +# e.g. /miniscratch/subakany/rir_wavs +# If the path does not exist, it is created automatically. +rir_path: /miniscratch/subakany/whamr_rirs_wav + +use_wavedrop: false +use_speedperturb: true +use_speedperturb_sameforeachsource: false +use_rand_shift: false +min_shift: -8000 +max_shift: 8000 + +speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 1.0 + drop_freq_prob: 0.0 + drop_chunk_prob: 0.0 + sample_rate: 8000 + speeds: [95, 100, 105] + +wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + perturb_prob: 0.0 + drop_freq_prob: 1.0 + drop_chunk_prob: 1.0 + sample_rate: 8000 + +# loss thresholding -- this thresholds the training loss +threshold_byloss: true +threshold: -30 + +# Encoder parameters +N_encoder_out: 256 +out_channels: 256 +kernel_size: 16 +kernel_stride: 8 + +# Dataloader options +dataloader_opts: + batch_size: 1 + num_workers: 3 + +# Specifying the network +Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder + kernel_size: 16 + out_channels: 256 + + +SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock + num_layers: 8 + d_model: 256 + nhead: 8 + d_ffn: 1024 + dropout: 0 + use_positional_encoding: true + norm_before: true + +MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model + + num_spks: 2 + in_channels: 256 + out_channels: 256 + num_layers: 2 + K: 250 + intra_model: *id001 + inter_model: *id002 + norm: ln + linear_layer_after_inter_intra: false + skip_around_intra: true + +Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder + in_channels: 256 + out_channels: 1 + kernel_size: 16 + stride: 8 + bias: false + +optimizer: !name:torch.optim.Adam + lr: 0.00015 + weight_decay: 0 + +loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper + +lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau + + 
factor: 0.5 + patience: 2 + dont_halve_until_epoch: 85 + +epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter + limit: 200 + +modules: + encoder: *id003 + decoder: *id004 + masknet: *id005 +save_all_checkpoints: true +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: results/sepformer-whamr/3/save + recoverables: + encoder: *id003 + decoder: *id004 + masknet: *id005 + counter: *id006 + lr_scheduler: *id007 +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: results/sepformer-whamr/3/train_log.txt + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + encoder: !ref <Encoder> + masknet: !ref <MaskNet> + decoder: !ref <Decoder> diff --git a/sepformer3/lr_scheduler.ckpt b/sepformer3/lr_scheduler.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e651c76f147c8205a38e577d531567a558b00c1f --- /dev/null +++ b/sepformer3/lr_scheduler.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:929792a757156577aa4efbe0208fe38b3f9a58e51951c7d83c15eb142251cf4d +size 1839 diff --git a/sepformer3/masknet.ckpt b/sepformer3/masknet.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f9c3ba0180018944af5e9e9edd3d9ab55b3b85bc --- /dev/null +++ b/sepformer3/masknet.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7121e0143b8d1786410c0ae377769ba13ebf25b36ef32d17df4c28c1451693b +size 113112646 diff --git a/sepformer3/optimizer.ckpt b/sepformer3/optimizer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..7a1490cafdfdd2b1af127d6414bfaade2952bef7 --- /dev/null +++ b/sepformer3/optimizer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab2f758f52d1f9f4ccf88e5ffab75faddcbed481062ab71d7a4c427ad1c6e47 +size 205694713
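
Not part of the diff above, but a note for readers of this checkpoint dump: each `sepformerN/` folder pairs a generated `hyperparams.yaml` with Git LFS pointers to what appear to be SpeechBrain checkpointer artifacts (encoder, masknet, decoder, optimizer, lr_scheduler, counter, brain), and the accompanying `CKPT.yaml` records an `si-snr` value and an end-of-epoch flag for each save (e.g. -8.56 for `sepformer3`). The sketch below is a minimal, illustrative way such a folder might be restored locally; it assumes `git lfs pull` has been run, a SpeechBrain/HyperPyYAML version contemporary with this recipe is installed, and each `*.ckpt` holds a plain torch `state_dict` (SpeechBrain's default save hook). The override paths are placeholders, not values from this repo.

```python
# Illustrative sketch only -- not part of the repository; paths and overrides are assumptions.
import torch
from hyperpyyaml import load_hyperpyyaml

CKPT_DIR = "sepformer3"  # one of the checkpoint folders added in this diff

# Re-instantiate the encoder / masknet / decoder exactly as the recipe's YAML defines them.
with open(f"{CKPT_DIR}/hyperparams.yaml") as f:
    hparams = load_hyperpyyaml(
        f,
        # The generated YAML hard-codes cluster paths; override them for a local run.
        overrides={
            "data_folder": "/path/to/whamr",
            "base_folder_dm": "/path/to/wsj0-processed/si_tr_s",
            "rir_path": "/path/to/whamr_rirs_wav",
        },
    )

# Load the saved weights into the freshly built modules (assumed to be torch state_dicts).
for name in ("encoder", "masknet", "decoder"):
    state = torch.load(f"{CKPT_DIR}/{name}.ckpt", map_location="cpu")
    hparams["modules"][name].load_state_dict(state)
    hparams["modules"][name].eval()
```

From here, separation would follow the recipe's usual encoder → masknet → decoder path. Alternatively, copying one of these folders into the recipe's `results/sepformer-whamr/3/save/` tree under the usual `CKPT+...` naming should let the `checkpointer` defined in the same YAML recover it, though that workflow is not shown in this diff.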