saeki committed
Commit · 7b918f7
1 Parent(s): e6364e9
fix

Browse files:
- aet.py +368 -0
- dataset.py +344 -0
- eval.py +67 -0
- lightning_module.py +875 -0
- model.py +854 -0
- preprocess.py +152 -0
- train.py +106 -0
- utils.py +147 -0
aet.py
ADDED
@@ -0,0 +1,368 @@
import argparse
import pathlib
import yaml
import torch
import torchaudio
from torch.utils.data import DataLoader
import numpy as np
import random
import librosa
from dataset import Dataset
import pickle
from lightning_module import (
    SSLStepLightningModule,
    SSLDualLightningModule,
)
from utils import plot_and_save_mels
import os
import tqdm


class AETDataset(Dataset):
    # Pairs preprocessed source/target utterances for acoustic effect transfer.
    # normalize_waveform and calc_spectrogram are inherited from Dataset.
    def __init__(self, filetxt, src_config, tar_config):
        self.config = src_config

        self.preprocessed_dir_src = pathlib.Path(
            src_config["general"]["preprocessed_path"]
        )
        self.preprocessed_dir_tar = pathlib.Path(
            tar_config["general"]["preprocessed_path"]
        )
        # Both corpora must share the same signal-processing settings.
        for item in [
            "sampling_rate",
            "fft_length",
            "frame_length",
            "frame_shift",
            "fmin",
            "fmax",
            "n_mels",
        ]:
            assert src_config["preprocess"][item] == tar_config["preprocess"][item]

        self.spec_module = torchaudio.transforms.MelSpectrogram(
            sample_rate=src_config["preprocess"]["sampling_rate"],
            n_fft=src_config["preprocess"]["fft_length"],
            win_length=src_config["preprocess"]["frame_length"],
            hop_length=src_config["preprocess"]["frame_shift"],
            f_min=src_config["preprocess"]["fmin"],
            f_max=src_config["preprocess"]["fmax"],
            n_mels=src_config["preprocess"]["n_mels"],
            power=1,
            center=True,
            norm="slaney",
            mel_scale="slaney",
        )

        with open(self.preprocessed_dir_src / filetxt, "r") as fr:
            self.filelist_src = [pathlib.Path(path.strip("\n")) for path in fr]
        with open(self.preprocessed_dir_tar / filetxt, "r") as fr:
            self.filelist_tar = [pathlib.Path(path.strip("\n")) for path in fr]

        self.d_out = {"src": {}, "tar": {}}
        for item in ["wavs", "wavsaux"]:
            self.d_out["src"][item] = []
            self.d_out["tar"][item] = []

        for swp in self.filelist_src:
            if src_config["general"]["corpus_type"] == "single":
                basename = str(swp.stem)
            else:
                basename = str(swp.parent.name) + "-" + str(swp.stem)
            with open(
                self.preprocessed_dir_src / "{}.pickle".format(basename), "rb"
            ) as fr:
                d_preprocessed = pickle.load(fr)
            for item in ["wavs", "wavsaux"]:
                try:
                    self.d_out["src"][item].extend(d_preprocessed[item])
                except KeyError:
                    pass  # this corpus has no auxiliary (clean) waveforms

        for twp in self.filelist_tar:
            if tar_config["general"]["corpus_type"] == "single":
                basename = str(twp.stem)
            else:
                basename = str(twp.parent.name) + "-" + str(twp.stem)
            with open(
                self.preprocessed_dir_tar / "{}.pickle".format(basename), "rb"
            ) as fr:
                d_preprocessed = pickle.load(fr)
            for item in ["wavs", "wavsaux"]:
                try:
                    self.d_out["tar"][item].extend(d_preprocessed[item])
                except KeyError:
                    pass

        # Truncate both sides to the same number of utterances.
        min_len = min(len(self.d_out["src"]["wavs"]), len(self.d_out["tar"]["wavs"]))
        for spk in ["src", "tar"]:
            for item in ["wavs", "wavsaux"]:
                if self.d_out[spk][item] is not None:
                    self.d_out[spk][item] = np.asarray(self.d_out[spk][item][:min_len])

    def __len__(self):
        return len(self.d_out["src"]["wavs"])

    def __getitem__(self, idx):
        d_batch = {}

        for spk in ["src", "tar"]:
            for item in ["wavs", "wavsaux"]:
                if self.d_out[spk][item].size > 0:
                    d_batch["{}_{}".format(item, spk)] = torch.from_numpy(
                        self.d_out[spk][item][idx]
                    )
                    d_batch["{}_{}".format(item, spk)] = self.normalize_waveform(
                        d_batch["{}_{}".format(item, spk)], db=-3
                    )

        d_batch["melspecs_src"] = self.calc_spectrogram(d_batch["wavs_src"])
        return d_batch


class AETModule(torch.nn.Module):
    """
    src: Dataset from which we extract the channel features
    tar: Dataset to which the src channel features are added
    """

    def __init__(self, args, chmatch_config, src_config, tar_config):
        super().__init__()
        if args.stage == "ssl-step":
            LModule = SSLStepLightningModule
        elif args.stage == "ssl-dual":
            LModule = SSLDualLightningModule
        else:
            raise NotImplementedError()

        src_model = LModule(src_config).load_from_checkpoint(
            checkpoint_path=chmatch_config["general"]["source"]["ckpt_path"],
            config=src_config,
        )
        self.src_config = src_config

        self.encoder_src = src_model.encoder
        if src_config["general"]["use_gst"]:
            self.gst_src = src_model.gst
        else:
            self.channelfeats_src = src_model.channelfeats
        self.channel_src = src_model.channel

    def forward(self, melspecs_src, wavsaux_tar):
        # Extract channel features from the source mel spectrogram and
        # imprint them on the clean target waveform.
        if self.src_config["general"]["use_gst"]:
            chfeats_src = self.gst_src(melspecs_src.transpose(1, 2))
        else:
            _, enc_hidden_src = self.encoder_src(
                melspecs_src.unsqueeze(1).transpose(2, 3)
            )
            chfeats_src = self.channelfeats_src(enc_hidden_src)
        wavschmatch_tar = self.channel_src(wavsaux_tar, chfeats_src)
        return wavschmatch_tar


def get_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--stage", required=True, type=str)
    parser.add_argument("--config_path", required=True, type=pathlib.Path)
    parser.add_argument("--exist_src_aux", action="store_true")
    parser.add_argument("--run_name", required=True, type=str)
    return parser.parse_args()


def main(args, chmatch_config, device):
    src_config = yaml.load(
        open(chmatch_config["general"]["source"]["config_path"], "r"),
        Loader=yaml.FullLoader,
    )
    tar_config = yaml.load(
        open(chmatch_config["general"]["target"]["config_path"], "r"),
        Loader=yaml.FullLoader,
    )
    output_path = pathlib.Path(chmatch_config["general"]["output_path"]) / args.run_name
    dataset = AETDataset("test.txt", src_config, tar_config)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    chmatch_module = AETModule(args, chmatch_config, src_config, tar_config).to(device)

    if args.exist_src_aux:
        char_vector = calc_deg_charactaristics(chmatch_config)

    for idx, batch in enumerate(tqdm.tqdm(loader)):
        melspecs_src = batch["melspecs_src"].to(device)
        wavsdeg_src = batch["wavs_src"].to(device)
        wavsaux_tar = batch["wavsaux_tar"].to(device)
        if args.exist_src_aux:
            wavsdegbaseline_tar = calc_deg_baseline(
                batch["wavsaux_tar"], char_vector, tar_config
            )
            wavsdegbaseline_tar = normalize_waveform(wavsdegbaseline_tar, tar_config)
            wavsdeg_tar = batch["wavs_tar"].to(device)
        wavsmatch_tar = normalize_waveform(
            chmatch_module(melspecs_src, wavsaux_tar).cpu().detach(), tar_config
        )
        torchaudio.save(
            output_path / "test_wavs" / "{}-src_wavsdeg.wav".format(idx),
            wavsdeg_src.cpu(),
            src_config["preprocess"]["sampling_rate"],
        )
        torchaudio.save(
            output_path / "test_wavs" / "{}-tar_wavsaux.wav".format(idx),
            wavsaux_tar.cpu(),
            tar_config["preprocess"]["sampling_rate"],
        )
        if args.exist_src_aux:
            torchaudio.save(
                output_path / "test_wavs" / "{}-tar_wavsdegbaseline.wav".format(idx),
                wavsdegbaseline_tar.cpu(),
                tar_config["preprocess"]["sampling_rate"],
            )
            torchaudio.save(
                output_path / "test_wavs" / "{}-tar_wavsdeg.wav".format(idx),
                wavsdeg_tar.cpu(),
                tar_config["preprocess"]["sampling_rate"],
            )
        torchaudio.save(
            output_path / "test_wavs" / "{}-tar_wavsmatch.wav".format(idx),
            wavsmatch_tar.cpu(),
            tar_config["preprocess"]["sampling_rate"],
        )
        plot_and_save_mels(
            wavsdeg_src[0, ...].cpu().detach(),
            output_path / "test_mels" / "{}-src_melsdeg.png".format(idx),
            src_config,
        )
        plot_and_save_mels(
            wavsaux_tar[0, ...].cpu().detach(),
            output_path / "test_mels" / "{}-tar_melsaux.png".format(idx),
            tar_config,
        )
        if args.exist_src_aux:
            plot_and_save_mels(
                wavsdegbaseline_tar[0, ...].cpu().detach(),
                output_path / "test_mels" / "{}-tar_melsdegbaseline.png".format(idx),
                tar_config,
            )
            plot_and_save_mels(
                wavsdeg_tar[0, ...].cpu().detach(),
                output_path / "test_mels" / "{}-tar_melsdeg.png".format(idx),
                tar_config,
            )
        plot_and_save_mels(
            wavsmatch_tar[0, ...].cpu().detach(),
            output_path / "test_mels" / "{}-tar_melsmatch.png".format(idx),
            tar_config,
        )


def calc_deg_baseline(wav, char_vector, tar_config):
    # Imprint the averaged channel characteristic on a clean waveform
    # by per-bin multiplication in the STFT domain.
    wav = wav[0, ...].cpu().detach().numpy()
    spec = librosa.stft(
        wav,
        n_fft=tar_config["preprocess"]["fft_length"],
        hop_length=tar_config["preprocess"]["frame_shift"],
        win_length=tar_config["preprocess"]["frame_length"],
    )
    spec_converted = spec * char_vector.reshape(-1, 1)
    wav_converted = librosa.istft(
        spec_converted,
        hop_length=tar_config["preprocess"]["frame_shift"],
        win_length=tar_config["preprocess"]["frame_length"],
    )
    wav_converted = torch.from_numpy(wav_converted).to(torch.float32).unsqueeze(0)
    return wav_converted


def calc_deg_charactaristics(chmatch_config):
    src_config = yaml.load(
        open(chmatch_config["general"]["source"]["config_path"], "r"),
        Loader=yaml.FullLoader,
    )
    tar_config = yaml.load(
        open(chmatch_config["general"]["target"]["config_path"], "r"),
        Loader=yaml.FullLoader,
    )
    # configs
    preprocessed_dir = pathlib.Path(src_config["general"]["preprocessed_path"])
    n_train = src_config["preprocess"]["n_train"]
    SR = src_config["preprocess"]["sampling_rate"]

    os.makedirs(preprocessed_dir, exist_ok=True)

    sourcepath = pathlib.Path(src_config["general"]["source_path"])

    if src_config["general"]["corpus_type"] == "single":
        fulllist = list(sourcepath.glob("*.wav"))
        random.seed(0)
        random.shuffle(fulllist)
        train_filelist = fulllist[:n_train]
    elif src_config["general"]["corpus_type"] == "multi-seen":
        fulllist = list(sourcepath.glob("*/*.wav"))
        random.seed(0)
        random.shuffle(fulllist)
        train_filelist = fulllist[:n_train]
    elif src_config["general"]["corpus_type"] == "multi-unseen":
        spk_list = list(set([x.parent for x in sourcepath.glob("*/*.wav")]))
        train_filelist = []
        random.seed(0)
        random.shuffle(spk_list)
        for i, spk in enumerate(spk_list):
            sourcespkpath = sourcepath / spk
            if i < n_train:
                train_filelist.extend(list(sourcespkpath.glob("*.wav")))
    else:
        raise NotImplementedError(
            "corpus_type specified in config.yaml should be {single, multi-seen, multi-unseen}"
        )

    specs_all = np.zeros((tar_config["preprocess"]["fft_length"] // 2 + 1, 1))

    for wp in tqdm.tqdm(train_filelist):
        wav, _ = librosa.load(wp, sr=SR)
        spec = np.abs(
            librosa.stft(
                wav,
                n_fft=src_config["preprocess"]["fft_length"],
                hop_length=src_config["preprocess"]["frame_shift"],
                win_length=src_config["preprocess"]["frame_length"],
            )
        )

        auxpath = pathlib.Path(src_config["general"]["aux_path"])
        if src_config["general"]["corpus_type"] == "single":
            wav_aux, _ = librosa.load(auxpath / wp.name, sr=SR)
        else:
            wav_aux, _ = librosa.load(auxpath / wp.parent.name / wp.name, sr=SR)
        spec_aux = np.abs(
            librosa.stft(
                wav_aux,
                n_fft=src_config["preprocess"]["fft_length"],
                hop_length=src_config["preprocess"]["frame_shift"],
                win_length=src_config["preprocess"]["frame_length"],
            )
        )
        min_len = min(spec.shape[1], spec_aux.shape[1])
        spec_diff = spec[:, :min_len] / (spec_aux[:, :min_len] + 1e-10)
        specs_all = np.hstack([specs_all, np.mean(spec_diff, axis=1).reshape(-1, 1)])

    char_vector = np.mean(specs_all, axis=1)
    char_vector = char_vector / (np.sum(char_vector) + 1e-10)
    return char_vector


def normalize_waveform(wav, tar_config, db=-3):
    wav, _ = torchaudio.sox_effects.apply_effects_tensor(
        wav,
        tar_config["preprocess"]["sampling_rate"],
        [["norm", "{}".format(db)]],
    )
    return wav


if __name__ == "__main__":
    args = get_arg()
    chmatch_config = yaml.load(open(args.config_path, "r"), Loader=yaml.FullLoader)
    output_path = pathlib.Path(chmatch_config["general"]["output_path"]) / args.run_name
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(output_path / "test_wavs", exist_ok=True)
    os.makedirs(output_path / "test_mels", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    main(args, chmatch_config, device)
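Note: calc_deg_charactaristics and calc_deg_baseline above form a simple spectral-ratio baseline: the channel is summarized as the per-frequency-bin mean of |STFT(degraded)| / |STFT(clean)| over the training files, and is imprinted on a clean waveform by multiplication in the STFT domain. A self-contained sketch of the same idea for a single file pair; the FFT parameters here are illustrative assumptions, not the repository's configuration:

import numpy as np
import librosa

def estimate_char_vector(deg_wav, clean_wav, n_fft=1024, hop=256, win=1024):
    # Per-bin mean magnitude ratio, mirroring calc_deg_charactaristics.
    spec_deg = np.abs(librosa.stft(deg_wav, n_fft=n_fft, hop_length=hop, win_length=win))
    spec_cln = np.abs(librosa.stft(clean_wav, n_fft=n_fft, hop_length=hop, win_length=win))
    min_len = min(spec_deg.shape[1], spec_cln.shape[1])
    ratio = spec_deg[:, :min_len] / (spec_cln[:, :min_len] + 1e-10)
    char = ratio.mean(axis=1)
    return char / (char.sum() + 1e-10)

def apply_char_vector(clean_wav, char, n_fft=1024, hop=256, win=1024):
    # Imprint the characteristic and invert, mirroring calc_deg_baseline.
    spec = librosa.stft(clean_wav, n_fft=n_fft, hop_length=hop, win_length=win)
    return librosa.istft(spec * char.reshape(-1, 1), hop_length=hop, win_length=win)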
dataset.py
ADDED
@@ -0,0 +1,344 @@
import pickle
import pathlib
import torch
from torch.utils.data.dataloader import DataLoader
import pytorch_lightning as pl
import numpy as np
import yaml
import torchaudio
import pyworld
import pysptk
import random


class DataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batchsize = config["train"]["batchsize"]
        self.preprocessed_dir = pathlib.Path(config["general"]["preprocessed_path"])

    def setup(self, stage):

        if not self.preprocessed_dir.exists():
            raise RuntimeError("Preprocessed directory was not found")

        if "dual" in self.config:
            if self.config["dual"]["enable"]:
                task_config = yaml.load(
                    open(self.config["dual"]["config_path"], "r"),
                    Loader=yaml.FullLoader,
                )
                task_preprocessed_dir = (
                    self.preprocessed_dir.parent
                    / pathlib.Path(task_config["general"]["preprocessed_path"]).name
                )
                if not task_preprocessed_dir.exists():
                    raise RuntimeError(
                        "Preprocessed directory for multi-task learning was not found"
                    )

        self.flnames = {
            "train": "train.txt",
            "val": "val.txt",
            "test": "test.txt",
        }

    def get_ds(self, phase):
        ds = Dataset(self.flnames[phase], self.config)
        return ds

    def get_loader(self, phase):
        ds = self.get_ds(phase)
        dl = DataLoader(
            ds,
            self.batchsize,
            shuffle=(phase == "train"),
            num_workers=self.config["train"]["num_workers"],
            drop_last=True,
        )
        return dl

    def train_dataloader(self):
        return self.get_loader(phase="train")

    def val_dataloader(self):
        return self.get_loader(phase="val")

    def test_dataloader(self):
        return self.get_loader(phase="test")
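Note: DataModule follows the standard LightningDataModule contract, so it plugs directly into a Trainer. A minimal wiring sketch; the config path and the choice of PretrainLightningModule are assumptions for illustration (train.py, added in this commit but not shown here, is the actual entry point):

import yaml
from pytorch_lightning import Trainer

from dataset import DataModule
from lightning_module import PretrainLightningModule

config = yaml.load(open("configs/pretrain.yaml", "r"), Loader=yaml.FullLoader)  # assumed path
datamodule = DataModule(config)  # builds train/val/test loaders from the filelists
model = PretrainLightningModule(config)
Trainer(max_epochs=1).fit(model, datamodule=datamodule)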
class Dataset(torch.utils.data.Dataset):
    def __init__(self, filetxt, config):

        self.preprocessed_dir = pathlib.Path(config["general"]["preprocessed_path"])
        self.config = config
        self.spec_module = torchaudio.transforms.MelSpectrogram(
            sample_rate=config["preprocess"]["sampling_rate"],
            n_fft=config["preprocess"]["fft_length"],
            win_length=config["preprocess"]["frame_length"],
            hop_length=config["preprocess"]["frame_shift"],
            f_min=config["preprocess"]["fmin"],
            f_max=config["preprocess"]["fmax"],
            n_mels=config["preprocess"]["n_mels"],
            power=1,
            center=True,
            norm="slaney",
            mel_scale="slaney",
        )
        self.resample_candidate = [8000, 11025, 12000, 16000]
        self.quantization_candidate = range(2 ** 6, 2 ** 10 + 2, 2)
        self.segment_length = config["preprocess"]["segment_length"]

        with open(self.preprocessed_dir / filetxt, "r") as fr:
            self.filelist = [pathlib.Path(path.strip("\n")) for path in fr]

        self.d_out = dict()
        for item in ["wavs", "wavsaux"]:
            self.d_out[item] = []

        for wp in self.filelist:

            if config["general"]["corpus_type"] == "single":
                basename = str(wp.stem)
            else:
                basename = str(wp.parent.name) + "-" + str(wp.stem)

            with open(self.preprocessed_dir / "{}.pickle".format(basename), "rb") as fr:
                d_preprocessed = pickle.load(fr)

            for item in ["wavs", "wavsaux"]:
                try:
                    self.d_out[item].extend(d_preprocessed[item])
                except KeyError:
                    pass  # auxiliary waveforms may be absent

        for item in ["wavs", "wavsaux"]:
            if self.d_out[item] is not None:
                self.d_out[item] = np.asarray(self.d_out[item])

        if "dual" in self.config:
            if self.config["dual"]["enable"]:
                task_config = yaml.load(
                    open(config["dual"]["config_path"], "r"),
                    Loader=yaml.FullLoader,
                )
                task_preprocessed_dir = (
                    self.preprocessed_dir.parent
                    / pathlib.Path(task_config["general"]["preprocessed_path"]).name
                )
                with open(task_preprocessed_dir / filetxt, "r") as fr:
                    task_filelist = [pathlib.Path(path.strip("\n")) for path in fr]
                self.d_out["wavstask"] = []
                for wp in task_filelist:
                    if task_config["general"]["corpus_type"] == "single":
                        basename = str(wp.stem)
                    else:
                        basename = str(wp.parent.name) + "-" + str(wp.stem)
                    with open(
                        task_preprocessed_dir / "{}.pickle".format(basename), "rb"
                    ) as fr:
                        d_preprocessed = pickle.load(fr)
                    self.d_out["wavstask"].extend(d_preprocessed["wavs"])
                self.d_out["wavstask"] = np.asarray(self.d_out["wavstask"])

    def __len__(self):
        return len(self.d_out["wavs"])

    def __getitem__(self, idx):

        d_batch = {}

        if self.d_out["wavs"].size > 0:
            d_batch["wavs"] = torch.from_numpy(self.d_out["wavs"][idx])
            if self.segment_length > 0:
                d_batch["wavs"] = self.get_segment(d_batch["wavs"], self.segment_length)

        if self.d_out["wavsaux"].size > 0:
            d_batch["wavsaux"] = torch.from_numpy(self.d_out["wavsaux"][idx])
            if self.segment_length > 0:
                d_batch["wavsaux"] = self.get_segment(
                    d_batch["wavsaux"], self.segment_length
                )

        if self.config["general"]["stage"] == "pretrain":
            if self.config["train"]["augment"]:
                d_batch["wavs"] = self.augmentation(d_batch["wavsaux"])
            d_batch["wavs"] = self.normalize_waveform(d_batch["wavs"], db=-3)
            d_batch["wavsaux"] = self.normalize_waveform(d_batch["wavsaux"], db=-3)
            if len(d_batch["wavs"]) != len(d_batch["wavsaux"]):
                min_seq_len = min(len(d_batch["wavs"]), len(d_batch["wavsaux"]))
                d_batch["wavs"] = d_batch["wavs"][:min_seq_len]
                d_batch["wavsaux"] = d_batch["wavsaux"][:min_seq_len]
            d_batch["melspecs"] = self.calc_spectrogram(d_batch["wavs"])
            if self.config["general"]["feature_type"] == "melspec":
                d_batch["melspecsaux"] = self.calc_spectrogram(d_batch["wavsaux"])
            elif self.config["general"]["feature_type"] == "vocfeats":
                d_batch["melceps"] = self.calc_melcep(d_batch["wavsaux"])
                d_batch["f0s"] = self.calc_f0(d_batch["wavs"])
                d_batch["melcepssrc"] = self.calc_melcep(d_batch["wavs"])
            else:
                raise NotImplementedError()

        elif self.config["general"]["stage"].startswith("ssl"):
            d_batch["wavs"] = self.normalize_waveform(d_batch["wavs"], db=-3)
            d_batch["melspecs"] = self.calc_spectrogram(d_batch["wavs"])
            if self.config["general"]["feature_type"] == "vocfeats":
                d_batch["f0s"] = self.calc_f0(d_batch["wavs"])
                d_batch["melcepssrc"] = self.calc_melcep(d_batch["wavs"])
            if self.d_out["wavsaux"].size > 0:
                d_batch["wavsaux"] = self.normalize_waveform(d_batch["wavsaux"], db=-3)
                if self.config["general"]["feature_type"] == "melspec":
                    d_batch["melspecsaux"] = self.calc_spectrogram(d_batch["wavsaux"])
                elif self.config["general"]["feature_type"] == "vocfeats":
                    d_batch["melceps"] = self.calc_melcep(d_batch["wavsaux"])
            if "dual" in self.config:
                if self.config["dual"]["enable"]:
                    d_batch["wavstask"] = torch.from_numpy(self.d_out["wavstask"][idx])
                    d_batch["wavstask"] = self.get_segment(
                        d_batch["wavstask"], self.segment_length
                    )
                    d_batch["wavstask"] = self.normalize_waveform(
                        d_batch["wavstask"], db=-3
                    )
                    if self.config["general"]["feature_type"] == "melspec":
                        d_batch["melspecstask"] = self.calc_spectrogram(
                            d_batch["wavstask"]
                        )
                    elif self.config["general"]["feature_type"] == "vocfeats":
                        d_batch["melcepstask"] = self.calc_melcep(d_batch["wavstask"])
                    else:
                        raise NotImplementedError()
        else:
            raise NotImplementedError()

        return d_batch

    def calc_spectrogram(self, wav):
        specs = self.spec_module(wav)
        log_spec = torch.log(
            torch.clamp_min(specs, self.config["preprocess"]["min_magnitude"])
            * self.config["preprocess"]["comp_factor"]
        ).to(torch.float32)
        return log_spec

    def calc_melcep(self, wav):
        # WORLD spectral envelope, then mel-cepstral coefficients via SPTK.
        wav = wav.numpy()
        _, sp, _ = pyworld.wav2world(
            wav.astype(np.float64),
            self.config["preprocess"]["sampling_rate"],
            fft_size=self.config["preprocess"]["fft_length"],
            frame_period=(
                self.config["preprocess"]["frame_shift"]
                / self.config["preprocess"]["sampling_rate"]
                * 1000
            ),
        )
        melcep = pysptk.sp2mc(
            sp,
            order=self.config["preprocess"]["cep_order"],
            alpha=pysptk.util.mcepalpha(self.config["preprocess"]["sampling_rate"]),
        ).transpose(1, 0)
        melcep = torch.from_numpy(melcep).to(torch.float32)
        return melcep

    def calc_f0(self, wav):
        if self.config["preprocess"]["f0_extractor"] == "dio":
            return self.calc_f0_dio(wav)
        elif self.config["preprocess"]["f0_extractor"] == "harvest":
            return self.calc_f0_harvest(wav)
        elif self.config["preprocess"]["f0_extractor"] == "swipe":
            return self.calc_f0_swipe(wav)
        else:
            raise NotImplementedError()

    def calc_f0_dio(self, wav):
        wav = wav.numpy()
        _f0, _t = pyworld.dio(
            wav.astype(np.float64),
            self.config["preprocess"]["sampling_rate"],
            frame_period=(
                self.config["preprocess"]["frame_shift"]
                / self.config["preprocess"]["sampling_rate"]
                * 1000
            ),
        )
        f0 = pyworld.stonemask(
            wav.astype(np.float64), _f0, _t, self.config["preprocess"]["sampling_rate"]
        )
        f0 = torch.from_numpy(f0).to(torch.float32)
        return f0

    def calc_f0_harvest(self, wav):
        wav = wav.numpy()
        _f0, _t = pyworld.harvest(
            wav.astype(np.float64),
            self.config["preprocess"]["sampling_rate"],
            frame_period=(
                self.config["preprocess"]["frame_shift"]
                / self.config["preprocess"]["sampling_rate"]
                * 1000
            ),
        )
        f0 = pyworld.stonemask(
            wav.astype(np.float64), _f0, _t, self.config["preprocess"]["sampling_rate"]
        )
        f0 = torch.from_numpy(f0).to(torch.float32)
        return f0

    def calc_f0_swipe(self, wav):
        wav = wav.numpy()
        f0 = pysptk.sptk.swipe(
            wav.astype(np.float64),
            fs=self.config["preprocess"]["sampling_rate"],
            min=71,
            max=800,
            hopsize=self.config["preprocess"]["frame_shift"],
            otype="f0",
        )
        f0 = torch.from_numpy(f0).to(torch.float32)
        return f0

    def augmentation(self, wav):
        # Pseudo-degradation: random mu-law quantization plus a random
        # down/up-resampling round trip.
        wav /= torch.max(torch.abs(wav))
        new_freq = random.choice(self.resample_candidate)
        new_quantization = random.choice(self.quantization_candidate)
        mulaw_encoder = torchaudio.transforms.MuLawEncoding(
            quantization_channels=new_quantization
        )
        wav_quantized = mulaw_encoder(wav) / new_quantization * 2.0 - 1.0
        downsampler = torchaudio.transforms.Resample(
            orig_freq=self.config["preprocess"]["sampling_rate"],
            new_freq=new_freq,
            resampling_method="sinc_interpolation",
            lowpass_filter_width=6,
            dtype=torch.float32,
        )
        upsampler = torchaudio.transforms.Resample(
            orig_freq=new_freq,
            new_freq=self.config["preprocess"]["sampling_rate"],
            resampling_method="sinc_interpolation",
            lowpass_filter_width=6,
            dtype=torch.float32,
        )
        wav_processed = upsampler(downsampler(wav_quantized))
        return wav_processed

    def normalize_waveform(self, wav, db=-3):
        wav, _ = torchaudio.sox_effects.apply_effects_tensor(
            wav.unsqueeze(0),
            self.config["preprocess"]["sampling_rate"],
            [["norm", "{}".format(db)]],
        )
        return wav.squeeze(0)

    def get_segment(self, wav, segment_length):
        # Random crop to a fixed length, zero-padding short utterances.
        seg_size = self.config["preprocess"]["sampling_rate"] * segment_length
        if len(wav) >= seg_size:
            max_wav_start = len(wav) - seg_size
            wav_start = random.randint(0, max_wav_start)
            wav = wav[wav_start : wav_start + seg_size]
        else:
            wav = torch.nn.functional.pad(wav, (0, seg_size - len(wav)), "constant")
        return wav
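Note: Dataset.augmentation synthesizes pseudo-degraded inputs for pretraining by mu-law quantization at a random resolution followed by a random down/up-resampling round trip. A self-contained sketch of that pipeline on a test tone; the sample rate, target rate, and quantization level are illustrative assumptions:

import math
import torch
import torchaudio

sr, new_freq, n_channels = 22050, 16000, 256
wav = torch.sin(2 * math.pi * 440 * torch.arange(sr, dtype=torch.float32) / sr)  # 1 s, 440 Hz

wav = wav / wav.abs().max()
mulaw = torchaudio.transforms.MuLawEncoding(quantization_channels=n_channels)
wav_q = mulaw(wav) / n_channels * 2.0 - 1.0  # quantize, then rescale to roughly [-1, 1]

down = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_freq)
up = torchaudio.transforms.Resample(orig_freq=new_freq, new_freq=sr)
wav_aug = up(down(wav_q))  # band-limited, quantized "degraded" waveform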
eval.py
ADDED
@@ -0,0 +1,67 @@
import argparse
import os
import pathlib
import yaml

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.loggers import TensorBoardLogger

from dataset import DataModule
from lightning_module import (
    PretrainLightningModule,
    SSLStepLightningModule,
    SSLDualLightningModule,
)


def get_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", required=True, type=pathlib.Path)
    parser.add_argument("--ckpt_path", required=True, type=pathlib.Path)
    parser.add_argument(
        "--stage", required=True, type=str, choices=["pretrain", "ssl-step", "ssl-dual"]
    )
    parser.add_argument("--run_name", required=True, type=str)
    return parser.parse_args()


def eval(args, config, output_path):

    csvlogger = CSVLogger(save_dir=output_path, name="test_log")
    trainer = Trainer(
        gpus=-1,
        deterministic=False,
        auto_select_gpus=True,
        benchmark=True,
        logger=[csvlogger],
        default_root_dir=os.getcwd(),
    )

    if config["general"]["stage"] == "pretrain":
        model = PretrainLightningModule(config).load_from_checkpoint(
            checkpoint_path=args.ckpt_path, config=config
        )
    elif config["general"]["stage"] == "ssl-step":
        model = SSLStepLightningModule(config).load_from_checkpoint(
            checkpoint_path=args.ckpt_path, config=config
        )
    elif config["general"]["stage"] == "ssl-dual":
        model = SSLDualLightningModule(config).load_from_checkpoint(
            checkpoint_path=args.ckpt_path, config=config
        )
    else:
        raise NotImplementedError()

    datamodule = DataModule(config)
    trainer.test(model=model, verbose=True, datamodule=datamodule)


if __name__ == "__main__":
    args = get_arg()
    config = yaml.load(open(args.config_path, "r"), Loader=yaml.FullLoader)
    output_path = str(pathlib.Path(config["general"]["output_path"]) / args.run_name)
    config["general"]["stage"] = str(getattr(args, "stage"))

    eval(args, config, output_path)
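Note: load_from_checkpoint is a classmethod in PyTorch Lightning, so the pattern PretrainLightningModule(config).load_from_checkpoint(...) used above (and in aet.py's AETModule) builds a throwaway instance before loading. An equivalent form that skips the redundant construction; the names refer to eval() above:

# load_from_checkpoint constructs the module itself and forwards extra
# keyword arguments (here, config) to __init__.
model = PretrainLightningModule.load_from_checkpoint(
    checkpoint_path=args.ckpt_path, config=config
)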
lightning_module.py
ADDED
@@ -0,0 +1,875 @@
import torch
import pytorch_lightning as pl
import torchaudio
import os
import pathlib
import tqdm
from model import (
    EncoderModule,
    ChannelFeatureModule,
    ChannelModule,
    MultiScaleSpectralLoss,
    GSTModule,
)
from utils import (
    manual_logging,
    load_vocoder,
    plot_and_save_mels,
    plot_and_save_mels_all,
)


class PretrainLightningModule(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()
        self.config = config
        if config["general"]["use_gst"]:
            self.encoder = EncoderModule(config)
            self.gst = GSTModule(config)
        else:
            self.encoder = EncoderModule(config, use_channel=True)
            self.channelfeats = ChannelFeatureModule(config)

        self.channel = ChannelModule(config)
        self.vocoder = load_vocoder(config)

        self.criteria_a = MultiScaleSpectralLoss(config)
        if "feature_loss" in config["train"]:
            if config["train"]["feature_loss"]["type"] == "mae":
                self.criteria_b = torch.nn.L1Loss()
            else:
                self.criteria_b = torch.nn.MSELoss()
        else:
            self.criteria_b = torch.nn.L1Loss()  # default feature loss (MAE)
        self.alpha = config["train"]["alpha"]

    def forward(self, melspecs, wavsaux):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(melspecs.unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(melspecs.transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(melspecs.unsqueeze(1).transpose(2, 3))
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        wavsdeg = self.channel(wavsaux, chfeats)
        return enc_out, wavsdeg

    def training_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        wavsdeg = self.channel(batch["wavsaux"], chfeats)
        loss_recons = self.criteria_a(wavsdeg, batch["wavs"])
        if self.config["general"]["feature_type"] == "melspec":
            loss_encoder = self.criteria_b(enc_out, batch["melspecsaux"])
        elif self.config["general"]["feature_type"] == "vocfeats":
            loss_encoder = self.criteria_b(enc_out, batch["melceps"])
        loss = self.alpha * loss_recons + (1.0 - self.alpha) * loss_encoder
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        self.log(
            "train_loss_recons",
            loss_recons,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "train_loss_encoder",
            loss_encoder,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        wavsdeg = self.channel(batch["wavsaux"], chfeats)
        loss_recons = self.criteria_a(wavsdeg, batch["wavs"])
        if self.config["general"]["feature_type"] == "melspec":
            val_aux_feats = batch["melspecsaux"]
            feats_name = "melspec"
            loss_encoder = self.criteria_b(enc_out, val_aux_feats)
        elif self.config["general"]["feature_type"] == "vocfeats":
            val_aux_feats = batch["melceps"]
            feats_name = "melcep"
            loss_encoder = self.criteria_b(enc_out, val_aux_feats)
        loss = self.alpha * loss_recons + (1.0 - self.alpha) * loss_encoder
        logger_img_dict = {
            "val_src_melspec": batch["melspecs"],
            "val_pred_{}".format(feats_name): enc_out,
            "val_aux_{}".format(feats_name): val_aux_feats,
        }
        logger_wav_dict = {
            "val_src_wav": batch["wavs"],
            "val_pred_wav": wavsdeg,
            "val_aux_wav": batch["wavsaux"],
        }
        return {
            "val_loss": loss,
            "val_loss_recons": loss_recons,
            "val_loss_encoder": loss_encoder,
            "logger_dict": [logger_img_dict, logger_wav_dict],
        }

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([out["val_loss"] for out in outputs]).mean().item()
        val_loss_recons = (
            torch.stack([out["val_loss_recons"] for out in outputs]).mean().item()
        )
        val_loss_encoder = (
            torch.stack([out["val_loss_encoder"] for out in outputs]).mean().item()
        )
        self.log("val_loss", val_loss, on_epoch=True, prog_bar=True, logger=True)
        self.log(
            "val_loss_recons",
            val_loss_recons,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "val_loss_encoder",
            val_loss_encoder,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][0], data_type="image")
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][1], data_type="audio")

    def test_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        wavsdeg = self.channel(batch["wavsaux"], chfeats)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
            enc_feats_aux = batch["melspecsaux"]
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
            enc_feats_aux = torch.cat(
                (batch["f0s"].unsqueeze(1), batch["melceps"]), dim=1
            )
        recons_wav = self.vocoder(enc_feats_aux).squeeze(1)
        remas = self.vocoder(enc_feats).squeeze(1)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats_input = batch["melspecs"]
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats_input = torch.cat(
                (batch["f0s"].unsqueeze(1), batch["melcepssrc"]), dim=1
            )
        input_recons = self.vocoder(enc_feats_input).squeeze(1)
        if "wavsaux" in batch:
            gt_wav = batch["wavsaux"]
        else:
            gt_wav = None
        return {
            "reconstructed": recons_wav,
            "remastered": remas,
            "channeled": wavsdeg,
            "groundtruth": gt_wav,
            "input": batch["wavs"],
            "input_recons": input_recons,
        }

    def test_epoch_end(self, outputs):
        wav_dir = (
            pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_wavs"
        )
        os.makedirs(wav_dir, exist_ok=True)
        mel_dir = (
            pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_mels"
        )
        os.makedirs(mel_dir, exist_ok=True)
        print("Saving mel spectrogram plots ...")
        for idx, out in enumerate(tqdm.tqdm(outputs)):
            for key in [
                "reconstructed",
                "remastered",
                "channeled",
                "input",
                "input_recons",
                "groundtruth",
            ]:
                if out[key] is not None:
                    torchaudio.save(
                        wav_dir / "{}-{}.wav".format(idx, key),
                        out[key][0, ...].unsqueeze(0).cpu(),
                        sample_rate=self.config["preprocess"]["sampling_rate"],
                        channels_first=True,
                    )
                    plot_and_save_mels(
                        out[key][0, ...].cpu(),
                        mel_dir / "{}-{}.png".format(idx, key),
                        self.config,
                    )
            plot_and_save_mels_all(
                out,
                [
                    "reconstructed",
                    "remastered",
                    "channeled",
                    "input",
                    "input_recons",
                    "groundtruth",
                ],
                mel_dir / "{}-all.png".format(idx),
                self.config,
            )

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.config["train"]["learning_rate"]
        )
        lr_scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode="min", factor=0.5, min_lr=1e-5, verbose=True
            ),
            "interval": "epoch",
            "frequency": 3,
            "monitor": "val_loss",
        }
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}

    def tflogger(self, logger_dict, data_type):
        for lg in self.logger.experiment:
            if type(lg).__name__ == "SummaryWriter":
                tensorboard = lg
        for key in logger_dict.keys():
            manual_logging(
                logger=tensorboard,
                item=logger_dict[key],
                idx=0,
                tag=key,
                global_step=self.global_step,
                data_type=data_type,
                config=self.config,
            )
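Note: PretrainLightningModule optimizes a weighted sum of a waveform reconstruction term (criteria_a, the multi-scale spectral loss) and an encoder feature-matching term (criteria_b). A minimal sketch of the combination with dummy tensors; the shapes and the alpha value are illustrative assumptions:

import torch

criteria_a = torch.nn.L1Loss()  # stand-in for MultiScaleSpectralLoss
criteria_b = torch.nn.L1Loss()
alpha = 0.5

wavsdeg, wavs = torch.randn(2, 8000), torch.randn(2, 8000)  # simulated channel output vs. observed input
enc_out, target_feats = torch.randn(2, 80, 100), torch.randn(2, 80, 100)  # encoder output vs. clean features

loss_recons = criteria_a(wavsdeg, wavs)
loss_encoder = criteria_b(enc_out, target_feats)
loss = alpha * loss_recons + (1.0 - alpha) * loss_encoder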
276 |
+
class SSLBaseModule(pl.LightningModule):
|
277 |
+
def __init__(self, config):
|
278 |
+
super().__init__()
|
279 |
+
self.save_hyperparameters()
|
280 |
+
self.config = config
|
281 |
+
if config["general"]["use_gst"]:
|
282 |
+
self.encoder = EncoderModule(config)
|
283 |
+
self.gst = GSTModule(config)
|
284 |
+
else:
|
285 |
+
self.encoder = EncoderModule(config, use_channel=True)
|
286 |
+
self.channelfeats = ChannelFeatureModule(config)
|
287 |
+
self.channel = ChannelModule(config)
|
288 |
+
|
289 |
+
if config["train"]["load_pretrained"]:
|
290 |
+
pre_model = PretrainLightningModule.load_from_checkpoint(
|
291 |
+
checkpoint_path=config["train"]["pretrained_path"]
|
292 |
+
)
|
293 |
+
self.encoder.load_state_dict(pre_model.encoder.state_dict(), strict=False)
|
294 |
+
self.channel.load_state_dict(pre_model.channel.state_dict(), strict=False)
|
295 |
+
if config["general"]["use_gst"]:
|
296 |
+
self.gst.load_state_dict(pre_model.gst.state_dict(), strict=False)
|
297 |
+
else:
|
298 |
+
self.channelfeats.load_state_dict(
|
299 |
+
pre_model.channelfeats.state_dict(), strict=False
|
300 |
+
)
|
301 |
+
|
302 |
+
self.vocoder = load_vocoder(config)
|
303 |
+
self.criteria = self.get_loss_function(config)
|
304 |
+
|
305 |
+
def training_step(self, batch, batch_idx):
|
306 |
+
raise NotImplementedError()
|
307 |
+
|
308 |
+
def validation_step(self, batch, batch_idx):
|
309 |
+
raise NotImplementedError()
|
310 |
+
|
311 |
+
def validation_epoch_end(self, outputs):
|
312 |
+
raise NotImplementedError()
|
313 |
+
|
314 |
+
def configure_optimizers(self):
|
315 |
+
raise NotImplementedError()
|
316 |
+
|
317 |
+
def get_loss_function(self, config):
|
318 |
+
raise NotImplementedError()
|
319 |
+
|
320 |
+
def forward(self, melspecs, f0s=None):
|
321 |
+
if self.config["general"]["use_gst"]:
|
322 |
+
enc_out = self.encoder(melspecs.unsqueeze(1).transpose(2, 3))
|
323 |
+
chfeats = self.gst(melspecs.transpose(1, 2))
|
324 |
+
else:
|
325 |
+
enc_out, enc_hidden = self.encoder(melspecs.unsqueeze(1).transpose(2, 3))
|
326 |
+
chfeats = self.channelfeats(enc_hidden)
|
327 |
+
enc_out = enc_out.squeeze(1).transpose(1, 2)
|
328 |
+
if self.config["general"]["feature_type"] == "melspec":
|
329 |
+
enc_feats = enc_out
|
330 |
+
elif self.config["general"]["feature_type"] == "vocfeats":
|
331 |
+
enc_feats = torch.cat((f0s.unsqueeze(1), enc_out), dim=1)
|
332 |
+
remas = self.vocoder(enc_feats).squeeze(1)
|
333 |
+
wavsdeg = self.channel(remas, chfeats)
|
334 |
+
return remas, wavsdeg
|
335 |
+
|
336 |
+
def test_step(self, batch, batch_idx):
|
337 |
+
if self.config["general"]["use_gst"]:
|
338 |
+
enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
|
339 |
+
chfeats = self.gst(batch["melspecs"].transpose(1, 2))
|
340 |
+
else:
|
341 |
+
enc_out, enc_hidden = self.encoder(
|
342 |
+
batch["melspecs"].unsqueeze(1).transpose(2, 3)
|
343 |
+
)
|
344 |
+
chfeats = self.channelfeats(enc_hidden)
|
345 |
+
enc_out = enc_out.squeeze(1).transpose(1, 2)
|
346 |
+
if self.config["general"]["feature_type"] == "melspec":
|
347 |
+
enc_feats = enc_out
|
348 |
+
elif self.config["general"]["feature_type"] == "vocfeats":
|
349 |
+
enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
|
350 |
+
remas = self.vocoder(enc_feats).squeeze(1)
|
351 |
+
wavsdeg = self.channel(remas, chfeats)
|
352 |
+
if self.config["general"]["feature_type"] == "melspec":
|
353 |
+
enc_feats_input = batch["melspecs"]
|
354 |
+
elif self.config["general"]["feature_type"] == "vocfeats":
|
355 |
+
enc_feats_input = torch.cat(
|
356 |
+
(batch["f0s"].unsqueeze(1), batch["melcepssrc"]), dim=1
|
357 |
+
)
|
358 |
+
input_recons = self.vocoder(enc_feats_input).squeeze(1)
|
359 |
+
if "wavsaux" in batch:
|
360 |
+
gt_wav = batch["wavsaux"]
|
361 |
+
if self.config["general"]["feature_type"] == "melspec":
|
362 |
+
enc_feats_aux = batch["melspecsaux"]
|
363 |
+
elif self.config["general"]["feature_type"] == "vocfeats":
|
364 |
+
enc_feats_aux = torch.cat(
|
365 |
+
(batch["f0s"].unsqueeze(1), batch["melceps"]), dim=1
|
366 |
+
)
|
367 |
+
recons_wav = self.vocoder(enc_feats_aux).squeeze(1)
|
368 |
+
else:
|
369 |
+
gt_wav = None
|
370 |
+
recons_wav = None
|
371 |
+
return {
|
372 |
+
"reconstructed": recons_wav,
|
373 |
+
"remastered": remas,
|
374 |
+
"channeled": wavsdeg,
|
375 |
+
"input": batch["wavs"],
|
376 |
+
"input_recons": input_recons,
|
377 |
+
"groundtruth": gt_wav,
|
378 |
+
}
|
379 |
+
|
380 |
+
def test_epoch_end(self, outputs):
|
381 |
+
wav_dir = (
|
382 |
+
pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_wavs"
|
383 |
+
)
|
384 |
+
os.makedirs(wav_dir, exist_ok=True)
|
385 |
+
mel_dir = (
|
386 |
+
pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_mels"
|
387 |
+
)
|
388 |
+
os.makedirs(mel_dir, exist_ok=True)
|
389 |
+
print("Saving mel spectrogram plots ...")
|
390 |
+
for idx, out in enumerate(tqdm.tqdm(outputs)):
|
391 |
+
plot_keys = []
|
392 |
+
for key in [
|
393 |
+
"reconstructed",
|
394 |
+
"remastered",
|
395 |
+
"channeled",
|
396 |
+
"input",
|
397 |
+
"input_recons",
|
398 |
+
"groundtruth",
|
399 |
+
]:
|
400 |
+
if out[key] != None:
|
401 |
+
plot_keys.append(key)
|
402 |
+
torchaudio.save(
|
403 |
+
wav_dir / "{}-{}.wav".format(idx, key),
|
404 |
+
out[key][0, ...].unsqueeze(0).cpu(),
|
405 |
+
sample_rate=self.config["preprocess"]["sampling_rate"],
|
406 |
+
channels_first=True,
|
407 |
+
)
|
408 |
+
plot_and_save_mels(
|
409 |
+
out[key][0, ...].cpu(),
|
410 |
+
mel_dir / "{}-{}.png".format(idx, key),
|
411 |
+
self.config,
|
412 |
+
)
|
413 |
+
plot_and_save_mels_all(
|
414 |
+
out,
|
415 |
+
plot_keys,
|
416 |
+
mel_dir / "{}-all.png".format(idx),
|
417 |
+
self.config,
|
        )

    def tflogger(self, logger_dict, data_type):
        for lg in self.logger.experiment:
            if type(lg).__name__ == "SummaryWriter":
                tensorboard = lg
        for key in logger_dict.keys():
            manual_logging(
                logger=tensorboard,
                item=logger_dict[key],
                idx=0,
                tag=key,
                global_step=self.global_step,
                data_type=data_type,
                config=self.config,
            )


class SSLStepLightningModule(SSLBaseModule):
    def __init__(self, config):
        super().__init__(config)
        if config["train"]["fix_channel"]:
            for param in self.channel.parameters():
                param.requires_grad = False

    def training_step(self, batch, batch_idx, optimizer_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
        remas = self.vocoder(enc_feats).squeeze(1)
        wavsdeg = self.channel(remas, chfeats)
        loss = self.criteria(wavsdeg, batch["wavs"])
        self.log(
            "train_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
            feats_name = "melspec"
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
            feats_name = "melcep"
        remas = self.vocoder(enc_feats).squeeze(1)
        wavsdeg = self.channel(remas, chfeats)
        loss = self.criteria(wavsdeg, batch["wavs"])
        logger_img_dict = {
            "val_src_melspec": batch["melspecs"],
            "val_pred_{}".format(feats_name): enc_out,
        }
        for auxfeats in ["melceps", "melspecsaux"]:
            if auxfeats in batch:
                logger_img_dict["val_aux_{}".format(auxfeats)] = batch[auxfeats]
        logger_wav_dict = {
            "val_src_wav": batch["wavs"],
            "val_remastered_wav": remas,
            "val_pred_wav": wavsdeg,
        }
        if "wavsaux" in batch:
            logger_wav_dict["val_aux_wav"] = batch["wavsaux"]
        d_out = {"val_loss": loss, "logger_dict": [logger_img_dict, logger_wav_dict]}
        return d_out

    def validation_epoch_end(self, outputs):
        self.log(
            "val_loss",
            torch.stack([out["val_loss"] for out in outputs]).mean().item(),
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][0], data_type="image")
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][1], data_type="audio")

    def optimizer_step(
        self,
        epoch,
        batch_idx,
        optimizer,
        optimizer_idx,
        optimizer_closure,
        on_tpu=False,
        using_native_amp=False,
        using_lbfgs=False,
    ):
        if epoch < self.config["train"]["epoch_channel"]:
            if optimizer_idx == 0:
                optimizer.step(closure=optimizer_closure)
            elif optimizer_idx == 1:
                optimizer_closure()
        else:
            if optimizer_idx == 0:
                optimizer_closure()
            elif optimizer_idx == 1:
                optimizer.step(closure=optimizer_closure)

    def configure_optimizers(self):
        if self.config["train"]["fix_channel"]:
            if self.config["general"]["use_gst"]:
                optimizer_channel = torch.optim.Adam(
                    self.gst.parameters(), lr=self.config["train"]["learning_rate"]
                )
            else:
                optimizer_channel = torch.optim.Adam(
                    self.channelfeats.parameters(),
                    lr=self.config["train"]["learning_rate"],
                )
            optimizer_encoder = torch.optim.Adam(
                self.encoder.parameters(), lr=self.config["train"]["learning_rate"]
            )
        else:
            if self.config["general"]["use_gst"]:
                optimizer_channel = torch.optim.Adam(
                    [
                        {"params": self.channel.parameters()},
                        {"params": self.gst.parameters()},
                    ],
                    lr=self.config["train"]["learning_rate"],
                )
            else:
                optimizer_channel = torch.optim.Adam(
                    [
                        {"params": self.channel.parameters()},
                        {"params": self.channelfeats.parameters()},
                    ],
                    lr=self.config["train"]["learning_rate"],
                )
            optimizer_encoder = torch.optim.Adam(
                self.encoder.parameters(), lr=self.config["train"]["learning_rate"]
            )
        optimizers = [optimizer_channel, optimizer_encoder]
        schedulers = [
            {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizers[0], mode="min", factor=0.5, min_lr=1e-5, verbose=True
                ),
                "interval": "epoch",
                "frequency": 3,
                "monitor": "val_loss",
            },
            {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizers[1], mode="min", factor=0.5, min_lr=1e-5, verbose=True
                ),
                "interval": "epoch",
                "frequency": 3,
                "monitor": "val_loss",
            },
        ]
        return optimizers, schedulers

    def get_loss_function(self, config):
        return MultiScaleSpectralLoss(config)
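
# Scheduling sketch (illustrative comments, not executed): with
# optimizers = [optimizer_channel, optimizer_encoder] as configured above,
# optimizer_step alternates the two parameter groups by epoch:
#
#     if epoch < config["train"]["epoch_channel"]:
#         optimizer_channel.step()   # encoder closure evaluated, not stepped
#     else:
#         optimizer_encoder.step()   # channel closure evaluated, not stepped

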
class SSLDualLightningModule(SSLBaseModule):
    def __init__(self, config):
        super().__init__(config)
        if config["train"]["fix_channel"]:
            for param in self.channel.parameters():
                param.requires_grad = False
        self.spec_module = torchaudio.transforms.MelSpectrogram(
            sample_rate=config["preprocess"]["sampling_rate"],
            n_fft=config["preprocess"]["fft_length"],
            win_length=config["preprocess"]["frame_length"],
            hop_length=config["preprocess"]["frame_shift"],
            f_min=config["preprocess"]["fmin"],
            f_max=config["preprocess"]["fmax"],
            n_mels=config["preprocess"]["n_mels"],
            power=1,
            center=True,
            norm="slaney",
            mel_scale="slaney",
        )
        self.beta = config["train"]["beta"]
        self.criteria_a, self.criteria_b = self.get_loss_function(config)

    def training_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
        remas = self.vocoder(enc_feats).squeeze(1)
        wavsdeg = self.channel(remas, chfeats)
        loss_recons = self.criteria_a(wavsdeg, batch["wavs"])

        with torch.no_grad():
            wavsdegtask = self.channel(batch["wavstask"], chfeats)
        melspecstask = self.calc_spectrogram(wavsdegtask)
        if self.config["general"]["use_gst"]:
            enc_out_task = self.encoder(melspecstask.unsqueeze(1).transpose(2, 3))
        else:
            enc_out_task, _ = self.encoder(melspecstask.unsqueeze(1).transpose(2, 3))
        enc_out_task = enc_out_task.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            loss_task = self.criteria_b(enc_out_task, batch["melspecstask"])
        elif self.config["general"]["feature_type"] == "vocfeats":
            loss_task = self.criteria_b(enc_out_task, batch["melcepstask"])
        loss = self.beta * loss_recons + (1 - self.beta) * loss_task

        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        self.log(
            "train_loss_recons",
            loss_recons,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "train_loss_task",
            loss_task,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
            feats_name = "melspec"
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
            feats_name = "melcep"
        remas = self.vocoder(enc_feats).squeeze(1)
        wavsdeg = self.channel(remas, chfeats)
        loss_recons = self.criteria_a(wavsdeg, batch["wavs"])

        wavsdegtask = self.channel(batch["wavstask"], chfeats)
        melspecstask = self.calc_spectrogram(wavsdegtask)
        if self.config["general"]["use_gst"]:
            enc_out_task = self.encoder(melspecstask.unsqueeze(1).transpose(2, 3))
        else:
            enc_out_task, _ = self.encoder(melspecstask.unsqueeze(1).transpose(2, 3))
        enc_out_task = enc_out_task.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_out_task_truth = batch["melspecstask"]
            loss_task = self.criteria_b(enc_out_task, enc_out_task_truth)
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_out_task_truth = batch["melcepstask"]
            loss_task = self.criteria_b(enc_out_task, enc_out_task_truth)
        loss = self.beta * loss_recons + (1 - self.beta) * loss_task

        logger_img_dict = {
            "val_src_melspec": batch["melspecs"],
            "val_pred_{}".format(feats_name): enc_out,
            "val_truth_{}_task".format(feats_name): enc_out_task_truth,
            "val_pred_{}_task".format(feats_name): enc_out_task,
        }
        for auxfeats in ["melceps", "melspecsaux"]:
            if auxfeats in batch:
                logger_img_dict["val_aux_{}".format(auxfeats)] = batch[auxfeats]
        logger_wav_dict = {
            "val_src_wav": batch["wavs"],
            "val_remastered_wav": remas,
            "val_pred_wav": wavsdeg,
            "val_truth_wavtask": batch["wavstask"],
            "val_deg_wavtask": wavsdegtask,
        }
        if "wavsaux" in batch:
            logger_wav_dict["val_aux_wav"] = batch["wavsaux"]

        d_out = {
            "val_loss": loss,
            "val_loss_recons": loss_recons,
            "val_loss_task": loss_task,
            "logger_dict": [logger_img_dict, logger_wav_dict],
        }
        return d_out

    def validation_epoch_end(self, outputs):
        self.log(
            "val_loss",
            torch.stack([out["val_loss"] for out in outputs]).mean().item(),
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "val_loss_recons",
            torch.stack([out["val_loss_recons"] for out in outputs]).mean().item(),
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "val_loss_task",
            torch.stack([out["val_loss_task"] for out in outputs]).mean().item(),
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][0], data_type="image")
        self.tflogger(logger_dict=outputs[-1]["logger_dict"][1], data_type="audio")

    def test_step(self, batch, batch_idx):
        if self.config["general"]["use_gst"]:
            enc_out = self.encoder(batch["melspecs"].unsqueeze(1).transpose(2, 3))
            chfeats = self.gst(batch["melspecs"].transpose(1, 2))
        else:
            enc_out, enc_hidden = self.encoder(
                batch["melspecs"].unsqueeze(1).transpose(2, 3)
            )
            chfeats = self.channelfeats(enc_hidden)
        enc_out = enc_out.squeeze(1).transpose(1, 2)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats = enc_out
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats = torch.cat((batch["f0s"].unsqueeze(1), enc_out), dim=1)
        remas = self.vocoder(enc_feats).squeeze(1)
        wavsdeg = self.channel(remas, chfeats)
        if self.config["general"]["feature_type"] == "melspec":
            enc_feats_input = batch["melspecs"]
        elif self.config["general"]["feature_type"] == "vocfeats":
            enc_feats_input = torch.cat(
                (batch["f0s"].unsqueeze(1), batch["melcepssrc"]), dim=1
            )
        input_recons = self.vocoder(enc_feats_input).squeeze(1)

        wavsdegtask = self.channel(batch["wavstask"], chfeats)
        if "wavsaux" in batch:
            gt_wav = batch["wavsaux"]
            if self.config["general"]["feature_type"] == "melspec":
                enc_feats_aux = batch["melspecsaux"]
            elif self.config["general"]["feature_type"] == "vocfeats":
                enc_feats_aux = torch.cat(
                    (batch["f0s"].unsqueeze(1), batch["melceps"]), dim=1
                )
            recons_wav = self.vocoder(enc_feats_aux).squeeze(1)
        else:
            gt_wav = None
            recons_wav = None
        return {
            "reconstructed": recons_wav,
            "remastered": remas,
            "channeled": wavsdeg,
            "channeled_task": wavsdegtask,
            "input": batch["wavs"],
            "input_recons": input_recons,
            "groundtruth": gt_wav,
        }

    def test_epoch_end(self, outputs):
        wav_dir = (
            pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_wavs"
        )
        os.makedirs(wav_dir, exist_ok=True)
        mel_dir = (
            pathlib.Path(self.logger.experiment[0].log_dir).parent.parent / "test_mels"
        )
        os.makedirs(mel_dir, exist_ok=True)
        print("Saving mel spectrogram plots ...")
        for idx, out in enumerate(tqdm.tqdm(outputs)):
            plot_keys = []
            for key in [
                "reconstructed",
                "remastered",
                "channeled",
                "channeled_task",
                "input",
                "input_recons",
                "groundtruth",
            ]:
                if out[key] is not None:
                    plot_keys.append(key)
                    torchaudio.save(
                        wav_dir / "{}-{}.wav".format(idx, key),
                        out[key][0, ...].unsqueeze(0).cpu(),
                        sample_rate=self.config["preprocess"]["sampling_rate"],
                        channels_first=True,
                    )
                    plot_and_save_mels(
                        out[key][0, ...].cpu(),
                        mel_dir / "{}-{}.png".format(idx, key),
                        self.config,
                    )
            plot_and_save_mels_all(
                out,
                plot_keys,
                mel_dir / "{}-all.png".format(idx),
                self.config,
            )

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.config["train"]["learning_rate"]
        )
        lr_scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode="min", factor=0.5, min_lr=1e-5, verbose=True
            ),
            "interval": "epoch",
            "frequency": 3,
            "monitor": "val_loss",
        }
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}

    def calc_spectrogram(self, wav):
        specs = self.spec_module(wav)
        log_spec = torch.log(
            torch.clamp_min(specs, self.config["preprocess"]["min_magnitude"])
            * self.config["preprocess"]["comp_factor"]
        ).to(torch.float32)
        return log_spec

    def get_loss_function(self, config):
        if config["train"]["feature_loss"]["type"] == "mae":
            feature_loss = torch.nn.L1Loss()
        else:
            feature_loss = torch.nn.MSELoss()
        return MultiScaleSpectralLoss(config), feature_loss
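
# Objective sketch (illustrative): SSLDualLightningModule optimizes a convex
# combination of the two losses above, with beta = config["train"]["beta"]:
#
#     loss = beta * loss_recons + (1 - beta) * loss_task
#
# loss_recons is the multi-scale spectral loss between the re-degraded output
# and the input recording; loss_task is the L1/MSE feature loss on the clean
# task branch.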
model.py
ADDED
@@ -0,0 +1,854 @@
import torch
import torch.nn as nn
import torchaudio
import torch.nn.functional as F
import torch.nn.init as init
import numpy as np


class EncoderModule(nn.Module):
    """
    Analysis module based on 2D conv U-Net
    Inspired by https://github.com/haoheliu/voicefixer

    Args:
        config (dict): config
        use_channel (bool): output channel feature or not
    """
    def __init__(self, config, use_channel=False):
        super().__init__()

        self.channels = 1
        self.use_channel = use_channel
        self.downsample_ratio = 2 ** 4

        self.down_block1 = DownBlockRes2D(
            in_channels=self.channels,
            out_channels=32,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.down_block2 = DownBlockRes2D(
            in_channels=32,
            out_channels=64,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.down_block3 = DownBlockRes2D(
            in_channels=64,
            out_channels=128,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.down_block4 = DownBlockRes2D(
            in_channels=128,
            out_channels=256,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.conv_block5 = ConvBlockRes2D(
            in_channels=256,
            out_channels=256,
            size=3,
            activation="relu",
            momentum=0.01,
        )
        self.up_block1 = UpBlockRes2D(
            in_channels=256,
            out_channels=256,
            stride=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.up_block2 = UpBlockRes2D(
            in_channels=256,
            out_channels=128,
            stride=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.up_block3 = UpBlockRes2D(
            in_channels=128,
            out_channels=64,
            stride=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.up_block4 = UpBlockRes2D(
            in_channels=64,
            out_channels=32,
            stride=(2, 2),
            activation="relu",
            momentum=0.01,
        )

        self.after_conv_block1 = ConvBlockRes2D(
            in_channels=32,
            out_channels=32,
            size=3,
            activation="relu",
            momentum=0.01,
        )

        self.after_conv2 = nn.Conv2d(
            in_channels=32,
            out_channels=1,
            kernel_size=(1, 1),
            stride=(1, 1),
            padding=(0, 0),
            bias=True,
        )

        if config["general"]["feature_type"] == "melspec":
            out_dim = config["preprocess"]["n_mels"]
        elif config["general"]["feature_type"] == "vocfeats":
            out_dim = config["preprocess"]["cep_order"] + 1
        else:
            raise NotImplementedError()

        self.after_linear = nn.Linear(
            in_features=80,
            out_features=out_dim,
            bias=True,
        )

        if self.use_channel:
            self.conv_channel = ConvBlockRes2D(
                in_channels=256,
                out_channels=256,
                size=3,
                activation="relu",
                momentum=0.01,
            )

    def forward(self, x):
        """
        Forward

        Args:
            mel spectrogram: (batch, 1, time, freq)

        Return:
            speech feature (mel spectrogram or mel cepstrum): (batch, 1, time, freq)
            input of channel feature module (batch, 256, time, freq)
        """

        origin_len = x.shape[2]
        pad_len = (
            int(np.ceil(x.shape[2] / self.downsample_ratio)) * self.downsample_ratio
            - origin_len
        )
        x = F.pad(x, pad=(0, 0, 0, pad_len))
        x = x[..., 0 : x.shape[-1] - 1]

        (x1_pool, x1) = self.down_block1(x)
        (x2_pool, x2) = self.down_block2(x1_pool)
        (x3_pool, x3) = self.down_block3(x2_pool)
        (x4_pool, x4) = self.down_block4(x3_pool)
        x_center = self.conv_block5(x4_pool)
        x5 = self.up_block1(x_center, x4)
        x6 = self.up_block2(x5, x3)
        x7 = self.up_block3(x6, x2)
        x8 = self.up_block4(x7, x1)
        x = self.after_conv_block1(x8)
        x = self.after_conv2(x)

        x = F.pad(x, pad=(0, 1))
        x = x[:, :, 0:origin_len, :]

        x = self.after_linear(x)

        if self.use_channel:
            x_channel = self.conv_channel(x4_pool)
            return x, x_channel
        else:
            return x
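
# Shape sketch (illustrative; mirrors how lightning_module.py calls the
# encoder, with a hypothetical config):
#
#     enc = EncoderModule(config)
#     x = mel.unsqueeze(1).transpose(2, 3)       # mel: (batch, n_mels, time)
#     feats = enc(x)                             # (batch, 1, time, out_dim)
#     feats = feats.squeeze(1).transpose(1, 2)   # (batch, out_dim, time)

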
class ChannelModule(nn.Module):
    """
    Channel module based on 1D conv U-Net

    Args:
        config (dict): config
    """
    def __init__(self, config):
        super().__init__()

        self.channels = 1
        self.downsample_ratio = 2 ** 6  # This number equals 2^{#encoder_blocks}

        self.down_block1 = DownBlockRes1D(
            in_channels=self.channels,
            out_channels=32,
            downsample=2,
            activation="relu",
            momentum=0.01,
        )
        self.down_block2 = DownBlockRes1D(
            in_channels=32,
            out_channels=64,
            downsample=2,
            activation="relu",
            momentum=0.01,
        )
        self.down_block3 = DownBlockRes1D(
            in_channels=64,
            out_channels=128,
            downsample=2,
            activation="relu",
            momentum=0.01,
        )
        self.down_block4 = DownBlockRes1D(
            in_channels=128,
            out_channels=256,
            downsample=2,
            activation="relu",
            momentum=0.01,
        )
        self.down_block5 = DownBlockRes1D(
            in_channels=256,
            out_channels=512,
            downsample=2,
            activation="relu",
            momentum=0.01,
        )
        self.conv_block6 = ConvBlockRes1D(
            in_channels=512,
            out_channels=384,
            size=3,
            activation="relu",
            momentum=0.01,
        )
        self.up_block1 = UpBlockRes1D(
            in_channels=512,
            out_channels=512,
            stride=2,
            activation="relu",
            momentum=0.01,
        )
        self.up_block2 = UpBlockRes1D(
            in_channels=512,
            out_channels=256,
            stride=2,
            activation="relu",
            momentum=0.01,
        )
        self.up_block3 = UpBlockRes1D(
            in_channels=256,
            out_channels=128,
            stride=2,
            activation="relu",
            momentum=0.01,
        )
        self.up_block4 = UpBlockRes1D(
            in_channels=128,
            out_channels=64,
            stride=2,
            activation="relu",
            momentum=0.01,
        )
        self.up_block5 = UpBlockRes1D(
            in_channels=64,
            out_channels=32,
            stride=2,
            activation="relu",
            momentum=0.01,
        )

        self.after_conv_block1 = ConvBlockRes1D(
            in_channels=32,
            out_channels=32,
            size=3,
            activation="relu",
            momentum=0.01,
        )

        self.after_conv2 = nn.Conv1d(
            in_channels=32,
            out_channels=1,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

    def forward(self, x, h):
        """
        Forward

        Args:
            clean waveform: (batch, n_channel (1), time)
            channel feature: (batch, feature_dim)
        Outputs:
            degraded waveform: (batch, n_channel (1), time)
        """
        x = x.unsqueeze(1)

        origin_len = x.shape[2]
        pad_len = (
            int(np.ceil(x.shape[2] / self.downsample_ratio)) * self.downsample_ratio
            - origin_len
        )
        x = F.pad(x, pad=(0, pad_len))
        x = x[..., 0 : x.shape[-1] - 1]

        (x1_pool, x1) = self.down_block1(x)
        (x2_pool, x2) = self.down_block2(x1_pool)
        (x3_pool, x3) = self.down_block3(x2_pool)
        (x4_pool, x4) = self.down_block4(x3_pool)
        (x5_pool, x5) = self.down_block5(x4_pool)
        x_center = self.conv_block6(x5_pool)
        x_concat = torch.cat(
            (x_center, h.unsqueeze(2).expand(-1, -1, x_center.size(2))), dim=1
        )
        x6 = self.up_block1(x_concat, x5)
        x7 = self.up_block2(x6, x4)
        x8 = self.up_block3(x7, x3)
        x9 = self.up_block4(x8, x2)
        x10 = self.up_block5(x9, x1)
        x = self.after_conv_block1(x10)
        x = self.after_conv2(x)

        x = F.pad(x, pad=(0, 1))
        x = x[..., 0:origin_len]

        return x.squeeze(1)
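
# Usage sketch (illustrative, hypothetical config): the channel module maps a
# clean waveform plus a per-utterance channel feature h to a degraded waveform.
#
#     channel = ChannelModule(config)
#     wavs_deg = channel(wavs, h)   # wavs: (batch, time), h: (batch, 128)
#
# h is broadcast over time and concatenated with the 384-channel bottleneck,
# matching the 512 input channels of up_block1.

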
class ChannelFeatureModule(nn.Module):
    """
    Channel feature module based on 2D convolution layers

    Args:
        config (dict): config
    """
    def __init__(self, config):
        super().__init__()
        self.conv_blocks_in = ConvBlockRes2D(
            in_channels=256,
            out_channels=512,
            size=3,
            activation="relu",
            momentum=0.01,
        )
        self.down_block1 = DownBlockRes2D(
            in_channels=512,
            out_channels=256,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.down_block2 = DownBlockRes2D(
            in_channels=256,
            out_channels=256,
            downsample=(2, 2),
            activation="relu",
            momentum=0.01,
        )
        self.conv_block_out = ConvBlockRes2D(
            in_channels=256,
            out_channels=128,
            size=3,
            activation="relu",
            momentum=0.01,
        )
        self.avgpool2d = torch.nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """
        Forward

        Args:
            output of analysis module: (batch, 256, time, freq)

        Return:
            channel feature: (batch, feature_dim)
        """
        x = self.conv_blocks_in(x)
        x, _ = self.down_block1(x)
        x, _ = self.down_block2(x)
        x = self.conv_block_out(x)
        x = self.avgpool2d(x)
        x = x.squeeze(3).squeeze(2)
        return x
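
# Sketch (illustrative): together with EncoderModule(use_channel=True), the
# channel feature is read off the U-Net bottleneck:
#
#     feats, hidden = encoder(x)    # hidden: (batch, 256, time', freq')
#     h = channelfeats(hidden)      # h: (batch, 128)

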
class ConvBlockRes2D(nn.Module):
    def __init__(self, in_channels, out_channels, size, activation, momentum):
        super().__init__()

        self.activation = activation
        if isinstance(size, tuple):
            pad = size[0] // 2
            size = size[0]
        else:
            pad = size // 2

        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(size, size),
            stride=(1, 1),
            dilation=(1, 1),
            padding=(pad, pad),
            bias=False,
        )

        self.bn1 = nn.BatchNorm2d(in_channels, momentum=momentum)

        self.conv2 = nn.Conv2d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=(size, size),
            stride=(1, 1),
            dilation=(1, 1),
            padding=(pad, pad),
            bias=False,
        )

        self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum)

        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(1, 1),
                stride=(1, 1),
                padding=(0, 0),
            )
            self.is_shortcut = True
        else:
            self.is_shortcut = False

    def forward(self, x):
        origin = x
        x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01))
        x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01))

        if self.is_shortcut:
            return self.shortcut(origin) + x
        else:
            return origin + x


class ConvBlockRes1D(nn.Module):
    def __init__(self, in_channels, out_channels, size, activation, momentum):
        super().__init__()

        self.activation = activation
        pad = size // 2

        self.conv1 = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=size,
            stride=1,
            dilation=1,
            padding=pad,
            bias=False,
        )

        self.bn1 = nn.BatchNorm1d(in_channels, momentum=momentum)

        self.conv2 = nn.Conv1d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=size,
            stride=1,
            dilation=1,
            padding=pad,
            bias=False,
        )

        self.bn2 = nn.BatchNorm1d(out_channels, momentum=momentum)

        if in_channels != out_channels:
            self.shortcut = nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
            )
            self.is_shortcut = True
        else:
            self.is_shortcut = False

    def forward(self, x):
        origin = x
        x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01))
        x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01))

        if self.is_shortcut:
            return self.shortcut(origin) + x
        else:
            return origin + x


class DownBlockRes2D(nn.Module):
    def __init__(self, in_channels, out_channels, downsample, activation, momentum):
        super().__init__()
        size = 3

        self.conv_block1 = ConvBlockRes2D(
            in_channels, out_channels, size, activation, momentum
        )
        self.conv_block2 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )
        self.avg_pool2d = torch.nn.AvgPool2d(downsample)

    def forward(self, x):
        encoder = self.conv_block1(x)
        encoder = self.conv_block2(encoder)
        encoder = self.conv_block3(encoder)
        encoder = self.conv_block4(encoder)
        encoder_pool = self.avg_pool2d(encoder)
        return encoder_pool, encoder


class DownBlockRes1D(nn.Module):
    def __init__(self, in_channels, out_channels, downsample, activation, momentum):
        super().__init__()
        size = 3

        self.conv_block1 = ConvBlockRes1D(
            in_channels, out_channels, size, activation, momentum
        )
        self.conv_block2 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )
        self.avg_pool1d = torch.nn.AvgPool1d(downsample)

    def forward(self, x):
        encoder = self.conv_block1(x)
        encoder = self.conv_block2(encoder)
        encoder = self.conv_block3(encoder)
        encoder = self.conv_block4(encoder)
        encoder_pool = self.avg_pool1d(encoder)
        return encoder_pool, encoder


class UpBlockRes2D(nn.Module):
    def __init__(self, in_channels, out_channels, stride, activation, momentum):
        super().__init__()
        size = 3
        self.activation = activation

        self.conv1 = torch.nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(size, size),
            stride=stride,
            padding=(0, 0),
            output_padding=(0, 0),
            bias=False,
            dilation=(1, 1),
        )

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv_block2 = ConvBlockRes2D(
            out_channels * 2, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block5 = ConvBlockRes2D(
            out_channels, out_channels, size, activation, momentum
        )

    def prune(self, x, both=False):
        """Prune the shape of x after transpose convolution."""
        if both:
            x = x[:, :, 0:-1, 0:-1]
        else:
            x = x[:, :, 0:-1, :]
        return x

    def forward(self, input_tensor, concat_tensor, both=False):
        x = self.conv1(F.relu_(self.bn1(input_tensor)))
        x = self.prune(x, both=both)
        x = torch.cat((x, concat_tensor), dim=1)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        return x


class UpBlockRes1D(nn.Module):
    def __init__(self, in_channels, out_channels, stride, activation, momentum):
        super().__init__()
        size = 3
        self.activation = activation

        self.conv1 = torch.nn.ConvTranspose1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=size,
            stride=stride,
            padding=0,
            output_padding=0,
            bias=False,
            dilation=1,
        )

        self.bn1 = nn.BatchNorm1d(in_channels)
        self.conv_block2 = ConvBlockRes1D(
            out_channels * 2, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block5 = ConvBlockRes1D(
            out_channels, out_channels, size, activation, momentum
        )

    def prune(self, x):
        """Prune the shape of x after transpose convolution."""
        x = x[:, 0:-1, :]
        return x

    def forward(self, input_tensor, concat_tensor):
        x = self.conv1(F.relu_(self.bn1(input_tensor)))
        # Pruning is not needed here: ChannelModule trims its padded input by
        # one sample, so the transpose-conv output already matches
        # concat_tensor in length.
        # x = self.prune(x)
        x = torch.cat((x, concat_tensor), dim=1)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        return x


class MultiScaleSpectralLoss(nn.Module):
    """
    Multi scale spectral loss
    https://openreview.net/forum?id=B1x1ma4tDr

    Args:
        config (dict): config
    """
    def __init__(self, config):
        super().__init__()
        try:
            self.use_linear = config["train"]["multi_scale_loss"]["use_linear"]
            self.gamma = config["train"]["multi_scale_loss"]["gamma"]
        except KeyError:
            self.use_linear = False

        self.fft_sizes = [2048, 512, 256, 128, 64]
        self.spectrograms = []
        for fftsize in self.fft_sizes:
            self.spectrograms.append(
                torchaudio.transforms.Spectrogram(
                    n_fft=fftsize, hop_length=fftsize // 4, power=2
                )
            )
        self.spectrograms = nn.ModuleList(self.spectrograms)
        self.criteria = nn.L1Loss()
        self.eps = 1e-10

    def forward(self, wav_out, wav_target):
        """
        Forward

        Args:
            wav_out: output of channel module (batch, time)
            wav_target: input degraded waveform (batch, time)

        Return:
            loss
        """
        loss = 0.0
        length = min(wav_out.size(1), wav_target.size(1))
        for spectrogram in self.spectrograms:
            S_out = spectrogram(wav_out[..., :length])
            S_target = spectrogram(wav_target[..., :length])
            log_S_out = torch.log(S_out + self.eps)
            log_S_target = torch.log(S_target + self.eps)
            if self.use_linear:
                loss += self.criteria(S_out, S_target) + self.gamma * self.criteria(
                    log_S_out, log_S_target
                )
            else:
                loss += self.criteria(log_S_out, log_S_target)
        return loss
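
# Usage sketch (illustrative, hypothetical config): the loss compares
# log-magnitude spectrograms at FFT sizes 2048/512/256/128/64 and sums the L1
# distances; with use_linear it also adds the linear-magnitude L1 term,
# weighting the log term by gamma.
#
#     criterion = MultiScaleSpectralLoss(config)
#     loss = criterion(wav_pred, wav_target)   # both (batch, time)

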
class ReferenceEncoder(nn.Module):
    def __init__(
        self, idim=80, ref_enc_filters=[32, 32, 64, 64, 128, 128], ref_dim=128
    ):
        super().__init__()
        K = len(ref_enc_filters)
        filters = [1] + ref_enc_filters

        convs = [
            nn.Conv2d(
                in_channels=filters[i],
                out_channels=filters[i + 1],
                kernel_size=(3, 3),
                stride=(2, 2),
                padding=(1, 1),
            )
            for i in range(K)
        ]
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(num_features=ref_enc_filters[i]) for i in range(K)]
        )

        out_channels = self.calculate_channels(idim, 3, 2, 1, K)

        self.gru = nn.GRU(
            input_size=ref_enc_filters[-1] * out_channels,
            hidden_size=ref_dim,
            batch_first=True,
        )
        self.n_mel_channels = idim

    def forward(self, inputs):
        out = inputs.view(inputs.size(0), 1, -1, self.n_mel_channels)
        for conv, bn in zip(self.convs, self.bns):
            out = conv(out)
            out = bn(out)
            out = F.relu(out)

        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        N, T = out.size(0), out.size(1)
        out = out.contiguous().view(N, T, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()

        _, out = self.gru(out)

        return out.squeeze(0)

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        for _ in range(n_convs):
            L = (L - kernel_size + 2 * pad) // stride + 1
        return L


class STL(nn.Module):
    def __init__(self, ref_dim=128, num_heads=4, token_num=10, token_dim=128):
        super().__init__()
        self.embed = nn.Parameter(torch.FloatTensor(token_num, token_dim // num_heads))
        d_q = ref_dim
        d_k = token_dim // num_heads
        self.attention = MultiHeadAttention(
            query_dim=d_q, key_dim=d_k, num_units=token_dim, num_heads=num_heads
        )
        init.normal_(self.embed, mean=0, std=0.5)

    def forward(self, inputs):
        N = inputs.size(0)
        query = inputs.unsqueeze(1)
        keys = (
            torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)
        )  # [N, token_num, token_embedding_size // num_heads]
        style_embed = self.attention(query, keys)
        return style_embed


class MultiHeadAttention(nn.Module):
    """
    Multi head attention
    https://github.com/KinglittleQ/GST-Tacotron

    """
    def __init__(self, query_dim, key_dim, num_units, num_heads):
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = nn.Linear(
            in_features=query_dim, out_features=num_units, bias=False
        )
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(
            in_features=key_dim, out_features=num_units, bias=False
        )

    def forward(self, query, key):
        """
        Forward

        Args:
            query: (batch, T_q, query_dim)
            key: (batch, T_k, key_dim)

        Return:
            out: (N, T_q, num_units)
        """
        querys = self.W_query(query)  # [N, T_q, num_units]

        keys = self.W_key(key)  # [N, T_k, num_units]
        values = self.W_value(key)

        split_size = self.num_units // self.num_heads
        querys = torch.stack(
            torch.split(querys, split_size, dim=2), dim=0
        )  # [h, N, T_q, num_units/h]
        keys = torch.stack(
            torch.split(keys, split_size, dim=2), dim=0
        )  # [h, N, T_k, num_units/h]
        values = torch.stack(
            torch.split(values, split_size, dim=2), dim=0
        )  # [h, N, T_k, num_units/h]

        # score = softmax(QK^T / (d_k ** 0.5))
        scores = torch.matmul(querys, keys.transpose(2, 3))  # [h, N, T_q, T_k]
        scores = scores / (self.key_dim ** 0.5)
        scores = F.softmax(scores, dim=3)

        # out = score * V
        out = torch.matmul(scores, values)  # [h, N, T_q, num_units/h]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(
            0
        )  # [N, T_q, num_units]

        return out


class GSTModule(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder_post = ReferenceEncoder(
            idim=config["preprocess"]["n_mels"],
            ref_dim=256,
        )
        self.stl = STL(ref_dim=256, num_heads=8, token_num=10, token_dim=128)

    def forward(self, inputs):
        acoustic_embed = self.encoder_post(inputs)
        style_embed = self.stl(acoustic_embed)
        return style_embed.squeeze(1)
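
if __name__ == "__main__":
    # Minimal shape check (an illustrative sketch, not used by training):
    # GSTModule maps batched mel spectrograms (batch, time, n_mels) to one
    # 128-dim style/channel embedding per utterance.
    _config = {"preprocess": {"n_mels": 80}}  # hypothetical minimal config
    gst = GSTModule(_config)
    dummy_mels = torch.randn(2, 240, 80)
    print(gst(dummy_mels).shape)  # torch.Size([2, 128])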
preprocess.py
ADDED
@@ -0,0 +1,152 @@
import numpy as np
import os
import librosa
import tqdm
import pickle
import random
import argparse
import yaml
import pathlib


def get_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", required=True, type=pathlib.Path)
    parser.add_argument("--corpus_type", default=None, type=str)
    parser.add_argument("--source_path", default=None, type=pathlib.Path)
    parser.add_argument("--source_path_task", default=None, type=pathlib.Path)
    parser.add_argument("--aux_path", default=None, type=pathlib.Path)
    parser.add_argument("--preprocessed_path", default=None, type=pathlib.Path)
    parser.add_argument("--n_train", default=None, type=int)
    parser.add_argument("--n_val", default=None, type=int)
    parser.add_argument("--n_test", default=None, type=int)
    return parser.parse_args()


def preprocess(config):

    # configs
    preprocessed_dir = pathlib.Path(config["general"]["preprocessed_path"])
    n_train = config["preprocess"]["n_train"]
    n_val = config["preprocess"]["n_val"]
    n_test = config["preprocess"]["n_test"]
    SR = config["preprocess"]["sampling_rate"]

    os.makedirs(preprocessed_dir, exist_ok=True)

    sourcepath = pathlib.Path(config["general"]["source_path"])

    if config["general"]["corpus_type"] == "single":
        fulllist = list(sourcepath.glob("*.wav"))
        random.seed(0)
        random.shuffle(fulllist)
        train_filelist = fulllist[:n_train]
        val_filelist = fulllist[n_train : n_train + n_val]
        test_filelist = fulllist[n_train + n_val : n_train + n_val + n_test]
        filelist = train_filelist + val_filelist + test_filelist
    elif config["general"]["corpus_type"] == "multi-seen":
        fulllist = list(sourcepath.glob("*/*.wav"))
        random.seed(0)
        random.shuffle(fulllist)
        train_filelist = fulllist[:n_train]
        val_filelist = fulllist[n_train : n_train + n_val]
        test_filelist = fulllist[n_train + n_val : n_train + n_val + n_test]
        filelist = train_filelist + val_filelist + test_filelist
    elif config["general"]["corpus_type"] == "multi-unseen":
        spk_list = list(set([x.parent.name for x in sourcepath.glob("*/*.wav")]))
        train_filelist = []
        val_filelist = []
        test_filelist = []
        random.seed(0)
        random.shuffle(spk_list)
        for i, spk in enumerate(spk_list):
            sourcespkpath = sourcepath / spk
            if i < n_train:
                train_filelist.extend(list(sourcespkpath.glob("*.wav")))
            elif i < n_train + n_val:
                val_filelist.extend(list(sourcespkpath.glob("*.wav")))
            elif i < n_train + n_val + n_test:
                test_filelist.extend(list(sourcespkpath.glob("*.wav")))
        filelist = train_filelist + val_filelist + test_filelist
    else:
        raise NotImplementedError(
            "corpus_type specified in config.yaml should be {single, multi-seen, multi-unseen}"
        )

    with open(preprocessed_dir / "train.txt", "w", encoding="utf-8") as f:
        for m in train_filelist:
            f.write(str(m) + "\n")
    with open(preprocessed_dir / "val.txt", "w", encoding="utf-8") as f:
        for m in val_filelist:
            f.write(str(m) + "\n")
    with open(preprocessed_dir / "test.txt", "w", encoding="utf-8") as f:
        for m in test_filelist:
            f.write(str(m) + "\n")

    for wp in tqdm.tqdm(filelist):

        if config["general"]["corpus_type"] == "single":
            basename = str(wp.stem)
        else:
            basename = str(wp.parent.name) + "-" + str(wp.stem)

        wav, _ = librosa.load(wp, sr=SR)
        wavsegs = []

        if config["general"]["aux_path"] is not None:
            auxpath = pathlib.Path(config["general"]["aux_path"])
            if config["general"]["corpus_type"] == "single":
                wav_aux, _ = librosa.load(auxpath / wp.name, sr=SR)
            else:
                wav_aux, _ = librosa.load(auxpath / wp.parent.name / wp.name, sr=SR)
            wavauxsegs = []

        if config["general"]["aux_path"] is None:
            wavsegs.append(wav)
        else:
            min_seq_len = min(len(wav), len(wav_aux))
            wav = wav[:min_seq_len]
            wav_aux = wav_aux[:min_seq_len]
            wavsegs.append(wav)
            wavauxsegs.append(wav_aux)

        wavsegs = np.asarray(wavsegs).astype(np.float32)
        if config["general"]["aux_path"] is not None:
            wavauxsegs = np.asarray(wavauxsegs).astype(np.float32)
        else:
            wavauxsegs = None

        d_preprocessed = {"wavs": wavsegs, "wavsaux": wavauxsegs}

        with open(preprocessed_dir / "{}.pickle".format(basename), "wb") as fw:
            pickle.dump(d_preprocessed, fw)


if __name__ == "__main__":
    args = get_arg()

    config = yaml.load(open(args.config_path, "r"), Loader=yaml.FullLoader)
    for key in ["corpus_type", "source_path", "aux_path", "preprocessed_path"]:
        if getattr(args, key) is not None:
            config["general"][key] = str(getattr(args, key))
    for key in ["n_train", "n_val", "n_test"]:
        if getattr(args, key) is not None:
            config["preprocess"][key] = getattr(args, key)

    print("Performing preprocessing ...")
    preprocess(config)

    if "dual" in config:
        if config["dual"]["enable"]:
            task_config = yaml.load(
                open(config["dual"]["config_path"], "r"), Loader=yaml.FullLoader
            )
            task_preprocessed_dir = (
                pathlib.Path(config["general"]["preprocessed_path"]).parent
                / pathlib.Path(task_config["general"]["preprocessed_path"]).name
            )
            task_config["general"]["preprocessed_path"] = task_preprocessed_dir
            if args.source_path_task is not None:
                task_config["general"]["source_path"] = args.source_path_task
            print("Performing preprocessing for multi-task learning ...")
            preprocess(task_config)
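
# Expected layout (illustrative): corpus_type "single" reads
# <source_path>/*.wav, while "multi-seen" / "multi-unseen" read
# <source_path>/<speaker>/*.wav. A hypothetical invocation (all paths and
# counts are placeholders):
#
#     python preprocess.py --config_path config.yaml --corpus_type single \
#         --source_path data/src --preprocessed_path data/preprocessed \
#         --n_train 450 --n_val 20 --n_test 30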
train.py
ADDED
@@ -0,0 +1,106 @@
import argparse
import os
import pathlib
import yaml
from dataset import DataModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from lightning_module import (
    PretrainLightningModule,
    SSLStepLightningModule,
    SSLDualLightningModule,
)
from utils import configure_args


def get_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", required=True, type=pathlib.Path)
    parser.add_argument(
        "--stage", required=True, type=str, choices=["pretrain", "ssl-step", "ssl-dual"]
    )
    parser.add_argument("--run_name", required=True, type=str)
    parser.add_argument("--corpus_type", default=None, type=str)
    parser.add_argument("--source_path", default=None, type=pathlib.Path)
    parser.add_argument("--aux_path", default=None, type=pathlib.Path)
    parser.add_argument("--preprocessed_path", default=None, type=pathlib.Path)
    parser.add_argument("--n_train", default=None, type=int)
    parser.add_argument("--n_val", default=None, type=int)
    parser.add_argument("--n_test", default=None, type=int)
    parser.add_argument("--epoch", default=None, type=int)
    parser.add_argument("--load_pretrained", action="store_true")
    parser.add_argument("--pretrained_path", default=None, type=pathlib.Path)
    parser.add_argument("--early_stopping", action="store_true")
    parser.add_argument("--alpha", default=None, type=float)
    parser.add_argument("--beta", default=None, type=float)
    parser.add_argument("--learning_rate", default=None, type=float)
    parser.add_argument(
        "--feature_loss_type", default=None, type=str, choices=["mae", "mse"]
    )
    parser.add_argument("--debug", action="store_true")
    return parser.parse_args()


def train(args, config, output_path):
    debug = args.debug

    csvlogger = CSVLogger(save_dir=str(output_path), name="train_log")
    tblogger = TensorBoardLogger(save_dir=str(output_path), name="tf_log")

    checkpoint_callback = ModelCheckpoint(
        dirpath=str(output_path),
        save_weights_only=True,
        save_top_k=-1,
        every_n_epochs=1,
        monitor="val_loss",
    )
    callbacks = [checkpoint_callback]
    if config["train"]["early_stopping"]:
        earlystop_callback = EarlyStopping(
            monitor="val_loss", min_delta=0.0, patience=15, mode="min"
        )
        callbacks.append(earlystop_callback)

    trainer = Trainer(
        max_epochs=1 if debug else config["train"]["epoch"],
        gpus=-1,
        deterministic=False,
        auto_select_gpus=True,
        benchmark=True,
        default_root_dir=os.getcwd(),
        limit_train_batches=0.01 if debug else 1.0,
        limit_val_batches=0.5 if debug else 1.0,
        callbacks=callbacks,
        logger=[csvlogger, tblogger],
        gradient_clip_val=config["train"]["grad_clip_thresh"],
        flush_logs_every_n_steps=config["train"]["logger_step"],
        val_check_interval=0.5,
    )

    if config["general"]["stage"] == "pretrain":
        model = PretrainLightningModule(config)
    elif config["general"]["stage"] == "ssl-step":
        model = SSLStepLightningModule(config)
    elif config["general"]["stage"] == "ssl-dual":
        model = SSLDualLightningModule(config)
    else:
        raise NotImplementedError()

    datamodule = DataModule(config)
    trainer.fit(model, datamodule=datamodule)


if __name__ == "__main__":

    args = get_arg()
    config = yaml.load(open(args.config_path, "r"), Loader=yaml.FullLoader)

    output_path = pathlib.Path(config["general"]["output_path"]) / args.run_name
    os.makedirs(output_path, exist_ok=True)

    config, args = configure_args(config, args)

    train(args=args, config=config, output_path=output_path)
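
# Hypothetical invocation (config path and run name are placeholders):
#
#     python train.py --config_path config.yaml --stage ssl-dual --run_name demo
#
# --stage selects the LightningModule above; --debug shortens training to a
# single epoch on a fraction of the data.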
utils.py
ADDED
@@ -0,0 +1,147 @@
import librosa.display
import matplotlib.pyplot as plt
import json
import torch
import torchaudio
import hifigan

def manual_logging(logger, item, idx, tag, global_step, data_type, config):
    if data_type == "audio":
        audio = item[idx, ...].detach().cpu().numpy()
        logger.add_audio(
            tag,
            audio,
            global_step,
            sample_rate=config["preprocess"]["sampling_rate"],
        )
    elif data_type == "image":
        image = item[idx, ...].detach().cpu().numpy()
        fig, ax = plt.subplots()
        _ = librosa.display.specshow(
            image,
            x_axis="time",
            y_axis="linear",
            sr=config["preprocess"]["sampling_rate"],
            hop_length=config["preprocess"]["frame_shift"],
            fmax=config["preprocess"]["sampling_rate"] // 2,
            ax=ax,
        )
        logger.add_figure(tag, fig, global_step)
    else:
        raise NotImplementedError(
            "Data type given to logger should be [audio] or [image]"
        )

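# Usage sketch (hypothetical tensors): `item` is a batched tensor and `logger`
# a TensorBoard-style writer exposing add_audio/add_figure, e.g.
#
#   wavs = torch.randn(4, 16000)  # (batch, samples)
#   manual_logging(writer, wavs, idx=0, tag="val/audio",
#                  global_step=1000, data_type="audio", config=config)
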
def load_vocoder(config):
    with open(
        "hifigan/config_{}.json".format(config["general"]["feature_type"]), "r"
    ) as f:
        config_hifigan = hifigan.AttrDict(json.load(f))
    vocoder = hifigan.Generator(config_hifigan)
    vocoder.load_state_dict(torch.load(config["general"]["hifigan_path"])["generator"])
    vocoder.remove_weight_norm()
    for param in vocoder.parameters():
        param.requires_grad = False
    return vocoder

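# Usage sketch (hypothetical shapes): the standard HiFi-GAN generator maps a
# mel batch (batch, n_mels, frames) to waveforms (batch, 1, samples); eval()
# plus torch.no_grad() matches the parameters frozen above:
#
#   vocoder = load_vocoder(config).eval()
#   with torch.no_grad():
#       wav = vocoder(mel).squeeze(1)  # (batch, samples)
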
def get_conv_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)

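# Worked example: for a stride-1 convolution with an odd kernel, this padding
# preserves the input length ("same" padding). kernel_size=7, dilation=2
# gives (7*2 - 2) // 2 = 6, and L + 2*6 - 2*(7 - 1) = L:
#
#   conv = torch.nn.Conv1d(80, 80, kernel_size=7, dilation=2,
#                          padding=get_conv_padding(7, dilation=2))
#   conv(torch.randn(1, 80, 100)).shape  # -> torch.Size([1, 80, 100])
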
def plot_and_save_mels(wav, save_path, config):
    spec_module = torchaudio.transforms.MelSpectrogram(
        sample_rate=config["preprocess"]["sampling_rate"],
        n_fft=config["preprocess"]["fft_length"],
        win_length=config["preprocess"]["frame_length"],
        hop_length=config["preprocess"]["frame_shift"],
        f_min=config["preprocess"]["fmin"],
        f_max=config["preprocess"]["fmax"],
        n_mels=config["preprocess"]["n_mels"],
        power=1,
        center=True,
        norm="slaney",
        mel_scale="slaney",
    )
    spec = spec_module(wav.unsqueeze(0))
    log_spec = torch.log(
        torch.clamp_min(spec, config["preprocess"]["min_magnitude"])
        * config["preprocess"]["comp_factor"]
    )
    fig, ax = plt.subplots()
    _ = librosa.display.specshow(
        log_spec.squeeze(0).numpy(),
        x_axis="time",
        y_axis="linear",
        sr=config["preprocess"]["sampling_rate"],
        hop_length=config["preprocess"]["frame_shift"],
        fmax=config["preprocess"]["sampling_rate"] // 2,
        ax=ax,
        cmap="viridis",
    )
    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)  # release the figure so repeated calls do not accumulate

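# Usage sketch (hypothetical tensor): expects a 1-D CPU waveform and writes a
# log-mel spectrogram image to the given path, e.g.
#
#   plot_and_save_mels(torch.randn(48000), "sample_mel.png", config)
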
def plot_and_save_mels_all(wavs, keys, save_path, config):
    spec_module = torchaudio.transforms.MelSpectrogram(
        sample_rate=config["preprocess"]["sampling_rate"],
        n_fft=config["preprocess"]["fft_length"],
        win_length=config["preprocess"]["frame_length"],
        hop_length=config["preprocess"]["frame_shift"],
        f_min=config["preprocess"]["fmin"],
        f_max=config["preprocess"]["fmax"],
        n_mels=config["preprocess"]["n_mels"],
        power=1,
        center=True,
        norm="slaney",
        mel_scale="slaney",
    )
    # Fixed 3x3 grid: supports at most nine entries in `keys` per figure.
    fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(18, 18))
    for i, key in enumerate(keys):
        wav = wavs[key][0, ...].cpu()
        spec = spec_module(wav.unsqueeze(0))
        log_spec = torch.log(
            torch.clamp_min(spec, config["preprocess"]["min_magnitude"])
            * config["preprocess"]["comp_factor"]
        )
        ax[i // 3, i % 3].set(title=key)
        _ = librosa.display.specshow(
            log_spec.squeeze(0).numpy(),
            x_axis="time",
            y_axis="linear",
            sr=config["preprocess"]["sampling_rate"],
            hop_length=config["preprocess"]["frame_shift"],
            fmax=config["preprocess"]["sampling_rate"] // 2,
            ax=ax[i // 3, i % 3],
            cmap="viridis",
        )
    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)  # release the figure so repeated calls do not accumulate

def configure_args(config, args):
    for key in ["stage", "corpus_type", "source_path", "aux_path", "preprocessed_path"]:
        if getattr(args, key) is not None:
            config["general"][key] = str(getattr(args, key))

    for key in ["n_train", "n_val", "n_test"]:
        if getattr(args, key) is not None:
            config["preprocess"][key] = getattr(args, key)

    for key in ["alpha", "beta", "learning_rate", "epoch"]:
        if getattr(args, key) is not None:
            config["train"][key] = getattr(args, key)

    for key in ["load_pretrained", "early_stopping"]:
        config["train"][key] = getattr(args, key)

    if args.feature_loss_type is not None:
        config["train"]["feature_loss"]["type"] = args.feature_loss_type

    for key in ["pretrained_path"]:
        if getattr(args, key) is not None:
            config["train"][key] = str(getattr(args, key))

    return config, args
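# Precedence sketch: flags that default to None overwrite the YAML value only
# when actually given, while the store_true flags (load_pretrained,
# early_stopping) always overwrite it, e.g. with args from the training
# script's get_arg():
#
#   args = get_arg()                          # run with --learning_rate 1e-4
#   config, args = configure_args(config, args)
#   config["train"]["learning_rate"]          # -> 1e-4, not the YAML value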