{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "starganv2_vc_weights_converter.ipynb",
      "private_outputs": true,
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CA5i7YAlagUA"
      },
      "outputs": [],
      "source": [
        "!git clone https://github.com/yl4579/StarGANv2-VC\n",
        "!pip install SoundFile torchaudio munch\n",
        "!git clone https://github.com/HighCWu/starganv2vc-paddle\n",
        "!cd starganv2vc-paddle && pip install paddlepaddle-gpu==2.2.2 paddleaudio munch pydub\n",
        "!cp -r starganv2vc-paddle/starganv2vc_paddle StarGANv2-VC/"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!gdown https://drive.google.com/uc?id=1nzTyyl-9A1Hmqya2Q_f2bpZkUoRjbZsY"
      ],
      "metadata": {
        "id": "ac4g4L1Bbx1t"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!unzip -qq Models.zip\n",
        "!rm -rf Models.zip\n",
        "!mv Models StarGANv2-VC/Models"
      ],
      "metadata": {
        "id": "EJ3vG_RvcOD8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%cd StarGANv2-VC"
      ],
      "metadata": {
        "id": "rKovh1Egi4mJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import yaml\n",
        "import numpy as np\n",
        "import torch\n",
        "import warnings\n",
        "warnings.simplefilter('ignore')\n",
        "\n",
        "from munch import Munch\n",
        "\n",
        "from models import build_model\n",
        "\n",
        "from Utils.ASR.models import ASRCNN\n",
        "from Utils.JDC.model import JDCNet\n",
        "\n",
        "torch.backends.cudnn.benchmark = True #\n",
        "\n",
        "def main(config_path):\n",
        "    \"\"\"Load the pretrained PyTorch ASR, F0 and StarGANv2-VC EMA models.\n",
        "\n",
        "    Args:\n",
        "        config_path: path to the YAML config. It must provide `ASR_config`,\n",
        "            `ASR_path`, `F0_path` and `model_params`, and may provide\n",
        "            `device` (defaults to 'cpu').\n",
        "\n",
        "    Returns:\n",
        "        Tuple `(ASR_model, F0_model, model_ema)`, each moved to the\n",
        "        configured device.\n",
        "\n",
        "    Raises:\n",
        "        KeyError: if a required config key is missing.\n",
        "    \"\"\"\n",
        "    with open(config_path) as f:\n",
        "        config = yaml.safe_load(f)\n",
        "\n",
        "    device = config.get('device', 'cpu')\n",
        "\n",
        "    # load pretrained ASR model\n",
        "    # Index the keys directly: a `False` default would reach open(False),\n",
        "    # which opens file descriptor 0 (stdin) instead of failing.\n",
        "    ASR_config = config['ASR_config']\n",
        "    ASR_path = config['ASR_path']\n",
        "    with open(ASR_config) as f:\n",
        "        ASR_config = yaml.safe_load(f)\n",
        "    ASR_model_config = ASR_config['model_params']\n",
        "    ASR_model = ASRCNN(**ASR_model_config)\n",
        "    params = torch.load(ASR_path, map_location='cpu')['model']\n",
        "    ASR_model.load_state_dict(params)\n",
        "    ASR_model.to(device)\n",
        "    _ = ASR_model.eval()\n",
        "\n",
        "    # load pretrained F0 model\n",
        "    F0_path = config['F0_path']\n",
        "    F0_model = JDCNet(num_class=1, seq_len=192)\n",
        "    params = torch.load(F0_path, map_location='cpu')['net']\n",
        "    F0_model.load_state_dict(params)\n",
        "    F0_model.to(device)\n",
        "    # Inference only, same as the ASR model above.\n",
        "    _ = F0_model.eval()\n",
        "\n",
        "    # build model\n",
        "    _, model_ema = build_model(Munch(config['model_params']), F0_model, ASR_model)\n",
        "    pretrained_path = 'Models/epoch_00150.pth'# config.get('pretrained_model', False)\n",
        "    params = torch.load(pretrained_path, map_location='cpu')['model_ema']\n",
        "    for key, state_dict in params.items():\n",
        "        model_ema[key].load_state_dict(state_dict)\n",
        "    for key in model_ema:\n",
        "        model_ema[key].to(device)\n",
        "\n",
        "    return ASR_model, F0_model, model_ema\n",
        "\n",
        "ASR_model_torch, F0_model_torch, model_ema_torch = main('./Models/config.yml')\n"
      ],
      "metadata": {
        "id": "UpMuk5kni67B"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import yaml\n",
        "import numpy as np\n",
        "import paddle\n",
        "import torch\n",
        "import warnings\n",
        "warnings.simplefilter('ignore')\n",
        "\n",
        "from munch import Munch\n",
        "\n",
        "from starganv2vc_paddle.models import build_model\n",
        "\n",
        "from starganv2vc_paddle.Utils.ASR.models import ASRCNN\n",
        "from starganv2vc_paddle.Utils.JDC.model import JDCNet\n",
        "\n",
        "@paddle.no_grad()\n",
        "def convert_weights(torch_model, paddle_model):\n",
        "    \"\"\"Copy the weights of `torch_model` into `paddle_model` in place.\n",
        "\n",
        "    Both models are switched to eval mode. Each Paddle parameter name is\n",
        "    mapped to a PyTorch state-dict key (`._mean` -> `.running_mean`,\n",
        "    `._variance` -> `.running_var`, `.scale` -> `.weight`), and 2-D\n",
        "    parameters owned by a `torch.nn.Linear` are transposed before being\n",
        "    assigned.\n",
        "\n",
        "    Raises:\n",
        "        KeyError: if a mapped name is missing from the PyTorch state dict.\n",
        "    \"\"\"\n",
        "    _ = torch_model.eval()\n",
        "    _ = paddle_model.eval()\n",
        "    dense_layers = {\n",
        "        name for name, layer in torch_model.named_modules()\n",
        "        if isinstance(layer, torch.nn.Linear)\n",
        "    }\n",
        "    torch_state_dict = torch_model.state_dict()\n",
        "    for name, param in paddle_model.named_parameters():\n",
        "        name = name.replace('._mean', '.running_mean')\n",
        "        name = name.replace('._variance', '.running_var')\n",
        "        name = name.replace('.scale', '.weight')\n",
        "        target_param = torch_state_dict[name].detach().cpu().numpy()\n",
        "        # Strip the trailing parameter name to get the owning module's name.\n",
        "        owner = name.rpartition('.')[0]\n",
        "        if owner in dense_layers and len(param.shape) == 2:\n",
        "            target_param = target_param.transpose((1,0))\n",
        "        param.set_value(paddle.to_tensor(target_param))\n",
        "\n",
        "@torch.no_grad()\n",
        "@paddle.no_grad()\n",
        "def main(config_path):\n",
        "    \"\"\"Convert the PyTorch models loaded earlier in this notebook to Paddle.\n",
        "\n",
        "    Builds the Paddle ASR, F0 and StarGANv2-VC EMA models, copies the\n",
        "    weights from `ASR_model_torch`, `F0_model_torch` and `model_ema_torch`,\n",
        "    prints the mean absolute difference between both frameworks' outputs\n",
        "    on random inputs, and writes `ASR.pd`, `F0.pd` and `VC.pd` to the\n",
        "    current directory.\n",
        "\n",
        "    Args:\n",
        "        config_path: path to the YAML config; it must provide `ASR_config`\n",
        "            and `model_params`.\n",
        "\n",
        "    Returns:\n",
        "        0\n",
        "\n",
        "    Raises:\n",
        "        KeyError: if a required config key is missing.\n",
        "    \"\"\"\n",
        "    with open(config_path) as f:\n",
        "        config = yaml.safe_load(f)\n",
        "\n",
        "    # Index the key directly: a `False` default would reach open(False),\n",
        "    # which opens file descriptor 0 (stdin) instead of failing.\n",
        "    ASR_config = config['ASR_config']\n",
        "    with open(ASR_config) as f:\n",
        "        ASR_config = yaml.safe_load(f)\n",
        "    ASR_model_config = ASR_config['model_params']\n",
        "    ASR_model = ASRCNN(**ASR_model_config)\n",
        "    _ = ASR_model.eval()\n",
        "    convert_weights(ASR_model_torch, ASR_model)\n",
        "\n",
        "    F0_model = JDCNet(num_class=1, seq_len=192)\n",
        "    _ = F0_model.eval()\n",
        "    convert_weights(F0_model_torch, F0_model)\n",
        "\n",
        "    # build model\n",
        "    _, model_ema = build_model(Munch(config['model_params']), F0_model, ASR_model)\n",
        "\n",
        "    # Feed the PyTorch models on the device they were loaded to instead of\n",
        "    # assuming CUDA.\n",
        "    torch_device = next(ASR_model_torch.parameters()).device\n",
        "\n",
        "    def to_torch(tensor):\n",
        "        \"\"\"Copy a Paddle tensor to a PyTorch tensor on the PyTorch models' device.\"\"\"\n",
        "        return torch.from_numpy(tensor.numpy()).to(torch_device)\n",
        "\n",
        "    def error(torch_out, paddle_out):\n",
        "        \"\"\"Mean absolute difference; a signed mean lets opposite-sign errors cancel.\"\"\"\n",
        "        return np.abs(torch_out.cpu().numpy() - paddle_out.numpy()).mean()\n",
        "\n",
        "    asr_input = paddle.randn([2, 80, 192])\n",
        "    asr_output = ASR_model(asr_input)\n",
        "    asr_output_torch = ASR_model_torch(to_torch(asr_input))\n",
        "    print('ASR model input:', asr_input.shape, 'output:', asr_output.shape)\n",
        "    print('Error:', error(asr_output_torch, asr_output))\n",
        "    mel_input = paddle.randn([2, 1, 192, 512])\n",
        "    f0_output = F0_model(mel_input)\n",
        "    f0_output_torch = F0_model_torch(to_torch(mel_input))\n",
        "    print('F0 model input:', mel_input.shape, 'output:', [t.shape for t in f0_output])\n",
        "    print('Error:', [error(t1, t2) for t1, t2 in zip(f0_output_torch, f0_output)])\n",
        "\n",
        "    for k in model_ema.keys():\n",
        "        convert_weights(model_ema_torch[k], model_ema[k])\n",
        "    label = paddle.to_tensor([0,0], dtype=paddle.int64)\n",
        "    latent_dim = model_ema.mapping_network.shared[0].weight.shape[0]\n",
        "    latent_style = paddle.randn([2, latent_dim])\n",
        "    ref = model_ema.mapping_network(latent_style, label)\n",
        "    ref_torch = model_ema_torch.mapping_network(to_torch(latent_style), to_torch(label))\n",
        "    print('Error of mapping network:', error(ref_torch, ref))\n",
        "    mel_input2 = paddle.randn([2, 1, 192, 512])\n",
        "    style_ref = model_ema.style_encoder(mel_input2, label)\n",
        "    style_ref_torch = model_ema_torch.style_encoder(to_torch(mel_input2), to_torch(label))\n",
        "    print('StarGANv2-VC encoder inputs:', mel_input2.shape, 'output:', style_ref.shape, 'should has the same shape as the ref:', ref.shape)\n",
        "    print('Error of style encoder:', error(style_ref_torch, style_ref))\n",
        "    f0_feat = F0_model.get_feature_GAN(mel_input)\n",
        "    f0_feat_torch = F0_model_torch.get_feature_GAN(to_torch(mel_input))\n",
        "    print('Error of f0 feat:', error(f0_feat_torch, f0_feat))\n",
        "    out = model_ema.generator(mel_input, style_ref, F0=f0_feat)\n",
        "    # The PyTorch generator is fed the Paddle-side style and F0 features so\n",
        "    # that only the generator itself is being compared.\n",
        "    out_torch = model_ema_torch.generator(to_torch(mel_input), to_torch(style_ref), F0=to_torch(f0_feat))\n",
        "    print('StarGANv2-VC inputs:', label.shape, latent_style.shape, mel_input.shape, 'output:', out.shape)\n",
        "    print('Error:', error(out_torch, out))\n",
        "\n",
        "    paddle.save({'model': ASR_model.state_dict()}, 'ASR.pd')\n",
        "    paddle.save({ 'net': F0_model.state_dict()}, 'F0.pd')\n",
        "    model_ema_dict = {key: module.state_dict() for key, module in model_ema.items()}\n",
        "\n",
        "    paddle.save({ 'model_ema': model_ema_dict }, 'VC.pd')\n",
        "\n",
        "    return 0\n",
        "\n",
        "main('./Models/config.yml')\n"
      ],
      "metadata": {
        "id": "PnuApVuyIIyd"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}