{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gradio as gr\n",
    "import os\n",
    "\n",
    "import logging\n",
    "\n",
    "import librosa\n",
    "import torch\n",
    "\n",
    "import commons\n",
    "import utils\n",
    "from models import SynthesizerTrn\n",
    "from text.symbols import symbols\n",
    "from text import text_to_sequence\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "G:\\AI\\so-vits-svc_v2\\VITS_WebUI\\monotonic_align\n",
      "G:\\AI\\so-vits-svc_v2\\VITS_WebUI\n"
     ]
    }
   ],
   "source": [
    "# Build the monotonic_align extension in place, then step back to its parent directory.\n",
    "# NOTE: the path below is a machine-specific absolute path; adjust it to your checkout.\n",
    "%cd G:\\AI\\so-vits-svc_v2\\VITS_WebUI\\monotonic_align\n",
    "!python setup.py build_ext --inplace\n",
    "%cd .."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "def resize2d(source, target_len):\n",
    "    \"\"\"Resample a 1-D contour to ``target_len`` points by linear interpolation.\n",
    "\n",
    "    Values below 0.001 are treated as missing: they are replaced by NaN before\n",
    "    interpolation, and every NaN left in the interpolated result is returned as 0.\n",
    "\n",
    "    Args:\n",
    "        source: non-empty 1-D array-like of numbers. It is copied, so the\n",
    "            caller's array is never modified.\n",
    "        target_len: number of samples in the returned array.\n",
    "\n",
    "    Returns:\n",
    "        A float64 ``numpy.ndarray`` holding the resampled contour.\n",
    "    \"\"\"\n",
    "    # Work on a float copy. Writing NaN directly into ``source`` (as this function\n",
    "    # used to) clobbered the caller's data and raised for integer arrays.\n",
    "    values = np.array(source, dtype=np.float64)\n",
    "    values[values < 0.001] = np.nan\n",
    "    n_src = len(values)\n",
    "    # Fractional source positions i * n_src / target_len for i = 0 .. target_len - 1.\n",
    "    positions = np.arange(0, n_src * target_len, n_src) / target_len\n",
    "    resampled = np.interp(positions, np.arange(0, n_src), values)\n",
    "    return np.nan_to_num(resampled)\n",
    "\n",
    "\n",
    "def convert_wav_22050_to_f0(audio):\n",
    "    \"\"\"Estimate a per-frame F0 contour for ``audio`` with ``librosa.pyin``.\n",
    "\n",
    "    pYIN is run over the C0..C7 pitch range with a 1780-sample frame; no sample\n",
    "    rate is passed, so librosa's default applies.\n",
    "\n",
    "    Args:\n",
    "        audio: waveform handed to ``librosa.pyin`` unchanged.\n",
    "\n",
    "    Returns:\n",
    "        Array shaped like pYIN's F0 output in which every entry that is not\n",
    "        greater than 0 (NaN included) is 0.\n",
    "    \"\"\"\n",
    "    f0_raw = librosa.pyin(\n",
    "        audio,\n",
    "        fmin=librosa.note_to_hz('C0'),\n",
    "        fmax=librosa.note_to_hz('C7'),\n",
    "        frame_length=1780,\n",
    "    )[0]\n",
    "    # ``NaN > 0`` is False, so NaN estimates are left at 0 together with non-positive ones.\n",
    "    voiced = f0_raw > 0\n",
    "    f0 = np.zeros_like(f0_raw)\n",
    "    f0[voiced] = f0_raw[voiced]\n",
    "    return f0"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "def get_text(text, hps):\n",
    "    \"\"\"Encode ``text`` as a ``torch.LongTensor`` of symbol ids.\n",
    "\n",
    "    The text is converted by ``text_to_sequence`` using ``hps.data.text_cleaners``.\n",
    "    When ``hps.data.add_blank`` is truthy, ``commons.intersperse`` is applied to\n",
    "    the sequence with 0 as the inserted item.\n",
    "\n",
    "    Args:\n",
    "        text: input string to encode.\n",
    "        hps: hyper-parameter object exposing ``data.text_cleaners`` and ``data.add_blank``.\n",
    "\n",
    "    Returns:\n",
    "        ``torch.LongTensor`` built from the encoded sequence.\n",
    "    \"\"\"\n",
    "    text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
    "    if hps.data.add_blank:\n",
    "        text_norm = commons.intersperse(text_norm, 0)\n",
    "    text_norm = torch.LongTensor(text_norm)\n",
    "    # Progress/debug aid: shows the tensor shape of the encoded sequence.\n",
    "    print(text_norm.shape)\n",
    "    return text_norm"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "# Relative paths: resolved against the kernel's current working directory.\n",
    "CONFIG_PATH = \"configs/config.json\"\n",
    "MODEL_PATH = \"models/Yuuka/Yuuka.pth\"\n",
    "\n",
    "hps = utils.get_hparams_from_file(CONFIG_PATH)\n",
"net_g_ms = SynthesizerTrn(\n", " len(hps.symbols),\n", " hps.data.filter_length // 2 + 1,\n", " hps.train.segment_size // hps.data.hop_length,\n", " n_speakers=hps.data.n_speakers,\n", " **hps.model).cuda()\n", "\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:root:Loaded checkpoint 'models/Yuuka/Yuuka.pth' (iteration 445)\n" ] } ], "source": [ "_ = utils.load_checkpoint(MODEL_PATH, net_g_ms, None)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using cache found in C:\\Users\\l4227/.cache\\torch\\hub\\bshall_hubert_main\n" ] } ], "source": [ "hubert = torch.hub.load(\"bshall/hubert:main\", \"hubert_soft\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "def vc_fn(input_audio,vc_transform):\n", " if input_audio is None:\n", " return \"You need to upload an audio\", None\n", " sampling_rate, audio = input_audio\n", " # 