{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import IPython.display as ipd\n",
    "\n",
    "import os\n",
    "import json\n",
    "import math\n",
    "import random\n",
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn import functional as F\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "import commons\n",
    "import utils\n",
    "from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
    "from models import SynthesizerTrn\n",
    "from text.symbols import symbols\n",
    "from text import text_to_sequence\n",
    "\n",
    "from scipy.io.wavfile import write\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def get_text(text, hps):\n",
    "    \"\"\"Convert raw text into a LongTensor of symbol ids.\n",
    "\n",
    "    The text is passed through ``text_to_sequence`` with the cleaners listed\n",
    "    in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy, the\n",
    "    id 0 is interspersed via ``commons.intersperse``.\n",
    "    \"\"\"\n",
    "    text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
    "    if hps.data.add_blank:\n",
    "        text_norm = commons.intersperse(text_norm, 0)\n",
    "    return torch.LongTensor(text_norm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "hps = utils.get_hparams_from_file(\"./configs/vtubers.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Loaded checkpoint 'logs/vtubers/nene_final.pth' (iteration 476)\n",
      "Loaded checkpoint 'logs/vtubers/nene_final.pth' (iteration 476) \n"
     ]
    }
   ],
   "source": [
    "net_g = SynthesizerTrn(\n",
    "    len(symbols),\n",
    "    hps.data.filter_length // 2 + 1,\n",
    "    hps.train.segment_size // hps.data.hop_length,\n",
    "    n_speakers=hps.data.n_speakers,\n",
    "    **hps.model)\n",
    "_ = net_g.eval()\n",
    "\n",
    "_ = utils.load_checkpoint(\"logs/vtubers/nene_final.pth\", net_g, None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root directory from which emotion reference audio is randomly sampled.\n",
    "random_emotion_root = \"dataset/nene\"\n",
    "\n",
    "\n",
    "def _load_emotion_embedding(emotion):\n",
    "    \"\"\"Return the emotion embedding tensor selected by ``emotion``.\n",
    "\n",
    "    Resolution order (first match wins):\n",
    "      1. ``f\"{emotion}.emo.npy\"`` exists: load that cached embedding.\n",
    "      2. ``emotion == \"random_sample\"``: draw a random entry ending in ``wav``\n",
    "         from ``random_emotion_root`` that has a cached ``.emo.npy`` next to\n",
    "         it, and print the chosen path so it can be reused.\n",
    "      3. ``emotion`` ends with ``\"wav\"``: compute the embedding with\n",
    "         ``emotion_extract.extract_wav``.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: ``\"random_sample\"`` was requested but no entry in\n",
    "            ``random_emotion_root`` has a cached embedding.\n",
    "        ValueError: ``emotion`` matches none of the cases above.\n",
    "    \"\"\"\n",
    "    if os.path.exists(f\"{emotion}.emo.npy\"):\n",
    "        return torch.FloatTensor(np.load(f\"{emotion}.emo.npy\")).unsqueeze(0)\n",
    "\n",
    "    if emotion == \"random_sample\":\n",
    "        # Filter before drawing so that a directory without any cached\n",
    "        # embedding raises instead of looping forever.\n",
    "        candidates = [\n",
    "            name for name in os.listdir(random_emotion_root)\n",
    "            if name.endswith('wav') and os.path.exists(f\"{random_emotion_root}/{name}.emo.npy\")\n",
    "        ]\n",
    "        if not candidates:\n",
    "            raise FileNotFoundError(\n",
    "                f\"no 'wav' entry with a cached '.emo.npy' embedding found in {random_emotion_root!r}\"\n",
    "            )\n",
    "        rand_wav = random.choice(candidates)\n",
    "        emo = torch.FloatTensor(np.load(f\"{random_emotion_root}/{rand_wav}.emo.npy\")).unsqueeze(0)\n",
    "        print(f\"{random_emotion_root}/{rand_wav}\")\n",
    "        return emo\n",
    "\n",
    "    if str(emotion).endswith(\"wav\"):\n",
    "        # Imported lazily: only needed when no cached embedding is available.\n",
    "        import emotion_extract\n",
    "        # NOTE(review): no unsqueeze(0) here, unlike the .npy branches;\n",
    "        # presumably extract_wav already returns a batched array - confirm.\n",
    "        return torch.FloatTensor(emotion_extract.extract_wav(emotion))\n",
    "\n",
    "    # Fail loudly instead of continuing with an undefined embedding.\n",
    "    raise ValueError(f\"emotion参数不正确: {emotion!r}\")\n",
    "\n",
    "\n",
    "def tts(txt, emotion, speaker_id=0, noise_scale=0.667, noise_scale_w=0.8, length_scale=1):\n",
    "    \"\"\"Synthesize ``txt`` with ``net_g`` and display the audio inline.\n",
    "\n",
    "    Args:\n",
    "        txt: Text to synthesize; converted to symbol ids by ``get_text``.\n",
    "        emotion: Path of a reference audio file, or ``\"random_sample\"`` to\n",
    "            draw a random reference from ``random_emotion_root``.\n",
    "        speaker_id: Speaker id passed to ``net_g.infer`` as ``sid``.\n",
    "        noise_scale, noise_scale_w, length_scale: Forwarded unchanged to\n",
    "            ``net_g.infer``.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError, ValueError: see ``_load_emotion_embedding``.\n",
    "    \"\"\"\n",
    "    # Resolve the reference first so that a bad argument fails before any inference work.\n",
    "    emo = _load_emotion_embedding(emotion)\n",
    "    stn_tst = get_text(txt, hps)\n",
    "    with torch.no_grad():\n",
    "        x_tst = stn_tst.unsqueeze(0)\n",
    "        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])\n",
    "        sid = torch.LongTensor([speaker_id])\n",
    "        audio = net_g.infer(\n",
    "            x_tst,\n",
    "            x_tst_lengths,\n",
    "            sid=sid,\n",
    "            noise_scale=noise_scale,\n",
    "            noise_scale_w=noise_scale_w,\n",
    "            length_scale=length_scale,\n",
    "            emo=emo,\n",
    "        )[0][0, 0].data.float().numpy()\n",
    "    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen108_168.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen106_128.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen113_136.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen405_025.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen116_030.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset/nene/nen104_275.wav\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Synthesize using the emotion of randomly chosen training utterances.\n",
    "# The printed reference path can be reused to synthesize other sentences with that emotion.\n",
    "txt = \"なんでこんなに慣れてんのよ。私の方が先に好きだったのに\"\n",
    "for _ in range(6):\n",
    "    tts(txt, emotion='random_sample')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "txt = \"こんにちは。私わあやちねねです。\"\n",
    "tts(txt, emotion=\"dataset/nene/nen116_030.wav\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}