diff --git "a/Inference/infer_24khz_mod.ipynb" "b/Inference/infer_24khz_mod.ipynb"
deleted file mode 100644
--- "a/Inference/infer_24khz_mod.ipynb"
+++ /dev/null
@@ -1,926 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
- " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech\n",
- "177\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
- "The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. \n",
- "The class this function is called from is 'DistilBertJapaneseTokenizer'.\n",
- "[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...\n",
- "[nltk_data] Package punkt_tab is already up-to-date!\n",
- "/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/models.py:895: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
- " params = torch.load(model_path, map_location='cpu')['model']\n",
- "/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/models.py:823: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
- " params = torch.load(path, map_location='cpu')['net']\n",
- "/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/Utils/PLBERT/util.py:30: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
- " checkpoint = torch.load(log_dir + \"/step_\" + str(iters) + \".t7\", map_location='cpu')\n",
- "/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
- " WeightNorm.apply(module, name, dim)\n",
- "/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/torch/nn/modules/rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "\n",
- "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
- "# import torch\n",
- "# print(torch.cuda.device_count())\n",
- "import IPython.display as ipd\n",
- "import os\n",
- "# os.environ['CUDA_HOME'] = '/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/torch/lib/include/cuda'\n",
- "import torch\n",
- "torch.manual_seed(0)\n",
- "torch.backends.cudnn.benchmark = False\n",
- "torch.backends.cudnn.deterministic = True\n",
- "\n",
- "import random\n",
- "random.seed(0)\n",
- "\n",
- "import numpy as np\n",
- "np.random.seed(0)\n",
- "\n",
- "# load packages\n",
- "\n",
- "%cd /home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech\n",
- "from Utils.phonemize.mixed_phon import smart_phonemize\n",
- "\n",
- "import time\n",
- "import random\n",
- "import yaml\n",
- "from munch import Munch\n",
- "import numpy as np\n",
- "import torch\n",
- "from torch import nn\n",
- "import torch.nn.functional as F\n",
- "import torchaudio\n",
- "import librosa\n",
- "from nltk.tokenize import word_tokenize\n",
- "\n",
- "from models import *\n",
- "from Modules.KotoDama_sampler import tokenizer_koto_prompt, tokenizer_koto_text, inference, Longform, merging_sentences\n",
- "from utils import *\n",
- "\n",
- "import nltk\n",
- "nltk.download('punkt_tab')\n",
- "\n",
- "from nltk.tokenize import sent_tokenize\n",
- "\n",
- "from konoha import SentenceTokenizer\n",
- "\n",
- "\n",
- "sent_tokenizer = SentenceTokenizer()\n",
- "\n",
- "%matplotlib inline\n",
- "to_mel = torchaudio.transforms.MelSpectrogram(\n",
- " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
- "mean, std = -4, 4\n",
- "\n",
- "\n",
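- "# convert a waveform into a log-mel spectrogram and normalize it with the fixed mean/std above\n",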
- "def preprocess(wave):\n",
- " wave_tensor = torch.from_numpy(wave).float()\n",
- " mel_tensor = to_mel(wave_tensor)\n",
- " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
- " return mel_tensor\n",
- "\n",
- "def compute_style_through_clip(path):\n",
- "    wave, sr = librosa.load(path, sr=24000)\n",
- "    audio, index = librosa.effects.trim(wave, top_db=30)\n",
- "    if sr != 24000:\n",
- "        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)\n",
- " mel_tensor = preprocess(audio).to(device)\n",
- "\n",
- " with torch.no_grad():\n",
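- "        # the acoustic style (style_encoder) and the prosody style (predictor_encoder)\n",
- "        # are concatenated into a single reference style vector\n",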
- " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
- " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
- "\n",
- " return torch.cat([ref_s, ref_p], dim=1)\n",
- "\n",
- "\n",
- "def Kotodama_Prompter(model, text, device):\n",
- " \n",
- " with torch.no_grad():\n",
- " style = model.KotoDama_Prompt(**tokenizer_koto_prompt(text, return_tensors=\"pt\").to(device))['logits']\n",
- " return style\n",
- "\n",
- "def Kotodama_Sampler(model, text, device):\n",
- " \n",
- " with torch.no_grad():\n",
- " style = model.KotoDama_Text(**tokenizer_koto_text(text, return_tensors=\"pt\").to(device))['logits']\n",
- " return style\n",
- "\n",
- "\n",
- "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
- "\n",
- "config = yaml.safe_load(open(\"Configs/config_kanade.yml\"))\n",
- "\n",
- "# load pretrained ASR model\n",
- "ASR_config = config.get('ASR_config', False)\n",
- "ASR_path = config.get('ASR_path', False)\n",
- "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
- "\n",
- "\n",
- "KotoDama_Prompter = load_KotoDama_Prompter(path=\"Utils/KTD/prompt_enc/checkpoint-73285\")\n",
- "KotoDama_TextSampler = load_KotoDama_TextSampler(path=\"Utils/KTD/text_enc/checkpoint-22680\")\n",
- "\n",
- "# load pretrained F0 model\n",
- "F0_path = config.get('F0_path', False)\n",
- "pitch_extractor = load_F0_models(F0_path)\n",
- "\n",
- "# load BERT model\n",
- "from Utils.PLBERT.util import load_plbert\n",
- "BERT_path = config.get('PLBERT_dir', False)\n",
- "plbert = load_plbert(BERT_path)\n",
- "\n",
- "model_params = recursive_munch(config['model_params'])\n",
- "model = build_model(model_params, text_aligner, pitch_extractor, plbert, KotoDama_Prompter, KotoDama_TextSampler)\n",
- "_ = [model[key].eval() for key in model]\n",
- "_ = [model[key].to(device) for key in model]\n",
- "\n",
- "# params_whole = torch.load(\"/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/Models/Style_Tsukasa_v02\", map_location='cpu')\n",
- "# params = params_whole['net']\n",
- "\n",
- "\n",
- "# for key in model:\n",
- "# if key in params:\n",
- "# print('%s loaded' % key)\n",
- "# try:\n",
- "# model[key].load_state_dict(params[key])\n",
- "# except:\n",
- "# from collections import OrderedDict\n",
- "# state_dict = params[key]\n",
- "# new_state_dict = OrderedDict()\n",
- "# for k, v in state_dict.items():\n",
- "# name = k[7:] # remove `module.`\n",
- "# new_state_dict[name] = v\n",
- "# # load params\n",
- "# model[key].load_state_dict(new_state_dict, strict=False)\n",
- "# # except:\n",
- "# # _load(params[key], model[key])\n",
- "\n",
- "\n",
- "# _ = [model[key].eval() for key in model]\n",
- "\n",
- "\n",
- "# from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
- "# diffusion_sampler = DiffusionSampler(\n",
- "# model.diffusion.diffusion,\n",
- "# sampler=ADPM2Sampler(),\n",
- "# sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
- "# clamp=False\n",
- "# )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "NLTK\n",
- "SCIPY\n",
- "TORCH STUFF\n",
- "START\n",
- "177\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
- "[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...\n",
- "[nltk_data] Package punkt_tab is already up-to-date!\n",
- "/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/importable.py:136: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
- " params_whole = torch.load(\"Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth\", map_location='cpu')\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "bert loaded\n",
- "bert_encoder loaded\n",
- "predictor loaded\n",
- "decoder loaded\n",
- "text_encoder loaded\n",
- "predictor_encoder loaded\n",
- "style_encoder loaded\n",
- "diffusion loaded\n",
- "text_aligner loaded\n",
- "pitch_extractor loaded\n",
- "mpd loaded\n",
- "msd loaded\n",
- "wd loaded\n"
- ]
- }
- ],
- "source": [
- "import importable"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "kimia activated! the converted text is: お前はみずかしくカンゲデント結果として気回りスルタイプだきらな。 大体、そんなかんじでオベていけばいんだ。\n"
- ]
- }
- ],
- "source": [
- "japanese = \"Ezuki: omae wa muzukashiku kangaeruto kekka to shite karamawari suru taipu dakara na. daitai, sonna kanji de obete ikeba iin da.\"\n",
- "\n",
- "raw_jpn = japanese[japanese.find(\":\") + 2:] # factoring out the name of the speaker, since we don't need that for phonemization.\n",
- "text = smart_phonemize(raw_jpn)\n",
- "\n",
- "def is_japanese(text):\n",
- " # Check if text contains Japanese characters\n",
- " japanese_ranges = [\n",
- " (0x3040, 0x309F), # Hiragana\n",
- " (0x30A0, 0x30FF), # Katakana\n",
- " (0x4E00, 0x9FFF), # Kanji\n",
- " ]\n",
- " \n",
- " for char in text:\n",
- " char_code = ord(char)\n",
- " for start, end in japanese_ranges:\n",
- " if start <= char_code <= end:\n",
- " return True\n",
- " return False\n",
- "\n",
- "if is_japanese(raw_jpn):\n",
- " kotodama_prompt = raw_jpn\n",
- "\n",
- " \n",
- "else:\n",
- " kotodama_prompt = importable.p2g(raw_jpn)\n",
- " print('kimia activated! the converted text is: ', kotodama_prompt)\n",
- " \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "お前は難しく考えると結果として空回りするタイプだからな。大体、そんな感じで覚えて置けばいいんだ。\n",
- "['お前は難しく考えると結果として空回りするタイプだからな。 大体、そんな感じで覚えて置けばいいんだ。']\n",
- "phonemes -> omae wa mɯzɯkaɕikɯ kaŋgaerɯto keʔka to ɕite kaɽamawaɽi sɯrɯ taipɯ dakaɽa na. daitai, sonna kandʑi de oboete okeba iːɴ da.\n",
- "Synthesized: \n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "def LongformSynth_Text(text, s_prev=None, Kotodama=None, alpha=.0, beta=0, t=.8, diffusion_steps=5, embedding_scale=1, rate_of_speech=1.):\n",
- "\n",
- " japanese = text\n",
- "\n",
- " # raw_jpn = japanese[japanese.find(\":\") + 2:]\n",
- " # speaker = japanese[:japanese.find(\":\") + 2]\n",
- "\n",
- "\n",
- " if \":\" in japanese[:10]:\n",
- " raw_jpn = japanese[japanese.find(\":\") + 2:]\n",
- " speaker = japanese[:japanese.find(\":\") + 2]\n",
- " else:\n",
- " raw_jpn = japanese\n",
- " speaker = \"\"\n",
- " \n",
- " sentences = importable.sent_tokenizer.tokenize(raw_jpn)\n",
- " sentences = importable.merging_sentences(sentences)\n",
- " \n",
- " \n",
- "    # choose the Kotodama prompt: Japanese input is used as-is, romaji input is converted to Japanese via p2g\n",
- "    if is_japanese(raw_jpn):\n",
- "        kotodama_prompt = text\n",
- "    else:\n",
- "        kotodama_prompt = speaker + importable.p2g(smart_phonemize(raw_jpn))\n",
- "        print('kimia activated! the converted text is: ', kotodama_prompt)\n",
- "\n",
- "    print(kotodama_prompt)\n",
- "\n",
- "    silence = 24000 * 0.5  # 500 ms of silence between outputs for a more natural transition\n",
- "    # sentences = sent_tokenize(text)\n",
- " print(sentences)\n",
- " wavs = []\n",
- " s_prev = None\n",
- " for text in sentences:\n",
- " \n",
- " text_input = smart_phonemize(text)\n",
- " print('phonemes -> ', text_input)\n",
- "\n",
- "        Kotodama = importable.Kotodama_Sampler(importable.model, text=kotodama_prompt, device=importable.device)\n",
- "\n",
- " wav, s_prev = importable.Longform(text_input, \n",
- " s_prev, \n",
- " Kotodama, \n",
- " alpha = alpha, \n",
- " beta = beta, \n",
- " t = t, \n",
- " diffusion_steps=diffusion_steps, embedding_scale=embedding_scale, rate_of_speech=rate_of_speech)\n",
- " wavs.append(wav)\n",
- " wavs.append(np.zeros(int(silence)))\n",
- " \n",
- " print('Synthesized: ')\n",
- " return (24000, np.concatenate(wavs))\n",
- "\n",
- "sr, wav = LongformSynth_Text(\"お前は難しく考えると結果として空回りするタイプだからな。大体、そんな感じで覚えて置けばいいんだ。\")\n",
- "\n",
- "display(ipd.Audio(wav, rate=sr, normalize=False))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Wed Nov 27 01:55:55 2024 \n",
- "+-----------------------------------------------------------------------------------------+\n",
- "| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
- "|-----------------------------------------+------------------------+----------------------+\n",
- "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
- "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
- "| | | MIG M. |\n",
- "|=========================================+========================+======================|\n",
- "| 0 Tesla V100S-PCIE-32GB Off | 00000000:00:06.0 Off | 0 |\n",
- "| N/A 31C P0 37W / 250W | 31937MiB / 32768MiB | 0% Default |\n",
- "| | | N/A |\n",
- "+-----------------------------------------+------------------------+----------------------+\n",
- " \n",
- "+-----------------------------------------------------------------------------------------+\n",
- "| Processes: |\n",
- "| GPU GI CI PID Type Process name GPU Memory |\n",
- "| ID ID Usage |\n",
- "|=========================================================================================|\n",
- "| 0 N/A N/A 278684 C python 6206MiB |\n",
- "| 0 N/A N/A 404713 C ...ntu/miniconda3/envs/vllm/bin/python 20744MiB |\n",
- "| 0 N/A N/A 418844 C .../miniconda3/envs/respair/bin/python 4974MiB |\n",
- "+-----------------------------------------------------------------------------------------+\n"
- ]
- }
- ],
- "source": [
- "!nvidia-smi"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(24000,\n",
- " array([-8.75000933e-06, -2.29574198e-05, -2.80886070e-05, ...,\n",
- " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]))"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "wav"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"Switch the speaker by changing the name, then define how much impact the diffusion sampler should have.\n",
- "If the diffusion sampler works on your device, it's recommended to use it together with Kotodama; otherwise, set alpha and beta to 0.\"\"\"\n",
- "\n",
- "japanese = \"Kimiji: 人生は、果てしない探求の旅のようなもの。私たちは、自分自身や周囲の世界について、常に新しい発見をしていく。それは、時として喜びをもたらすこともあれば、困難に直面することもある。しかしそれら全てが、自分を形作る貴重な経験である。\"\n",
- "\n",
- "raw_jpn = japanese[japanese.find(\":\") + 2:] # factoring out the name of the speaker, since we don't need that for phonemization.\n",
- "text = phonemize(raw_jpn)\n",
- "\n",
- "Kotodama = Kotodama_Sampler(model, text=japanese, device=device) # provide the Japanese text, not the Phonemized version.\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = japanese\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, Kotodama, alpha=0.1, beta=0.5, diffusion_steps=10, embedding_scale=1.5, rate_of_speech=1.) \n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wav, rate=24000, normalize=False))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Explaining the Inference\n",
- "\n",
- "alpha and beta determine the impact of the diffusion sampler. \n",
- "rate_of_speech defines the pace of speech.\n",
- " \n",
- "The diffusion sampler may not work on all devices, for the reason explained in the repo; in that case, please turn it off by setting both alpha and beta to 0.\n",
- "\n",
- "The \"reconstruction head\" tries to restore the lost frequencies by upsampling to 48kHz with a diffusion net; you can turn it off if you want to improve latency or feel it doesn't change much.\n",
- "\n",
- "アルファとベータは、ディフュージョンサンプラーの影響を決定します。\n",
- "ディフュージョンサンプラーはすべての端末で動作しない可能性があり、その理由はリポジトリで説明されています。その場合は、両方を0にしてください。そうすれば、ディフュージョンサンプラーの影響が完全に0になります。\n",
- "\"リコンストラクションヘッド\"は、ディフュージョンネットを使用して48kHzまでアップサンプリングすることで、失われた周波数を復元しようとします。レイテンシーを改善したい場合や、その機能にそれほど影響がないと感じる場合は、それをオフにできます。"
- ]
- },
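- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# A minimal sketch (not part of the original notebook) of the knobs described above, assuming\n",
- "# `model`, `diffusion_sampler`, `text` and `Kotodama` are already defined by the surrounding cells.\n",
- "# alpha/beta = 0 disables the diffusion sampler entirely; rate_of_speech < 1 slows the speech down.\n",
- "for alpha, beta, rate in [(0.0, 0.0, 1.0),   # sampler off, normal pace\n",
- "                          (0.1, 0.5, 1.0),   # sampler on, normal pace\n",
- "                          (0.1, 0.5, 0.85)]: # sampler on, slightly slower speech\n",
- "    wav = inference(model, diffusion_sampler, text, Kotodama,\n",
- "                    alpha=alpha, beta=beta,\n",
- "                    diffusion_steps=10, embedding_scale=1.5, rate_of_speech=rate)\n",
- "    display(ipd.Audio(wav, rate=24000, normalize=True))"
- ]
- },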
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "RTF = 0.016579\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "\"\"\"Switch the speaker by changing the name, then define how much impact the diffusion sampler should have.\n",
- "If the diffusion sampler works on your device, it's recommended to use it together with Kotodama; otherwise, set alpha and beta to 0.\"\"\"\n",
- "\n",
- "japanese = \"Kimiji: 人生は、果てしない探求の旅のようなもの。私たちは、自分自身や周囲の世界について、常に新しい発見をしていく。それは、時として喜びをもたらすこともあれば、困難に直面することもある。しかしそれら全てが、自分を形作る貴重な経験である。\"\n",
- "\n",
- "raw_jpn = japanese[japanese.find(\":\") + 2:] # factoring out the name of the speaker, since we don't need that for phonemization.\n",
- "text = phonemize(raw_jpn)\n",
- "\n",
- "Kotodama = Kotodama_Sampler(model, text=japanese, device=device) # provide the Japanese text, not the Phonemized version.\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = japanese\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, Kotodama, alpha=0.1, beta=0.5, diffusion_steps=10, embedding_scale=1.5, rate_of_speech=1.) \n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wav, rate=24000, normalize=False))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'A male voice that resonates with deep, thunderous intensity. His rapid-fire words slam like aggressive drumbeats, each syllable charged with intense rage. The expressive tone fluctuates between restrained fury and explosive outbursts. \\n text: この俺に何度も同じことを説明させるな!! お前たちは俺の忠告を完全に無視して、とんでもない結果を招いてしまった。これが最後の警告だ。次は絶対に許さないぞ!'"
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "raw_jpn = \"\"\"この俺に何度も同じことを説明させるな!! お前たちは俺の忠告を完全に無視して、とんでもない結果を招いてしまった。これが最後の警告だ。次は絶対に許さないぞ!\"\"\"\n",
- "prompt =f\"\"\"A male voice that resonates with deep, thunderous intensity. His rapid-fire words slam like aggressive drumbeats, each syllable charged with intense rage. The expressive tone fluctuates between restrained fury and explosive outbursts. \\n text: {raw_jpn}\"\"\"\n",
- "prompt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "RTF = 0.023575\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "text = phonemize(raw_jpn)\n",
- "\n",
- "Kotodama = Kotodama_Prompter(model, text=prompt, device=device) # provide the prompt text, not the Phonemized version.\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = prompt\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, Kotodama, alpha=.5, beta=.05, diffusion_steps=10, embedding_scale=1.5, rate_of_speech=1.)\n",
- " wave = trim_long_silences(wav)\n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wave, rate=24000, normalize=True))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'A female voice that resonates with extreme intensity and a slightly high pitch. Her rapid-fire words cascade like aggressive drumbeats, each syllable charged with raw anger. The expressive tone fluctuates between controlled fury and explosive outbursts. \\n text: 私に何度も同じことを説明させないでよ!!!\\u3000お前たちは私の忠告を完全に無視して、とんでもない結果を招いてしまった! これが最後の警告だ。次は絶対に許さないわよ!'"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "raw_jpn = \"\"\"私に何度も同じことを説明させないでよ!!! お前たちは私の忠告を完全に無視して、とんでもない結果を招いてしまった! これが最後の警告だ。次は絶対に許さないわよ!\"\"\"\n",
- "prompt =f\"\"\"A female voice that resonates with extreme intensity and a slightly high pitch. Her rapid-fire words cascade like aggressive drumbeats, each syllable charged with raw anger. The expressive tone fluctuates between controlled fury and explosive outbursts. \\n text: {raw_jpn}\"\"\"\n",
- "prompt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "RTF = 0.022258\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "text = phonemize(raw_jpn) \n",
- "\n",
- "Kotodama = Kotodama_Prompter(model, text=prompt, device=device) # provide the prompt text, not the Phonemized version.\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = prompt\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, Kotodama, alpha=.1, beta=.5, diffusion_steps=10, embedding_scale=1.5, rate_of_speech=1.)\n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wav, rate=24000, normalize=True))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "RTF = 0.014008\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "text = phonemize(raw_jpn)\n",
- "\n",
- "Kotodama = Kotodama_Prompter(model, text=prompt, device=device) # provide the prompt text, not the Phonemized version.\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = prompt\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, Kotodama, alpha=.0, beta=.0, diffusion_steps=10, embedding_scale=1., rate_of_speech=1.)\n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wav, rate=24000, normalize=True))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 190,
- "metadata": {},
- "outputs": [],
- "source": [
- "sref1 = style\n",
- "sref2 = compute_style_through_clip(\"/home/ubuntu/Kanade_Project/Project_Kanade_SpeechModel/pkanade_24/Inference/reference_sample_wavs/syuukovoice_200918_3_01.wav\")\n",
- "\n",
- "\"\"\"Weighted combination; define which style should be dominant. A 50-50 mix may result in an entirely new speaker.\n",
- "ウェイト付きの組み合わせを使うことで、どのスタイルが優位になるべきかを定義できます。50対50の比率だと、新しいスピーカーが生み出される可能性があります。\"\"\"\n",
- "\n",
- "cook = (sref1 * .5 + sref2 * .5) # for emotion transfer [20 / 80] or [10 / 90] might be good."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Cooking New Styles\n",
- "\n",
- "You can mix multiple styles and weight them to create new ones.\n",
- "\n",
- "複数のスタイルを組み合わせ、重み付けすることで新しいスタイルを作り出すことができます。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sref1 = compute_style_through_clip(\"a/sample/that/is/not/too/short.wav\")\n",
- "sref2 = compute_style_through_clip(\"a/sample/that/is/not/too/short.wav\")\n",
- "\n",
- "\"\"\"Weighted combination; define which style should be dominant. A 50-50 mix may result in an entirely new speaker.\n",
- "ウェイト付きの組み合わせを使うことで、どのスタイルが優位になるべきかを定義できます。50対50の比率だと、新しいスピーカーが生み出される可能性があります。\"\"\"\n",
- "\n",
- "cook = (sref1 * .1 + sref2 * .9) # for emotion transfer [20 / 80] or [10 / 90] might be good."
- ]
- },
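- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# A hypothetical sketch (not part of the original notebook) extending the weighted mix above to any\n",
- "# number of styles. `mix_styles` is an illustrative helper, not a repo function; the weights are\n",
- "# normalized to sum to 1 so the mix stays in the same range as a single style vector.\n",
- "def mix_styles(styles, weights):\n",
- "    weights = torch.tensor(weights, dtype=torch.float32)\n",
- "    weights = weights / weights.sum()  # normalize the weights\n",
- "    return sum(w * s for w, s in zip(weights, styles))\n",
- "\n",
- "# e.g. mixed = mix_styles([sref1, sref2], [0.2, 0.8])  # 20/80 is a reasonable ratio for emotion transfer"
- ]
- },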
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "text = phonemize(raw_jpn)\n",
- "\n",
- "reference_dicts = {}\n",
- "\n",
- "reference_dicts['1789_14289w'] = prompt\n",
- "\n",
- "start = time.time()\n",
- "noise = torch.randn(1,1,256).to(device)\n",
- "for k, path in reference_dicts.items():\n",
- "\n",
- " wav = inference(model, diffusion_sampler, text, cook, alpha=.0, beta=.0, diffusion_steps=10, embedding_scale=1., rate_of_speech=1.)\n",
- " rtf = (time.time() - start) / (len(wav) / 24000)\n",
- " print(f\"RTF = {rtf:5f}\")\n",
- " display(ipd.Audio(wav, rate=24000, normalize=True))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Longform\n",
- "\n",
- "The duration of the samples generated by the regular inference function is limited, but we can tokenize each paragraph and\n",
- " do a weighted style transfer, similar to what you saw before, to keep the output consistent.\n",
- "\n",
- "There is no theoretical limit to the length of a sample produced by the Longform algorithm. Make sure your texts are not too short after tokenization. \n",
- "\n",
- "**日本語**\n",
- "\n",
- "通常のinference functionを使用して生成されたサンプルの長さに限界があります。しかし、各段落をトークン化し、以前見たようなスタイル転送を重み付けして行うことで、出力の一貫性を確保することができます。\n",
- "ロングフォームアルゴリズムで生成されるサンプルの長さには理論的な制限はありません。トークン化後のテキストが短すぎないことを確認してください。"
- ]
- },
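- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# A small sanity-check sketch (not part of the original notebook): after tokenizing and merging,\n",
- "# verify that no chunk ended up too short, as advised above. It assumes `raw_jpn` from the cells\n",
- "# above; the 10-character threshold is an illustrative guess, not a value from the repo.\n",
- "chunks = merging_sentences(sent_tokenizer.tokenize(raw_jpn))\n",
- "for i, chunk in enumerate(chunks):\n",
- "    print(i, len(chunk), chunk[:20])\n",
- "    if len(chunk) < 10:\n",
- "        print(f'  -> chunk {i} is very short; consider merging it with a neighbouring sentence')"
- ]
- },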
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['この世には、言葉にできない悲しみがある。 それは、胸の奥に沈んでいくような重さで、時間が経つにつれて、じわじわと広がっていく。', '私は、その悲しみを抱えながら、日々を過ごしている。 言葉を発するたびに、心の中で何度も繰り返し、慎重に選び抜いている。', 'それは、痛みを和らげるための儀式のようなものだ.']\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Synthesized: \n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "raw_jpn = japanese[japanese.find(\":\") + 2:]\n",
- "speaker = japanese[:japanese.find(\":\") + 2]\n",
- "\n",
- "sentences = sent_tokenizer.tokenize(raw_jpn)\n",
- "sentences = merging_sentences(sentences)\n",
- "\n",
- "silence = 24000 * 0.5 # 500 ms of silence between outputs for a more natural transition\n",
- "# sentences = sent_tokenize(text)\n",
- "print(sentences)\n",
- "wavs = []\n",
- "s_prev = None\n",
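- "# s_prev carries the previous chunk's style forward, so consecutive chunks stay stylistically consistent\n",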
- "for text in sentences:\n",
- " \n",
- " text_input = phonemize(text)\n",
- "\n",
- " Kotodama = Kotodama_Sampler(model, text=speaker + text, device=device) \n",
- "\n",
- " wav, s_prev = Longform(model, diffusion_sampler,\n",
- " text_input, \n",
- " s_prev, \n",
- " Kotodama, \n",
- " alpha = .3, \n",
- " beta =.9, \n",
- " t = .8, \n",
- " diffusion_steps=10, embedding_scale=1.5, rate_of_speech=1.)\n",
- " wavs.append(wav)\n",
- " wavs.append(np.zeros(int(silence)))\n",
- " \n",
- "print('Synthesized: ')\n",
- "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
- "# print('Reference: ')\n",
- "# display(ipd.Audio(path, rate=24000, normalize=True))"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "respair",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}