Respair committed (verified)
Commit 834bd9b · 1 Parent(s): 4e30bdb

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. stts_48khz/StyleTTS2_48khz/.gitignore +1 -0
  3. stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Demo_LJSpeech.ipynb +486 -0
  4. stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Demo_LibriTTS.ipynb +1218 -0
  5. stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Finetune_Demo.ipynb +480 -0
  6. stts_48khz/StyleTTS2_48khz/Configs/config.yml +116 -0
  7. stts_48khz/StyleTTS2_48khz/Configs/config_ft.yml +118 -0
  8. stts_48khz/StyleTTS2_48khz/Configs/config_kanade_48khz.yml +125 -0
  9. stts_48khz/StyleTTS2_48khz/Configs/config_kanade_48khz_copy.yml +124 -0
  10. stts_48khz/StyleTTS2_48khz/Demo/Inference_LJSpeech.ipynb +554 -0
  11. stts_48khz/StyleTTS2_48khz/Demo/Inference_LibriTTS.ipynb +1242 -0
  12. stts_48khz/StyleTTS2_48khz/LICENSE +21 -0
  13. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/config_kanade.yml +124 -0
  14. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/tensorboard/events.out.tfevents.1728511195.node-1.1421403.0 +3 -0
  15. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/train.log +0 -0
  16. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_165885.pth +3 -0
  17. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_65527.pth +3 -0
  18. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_last.pth +3 -0
  19. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/DP_epoch_2nd_00004.pth +3 -0
  20. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/NO_SLM_epoch_2nd_00009.pth +3 -0
  21. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/NO_SLM_epoch_2nd_00010.pth +3 -0
  22. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462180.node-1.1003680.0 +3 -0
  23. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462294.node-1.1004682.0 +3 -0
  24. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462472.node-1.1005638.0 +3 -0
  25. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462951.node-1.1007312.0 +3 -0
  26. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463094.node-1.1008219.0 +3 -0
  27. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463336.node-1.1010823.0 +3 -0
  28. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463388.node-1.1011249.0 +3 -0
  29. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463515.node-1.1013548.0 +3 -0
  30. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463957.node-1.1016238.0 +3 -0
  31. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464009.node-1.1016738.0 +3 -0
  32. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464233.node-1.1019060.0 +3 -0
  33. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464354.node-1.1019744.0 +3 -0
  34. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464586.node-1.1020751.0 +3 -0
  35. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464707.node-1.1021516.0 +3 -0
  36. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464831.node-1.1022361.0 +3 -0
  37. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464900.node-1.1022907.0 +3 -0
  38. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465001.node-1.1025007.0 +3 -0
  39. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465067.node-1.1026980.0 +3 -0
  40. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465130.node-1.1028957.0 +3 -0
  41. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465526.node-1.1031919.0 +3 -0
  42. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465719.node-1.1034258.0 +3 -0
  43. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465773.node-1.1034708.0 +3 -0
  44. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465956.node-1.1037028.0 +3 -0
  45. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466151.node-1.1039623.0 +3 -0
  46. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466402.node-1.1042081.0 +3 -0
  47. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466494.node-1.1044247.0 +3 -0
  48. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466592.node-1.1046287.0 +3 -0
  49. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466648.node-1.1048190.0 +3 -0
  50. stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466722.node-1.1050256.0 +3 -0
.gitattributes CHANGED
@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  AuxiliaryASR/Data/train_list.csv filter=lfs diff=lfs merge=lfs -text
37
  AuxiliaryASR/Data/train_list_plus.csv filter=lfs diff=lfs merge=lfs -text
38
+ stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/train.log filter=lfs diff=lfs merge=lfs -text
39
+ stts_48khz/StyleTTS2_48khz/Utils/JDC/bst_rmvpe_48k.t7 filter=lfs diff=lfs merge=lfs -text
40
+ stts_48khz/StyleTTS2_48khz/infer.ipynb filter=lfs diff=lfs merge=lfs -text
stts_48khz/StyleTTS2_48khz/.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__
stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Demo_LJSpeech.ipynb ADDED
@@ -0,0 +1,486 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LJSpeech.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {
16
+ "id": "nm653VK4CG9F"
17
+ },
18
+ "source": [
19
+ "### Install packages and download models"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {
26
+ "id": "gciBKMqCCLvT"
27
+ },
28
+ "outputs": [],
29
+ "source": [
30
+ "%%shell\n",
31
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
32
+ "cd StyleTTS2\n",
33
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
34
+ "sudo apt-get install espeak-ng\n",
35
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LJSpeech\n",
36
+ "mv StyleTTS2-LJSpeech/Models ."
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "markdown",
41
+ "metadata": {
42
+ "id": "OAA8lx-XCQnM"
43
+ },
44
+ "source": [
45
+ "### Load models"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "metadata": {
52
+ "id": "m0XRpbxSCSix"
53
+ },
54
+ "outputs": [],
55
+ "source": [
56
+ "%cd StyleTTS2\n",
57
+ "\n",
58
+ "import torch\n",
59
+ "torch.manual_seed(0)\n",
60
+ "torch.backends.cudnn.benchmark = False\n",
61
+ "torch.backends.cudnn.deterministic = True\n",
62
+ "\n",
63
+ "import random\n",
64
+ "random.seed(0)\n",
65
+ "\n",
66
+ "import numpy as np\n",
67
+ "np.random.seed(0)\n",
68
+ "\n",
69
+ "import nltk\n",
70
+ "nltk.download('punkt')\n",
71
+ "\n",
72
+ "# load packages\n",
73
+ "import time\n",
74
+ "import random\n",
75
+ "import yaml\n",
76
+ "from munch import Munch\n",
77
+ "import numpy as np\n",
78
+ "import torch\n",
79
+ "from torch import nn\n",
80
+ "import torch.nn.functional as F\n",
81
+ "import torchaudio\n",
82
+ "import librosa\n",
83
+ "from nltk.tokenize import word_tokenize\n",
84
+ "\n",
85
+ "from models import *\n",
86
+ "from utils import *\n",
87
+ "from text_utils import TextCleaner\n",
88
+ "textclenaer = TextCleaner()\n",
89
+ "\n",
90
+ "%matplotlib inline\n",
91
+ "\n",
92
+ "device = 'cpu' if torch.cuda.is_available() else 'cpu'\n",
93
+ "\n",
94
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
95
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
96
+ "mean, std = -4, 4\n",
97
+ "\n",
98
+ "def length_to_mask(lengths):\n",
99
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
100
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
101
+ " return mask\n",
102
+ "\n",
103
+ "def preprocess(wave):\n",
104
+ " wave_tensor = torch.from_numpy(wave).float()\n",
105
+ " mel_tensor = to_mel(wave_tensor)\n",
106
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
107
+ " return mel_tensor\n",
108
+ "\n",
109
+ "def compute_style(ref_dicts):\n",
110
+ " reference_embeddings = {}\n",
111
+ " for key, path in ref_dicts.items():\n",
112
+ " wave, sr = librosa.load(path, sr=24000)\n",
113
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
114
+ " if sr != 24000:\n",
115
+ " audio = librosa.resample(audio, sr, 24000)\n",
116
+ " mel_tensor = preprocess(audio).to(device)\n",
117
+ "\n",
118
+ " with torch.no_grad():\n",
119
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
120
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
121
+ "\n",
122
+ " return reference_embeddings\n",
123
+ "\n",
124
+ "# load phonemizer\n",
125
+ "import phonemizer\n",
126
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True, words_mismatch='ignore')\n",
127
+ "\n",
128
+ "config = yaml.safe_load(open(\"Models/Kaede-san/config.yml\"))\n",
129
+ "\n",
130
+ "# load pretrained ASR model\n",
131
+ "ASR_config = config.get('ASR_config', False)\n",
132
+ "ASR_path = config.get('ASR_path', False)\n",
133
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
134
+ "\n",
135
+ "# load pretrained F0 model\n",
136
+ "F0_path = config.get('F0_path', False)\n",
137
+ "pitch_extractor = load_F0_models(F0_path)\n",
138
+ "\n",
139
+ "# load BERT model\n",
140
+ "from Utils.PLBERT.util import load_plbert\n",
141
+ "BERT_path = config.get('PLBERT_dir', False)\n",
142
+ "plbert = load_plbert(BERT_path)\n",
143
+ "\n",
144
+ "model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)\n",
145
+ "_ = [model[key].eval() for key in model]\n",
146
+ "_ = [model[key].to(device) for key in model]\n",
147
+ "\n",
148
+ "params_whole = torch.load(\"Models/LJSpeech/epoch_2nd_00100.pth\", map_location='cpu')\n",
149
+ "params = params_whole['net']\n",
150
+ "\n",
151
+ "for key in model:\n",
152
+ " if key in params:\n",
153
+ " print('%s loaded' % key)\n",
154
+ " try:\n",
155
+ " model[key].load_state_dict(params[key])\n",
156
+ " except:\n",
157
+ " from collections import OrderedDict\n",
158
+ " state_dict = params[key]\n",
159
+ " new_state_dict = OrderedDict()\n",
160
+ " for k, v in state_dict.items():\n",
161
+ " name = k[7:] # remove `module.`\n",
162
+ " new_state_dict[name] = v\n",
163
+ " # load params\n",
164
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
165
+ "# except:\n",
166
+ "# _load(params[key], model[key])\n",
167
+ "_ = [model[key].eval() for key in model]\n",
168
+ "\n",
169
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
170
+ "\n",
171
+ "sampler = DiffusionSampler(\n",
172
+ " model.diffusion.diffusion,\n",
173
+ " sampler=ADPM2Sampler(),\n",
174
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
175
+ " clamp=False\n",
176
+ ")\n",
177
+ "\n",
178
+ "def inference(text, noise, diffusion_steps=5, embedding_scale=1):\n",
179
+ " text = text.strip()\n",
180
+ " text = text.replace('\"', '')\n",
181
+ " ps = global_phonemizer.phonemize([text])\n",
182
+ " ps = word_tokenize(ps[0])\n",
183
+ " ps = ' '.join(ps)\n",
184
+ "\n",
185
+ " tokens = textclenaer(ps)\n",
186
+ " tokens.insert(0, 0)\n",
187
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
188
+ "\n",
189
+ " with torch.no_grad():\n",
190
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
191
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
192
+ "\n",
193
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
194
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
195
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
196
+ "\n",
197
+ " s_pred = sampler(noise,\n",
198
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
199
+ " embedding_scale=embedding_scale).squeeze(0)\n",
200
+ "\n",
201
+ " s = s_pred[:, 128:]\n",
202
+ " ref = s_pred[:, :128]\n",
203
+ "\n",
204
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
205
+ "\n",
206
+ " x, _ = model.predictor.lstm(d)\n",
207
+ " duration = model.predictor.duration_proj(x)\n",
208
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
209
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
210
+ "\n",
211
+ " pred_dur[-1] += 5\n",
212
+ "\n",
213
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
214
+ " c_frame = 0\n",
215
+ " for i in range(pred_aln_trg.size(0)):\n",
216
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
217
+ " c_frame += int(pred_dur[i].data)\n",
218
+ "\n",
219
+ " # encode prosody\n",
220
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
221
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
222
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),\n",
223
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
224
+ "\n",
225
+ " return out.squeeze().cpu().numpy()\n",
226
+ "\n",
227
+ "def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):\n",
228
+ " text = text.strip()\n",
229
+ " text = text.replace('\"', '')\n",
230
+ " ps = global_phonemizer.phonemize([text])\n",
231
+ " ps = word_tokenize(ps[0])\n",
232
+ " ps = ' '.join(ps)\n",
233
+ "\n",
234
+ " tokens = textclenaer(\"so↑nna to↓kini, jo↓kaʔta jo↑otoka, ge↓ŋkiga mo↑ɾaeɾʔʔte i↑ʔte mo↑ɾaeɾɯto, so↑ɾedakede ta↑içeɴ↓sanante ɸɯ↑kito↓ndʑa i↑ma↓sɯ. a↑idoɾɯo ja↓ʔte i↑tejo↓kaʔtaʔte o↑moe↓ɾɯ n↓desɯjo. ko↑no ka↑itoo, sɯ↑ko↓ɕi zɯ↑ɾɯ↓ikamo ɕi↑ɾemase↓ŋkedo, wa↑taɕino ço↑ntoono ki↑motɕide↓sɯ.\")\n",
235
+ " tokens.insert(0, 0)\n",
236
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
237
+ "\n",
238
+ " with torch.no_grad():\n",
239
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
240
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
241
+ "\n",
242
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
243
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
244
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
245
+ "\n",
246
+ " s_pred = sampler(noise,\n",
247
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
248
+ " embedding_scale=embedding_scale).squeeze(0)\n",
249
+ "\n",
250
+ " if s_prev is not None:\n",
251
+ " # convex combination of previous and current style\n",
252
+ " s_pred = alpha * s_prev + (1 - alpha) * s_pred\n",
253
+ "\n",
254
+ " s = s_pred[:, 128:]\n",
255
+ " ref = s_pred[:, :128]\n",
256
+ "\n",
257
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
258
+ "\n",
259
+ " x, _ = model.predictor.lstm(d)\n",
260
+ " duration = model.predictor.duration_proj(x)\n",
261
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
262
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
263
+ "\n",
264
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
265
+ " c_frame = 0\n",
266
+ " for i in range(pred_aln_trg.size(0)):\n",
267
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
268
+ " c_frame += int(pred_dur[i].data)\n",
269
+ "\n",
270
+ " # encode prosody\n",
271
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
272
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
273
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),\n",
274
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
275
+ "\n",
276
+ " return out.squeeze().cpu().numpy(), s_pred"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "metadata": {
282
+ "id": "vuCbS0gdArgJ"
283
+ },
284
+ "source": [
285
+ "### Synthesize speech"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 3,
291
+ "metadata": {
292
+ "id": "7Ud1Y-kbBPTw"
293
+ },
294
+ "outputs": [],
295
+ "source": [
296
+ "# @title Input Text { display-mode: \"form\" }\n",
297
+ "# synthesize a text\n",
298
+ "text = \"StyleTTS 2 is a text-to-speech model that leverages style diffusion and adversarial training with large speech language models to achieve human-level text-to-speech synthesis.\" # @param {type:\"string\"}\n"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "metadata": {
304
+ "id": "TM2NjuM7B6sz"
305
+ },
306
+ "source": [
307
+ "#### Basic synthesis (5 diffusion steps)"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {
314
+ "id": "KILqC-V-Ay5e"
315
+ },
316
+ "outputs": [],
317
+ "source": [
318
+ "start = time.time()\n",
319
+ "noise = torch.randn(1,1,256).to(device)\n",
320
+ "wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)\n",
321
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
322
+ "print(f\"RTF = {rtf:5f}\")\n",
323
+ "import IPython.display as ipd\n",
324
+ "display(ipd.Audio(wav, rate=24000))"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "markdown",
329
+ "metadata": {
330
+ "id": "oZk9o-EzCBVx"
331
+ },
332
+ "source": [
333
+ "#### With higher diffusion steps (more diverse)\n",
334
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "execution_count": null,
340
+ "metadata": {
341
+ "id": "9_OHtzMbB9gL"
342
+ },
343
+ "outputs": [],
344
+ "source": [
345
+ "start = time.time()\n",
346
+ "noise = torch.randn(1,1,256).to(device)\n",
347
+ "wav = inference(text, noise, diffusion_steps=10, embedding_scale=1)\n",
348
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
349
+ "print(f\"RTF = {rtf:5f}\")\n",
350
+ "import IPython.display as ipd\n",
351
+ "display(ipd.Audio(wav, rate=24000))"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "markdown",
356
+ "metadata": {
357
+ "id": "NyDACd-0CaqL"
358
+ },
359
+ "source": [
360
+ "### Speech expressiveness\n",
361
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page."
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "markdown",
366
+ "metadata": {
367
+ "id": "cRkS5VWxCck4"
368
+ },
369
+ "source": [
370
+ "#### With embedding_scale=1\n",
371
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional."
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": null,
377
+ "metadata": {
378
+ "id": "H5g5RO-mCbZB"
379
+ },
380
+ "outputs": [],
381
+ "source": [
382
+ "texts = {}\n",
383
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
384
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
385
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
386
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
387
+ "\n",
388
+ "for k,v in texts.items():\n",
389
+ " noise = torch.randn(1,1,256).to(device)\n",
390
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=1)\n",
391
+ " print(k + \": \")\n",
392
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "markdown",
397
+ "metadata": {
398
+ "id": "f4S8TXSpCgpA"
399
+ },
400
+ "source": [
401
+ "#### With embedding_scale=2"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "code",
406
+ "execution_count": null,
407
+ "metadata": {
408
+ "id": "xHHIdeNrCezC"
409
+ },
410
+ "outputs": [],
411
+ "source": [
412
+ "texts = {}\n",
413
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
414
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
415
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
416
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
417
+ "\n",
418
+ "for k,v in texts.items():\n",
419
+ " noise = torch.randn(1,1,256).to(device)\n",
420
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=2) # embedding_scale=2 for more pronounced emotion\n",
421
+ " print(k + \": \")\n",
422
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {
428
+ "id": "nAh7Tov4CkuH"
429
+ },
430
+ "source": [
431
+ "### Long-form generation\n",
432
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 8,
438
+ "metadata": {
439
+ "cellView": "form",
440
+ "id": "IJwUbgvACoDu"
441
+ },
442
+ "outputs": [],
443
+ "source": [
444
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first-class homemade products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:\"string\"}"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": null,
450
+ "metadata": {
451
+ "id": "nP-7i2QAC0JT"
452
+ },
453
+ "outputs": [],
454
+ "source": [
455
+ "sentences = passage.split('.') # simple split by comma\n",
456
+ "wavs = []\n",
457
+ "s_prev = None\n",
458
+ "for text in sentences:\n",
459
+ " if text.strip() == \"\": continue\n",
460
+ " text += '.' # add it back\n",
461
+ " noise = torch.randn(1,1,256).to(device)\n",
462
+ " wav, s_prev = LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=10, embedding_scale=1.5)\n",
463
+ " wavs.append(wav)\n",
464
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))"
465
+ ]
466
+ }
467
+ ],
468
+ "metadata": {
469
+ "accelerator": "GPU",
470
+ "colab": {
471
+ "authorship_tag": "ABX9TyM1x2mx2VnkYNFVlD+DFzmy",
472
+ "gpuType": "T4",
473
+ "include_colab_link": true,
474
+ "provenance": []
475
+ },
476
+ "kernelspec": {
477
+ "display_name": "Python 3",
478
+ "name": "python3"
479
+ },
480
+ "language_info": {
481
+ "name": "python"
482
+ }
483
+ },
484
+ "nbformat": 4,
485
+ "nbformat_minor": 0
486
+ }
stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Demo_LibriTTS.ipynb ADDED
@@ -0,0 +1,1218 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "view-in-github",
7
+ "colab_type": "text"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LibriTTS.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {
16
+ "id": "aAGQPfgYIR23"
17
+ },
18
+ "source": [
19
+ "### Install packages and download models"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {
26
+ "colab": {
27
+ "base_uri": "https://localhost:8080/"
28
+ },
29
+ "id": "zDPW5uSpISd2",
30
+ "outputId": "6463ff79-18d5-4071-c6ad-01947beeb368"
31
+ },
32
+ "outputs": [
33
+ {
34
+ "output_type": "stream",
35
+ "name": "stdout",
36
+ "text": [
37
+
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "%%shell\n",
43
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
44
+ "cd StyleTTS2\n",
45
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
46
+ "sudo apt-get install espeak-ng\n",
47
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS\n",
48
+ "mv StyleTTS2-LibriTTS/Models .\n",
49
+ "mv StyleTTS2-LibriTTS/reference_audio.zip .\n",
50
+ "unzip reference_audio.zip\n",
51
+ "mv reference_audio Demo/reference_audio"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "markdown",
56
+ "metadata": {
57
+ "id": "eJdB_nCOIVIN"
58
+ },
59
+ "source": [
60
+ "### Load models"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {
67
+ "id": "cha8Tr2uJwN0"
68
+ },
69
+ "outputs": [],
70
+ "source": [
71
+ "import nltk\n",
72
+ "nltk.download('punkt')"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {
79
+ "id": "Qoow8Wd8ITtm"
80
+ },
81
+ "outputs": [],
82
+ "source": [
83
+ "%cd StyleTTS2\n",
84
+ "\n",
85
+ "import torch\n",
86
+ "torch.manual_seed(0)\n",
87
+ "torch.backends.cudnn.benchmark = False\n",
88
+ "torch.backends.cudnn.deterministic = True\n",
89
+ "\n",
90
+ "import random\n",
91
+ "random.seed(0)\n",
92
+ "\n",
93
+ "import numpy as np\n",
94
+ "np.random.seed(0)\n",
95
+ "\n",
96
+ "# load packages\n",
97
+ "import time\n",
98
+ "import random\n",
99
+ "import yaml\n",
100
+ "from munch import Munch\n",
101
+ "import numpy as np\n",
102
+ "import torch\n",
103
+ "from torch import nn\n",
104
+ "import torch.nn.functional as F\n",
105
+ "import torchaudio\n",
106
+ "import librosa\n",
107
+ "from nltk.tokenize import word_tokenize\n",
108
+ "\n",
109
+ "from models import *\n",
110
+ "from utils import *\n",
111
+ "from text_utils import TextCleaner\n",
112
+ "textclenaer = TextCleaner()\n",
113
+ "\n",
114
+ "%matplotlib inline\n",
115
+ "\n",
116
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
117
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
118
+ "mean, std = -4, 4\n",
119
+ "\n",
120
+ "def length_to_mask(lengths):\n",
121
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
122
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
123
+ " return mask\n",
124
+ "\n",
125
+ "def preprocess(wave):\n",
126
+ " wave_tensor = torch.from_numpy(wave).float()\n",
127
+ " mel_tensor = to_mel(wave_tensor)\n",
128
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
129
+ " return mel_tensor\n",
130
+ "\n",
131
+ "def compute_style(path):\n",
132
+ " wave, sr = librosa.load(path, sr=24000)\n",
133
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
134
+ " if sr != 24000:\n",
135
+ " audio = librosa.resample(audio, sr, 24000)\n",
136
+ " mel_tensor = preprocess(audio).to(device)\n",
137
+ "\n",
138
+ " with torch.no_grad():\n",
139
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
140
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
141
+ "\n",
142
+ " return torch.cat([ref_s, ref_p], dim=1)\n",
143
+ "\n",
144
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
145
+ "\n",
146
+ "# load phonemizer\n",
147
+ "import phonemizer\n",
148
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n",
149
+ "\n",
150
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
151
+ "\n",
152
+ "# load pretrained ASR model\n",
153
+ "ASR_config = config.get('ASR_config', False)\n",
154
+ "ASR_path = config.get('ASR_path', False)\n",
155
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
156
+ "\n",
157
+ "# load pretrained F0 model\n",
158
+ "F0_path = config.get('F0_path', False)\n",
159
+ "pitch_extractor = load_F0_models(F0_path)\n",
160
+ "\n",
161
+ "# load BERT model\n",
162
+ "from Utils.PLBERT.util import load_plbert\n",
163
+ "BERT_path = config.get('PLBERT_dir', False)\n",
164
+ "plbert = load_plbert(BERT_path)\n",
165
+ "\n",
166
+ "model_params = recursive_munch(config['model_params'])\n",
167
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
168
+ "_ = [model[key].eval() for key in model]\n",
169
+ "_ = [model[key].to(device) for key in model]\n",
170
+ "\n",
171
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
172
+ "params = params_whole['net']\n",
173
+ "\n",
174
+ "for key in model:\n",
175
+ " if key in params:\n",
176
+ " print('%s loaded' % key)\n",
177
+ " try:\n",
178
+ " model[key].load_state_dict(params[key])\n",
179
+ " except:\n",
180
+ " from collections import OrderedDict\n",
181
+ " state_dict = params[key]\n",
182
+ " new_state_dict = OrderedDict()\n",
183
+ " for k, v in state_dict.items():\n",
184
+ " name = k[7:] # remove `module.`\n",
185
+ " new_state_dict[name] = v\n",
186
+ " # load params\n",
187
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
188
+ "# except:\n",
189
+ "# _load(params[key], model[key])\n",
190
+ "_ = [model[key].eval() for key in model]\n",
191
+ "\n",
192
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
193
+ "\n",
194
+ "sampler = DiffusionSampler(\n",
195
+ " model.diffusion.diffusion,\n",
196
+ " sampler=ADPM2Sampler(),\n",
197
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
198
+ " clamp=False\n",
199
+ ")\n",
200
+ "\n",
201
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
202
+ " text = text.strip()\n",
203
+ " ps = global_phonemizer.phonemize([text])\n",
204
+ " ps = word_tokenize(ps[0])\n",
205
+ " ps = ' '.join(ps)\n",
206
+ " tokens = textclenaer(ps)\n",
207
+ " tokens.insert(0, 0)\n",
208
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
209
+ "\n",
210
+ " with torch.no_grad():\n",
211
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
212
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
213
+ "\n",
214
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
215
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
216
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
217
+ "\n",
218
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
219
+ " embedding=bert_dur,\n",
220
+ " embedding_scale=embedding_scale,\n",
221
+ " features=ref_s, # reference from the same speaker as the embedding\n",
222
+ " num_steps=diffusion_steps).squeeze(1)\n",
223
+ "\n",
224
+ "\n",
225
+ " s = s_pred[:, 128:]\n",
226
+ " ref = s_pred[:, :128]\n",
227
+ "\n",
228
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
229
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
230
+ "\n",
231
+ " d = model.predictor.text_encoder(d_en,\n",
232
+ " s, input_lengths, text_mask)\n",
233
+ "\n",
234
+ " x, _ = model.predictor.lstm(d)\n",
235
+ " duration = model.predictor.duration_proj(x)\n",
236
+ "\n",
237
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
238
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
239
+ "\n",
240
+ "\n",
241
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
242
+ " c_frame = 0\n",
243
+ " for i in range(pred_aln_trg.size(0)):\n",
244
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
245
+ " c_frame += int(pred_dur[i].data)\n",
246
+ "\n",
247
+ " # encode prosody\n",
248
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
249
+ " if model_params.decoder.type == \"hifigan\":\n",
250
+ " asr_new = torch.zeros_like(en)\n",
251
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
252
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
253
+ " en = asr_new\n",
254
+ "\n",
255
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
256
+ "\n",
257
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
258
+ " if model_params.decoder.type == \"hifigan\":\n",
259
+ " asr_new = torch.zeros_like(asr)\n",
260
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
261
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
262
+ " asr = asr_new\n",
263
+ "\n",
264
+ " out = model.decoder(asr,\n",
265
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
266
+ "\n",
267
+ "\n",
268
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n",
269
+ "\n",
270
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
271
+ " text = text.strip()\n",
272
+ " ps = global_phonemizer.phonemize([text])\n",
273
+ " ps = word_tokenize(ps[0])\n",
274
+ " ps = ' '.join(ps)\n",
275
+ " ps = ps.replace('``', '\"')\n",
276
+ " ps = ps.replace(\"''\", '\"')\n",
277
+ "\n",
278
+ " tokens = textclenaer(ps)\n",
279
+ " tokens.insert(0, 0)\n",
280
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
281
+ "\n",
282
+ " with torch.no_grad():\n",
283
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
284
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
285
+ "\n",
286
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
287
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
288
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
289
+ "\n",
290
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
291
+ " embedding=bert_dur,\n",
292
+ " embedding_scale=embedding_scale,\n",
293
+ " features=ref_s, # reference from the same speaker as the embedding\n",
294
+ " num_steps=diffusion_steps).squeeze(1)\n",
295
+ "\n",
296
+ " if s_prev is not None:\n",
297
+ " # convex combination of previous and current style\n",
298
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
299
+ "\n",
300
+ " s = s_pred[:, 128:]\n",
301
+ " ref = s_pred[:, :128]\n",
302
+ "\n",
303
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
304
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
305
+ "\n",
306
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
307
+ "\n",
308
+ " d = model.predictor.text_encoder(d_en,\n",
309
+ " s, input_lengths, text_mask)\n",
310
+ "\n",
311
+ " x, _ = model.predictor.lstm(d)\n",
312
+ " duration = model.predictor.duration_proj(x)\n",
313
+ "\n",
314
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
315
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
316
+ "\n",
317
+ "\n",
318
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
319
+ " c_frame = 0\n",
320
+ " for i in range(pred_aln_trg.size(0)):\n",
321
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
322
+ " c_frame += int(pred_dur[i].data)\n",
323
+ "\n",
324
+ " # encode prosody\n",
325
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
326
+ " if model_params.decoder.type == \"hifigan\":\n",
327
+ " asr_new = torch.zeros_like(en)\n",
328
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
329
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
330
+ " en = asr_new\n",
331
+ "\n",
332
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
333
+ "\n",
334
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
335
+ " if model_params.decoder.type == \"hifigan\":\n",
336
+ " asr_new = torch.zeros_like(asr)\n",
337
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
338
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
339
+ " asr = asr_new\n",
340
+ "\n",
341
+ " out = model.decoder(asr,\n",
342
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
343
+ "\n",
344
+ "\n",
345
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later\n",
346
+ "\n",
347
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
348
+ " text = text.strip()\n",
349
+ " ps = global_phonemizer.phonemize([text])\n",
350
+ " ps = word_tokenize(ps[0])\n",
351
+ " ps = ' '.join(ps)\n",
352
+ "\n",
353
+ " tokens = textclenaer(ps)\n",
354
+ " tokens.insert(0, 0)\n",
355
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
356
+ "\n",
357
+ " ref_text = ref_text.strip()\n",
358
+ " ps = global_phonemizer.phonemize([ref_text])\n",
359
+ " ps = word_tokenize(ps[0])\n",
360
+ " ps = ' '.join(ps)\n",
361
+ "\n",
362
+ " ref_tokens = textclenaer(ps)\n",
363
+ " ref_tokens.insert(0, 0)\n",
364
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
365
+ "\n",
366
+ "\n",
367
+ " with torch.no_grad():\n",
368
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
369
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
370
+ "\n",
371
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
372
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
373
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
374
+ "\n",
375
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
376
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
377
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
378
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
379
+ " embedding=bert_dur,\n",
380
+ " embedding_scale=embedding_scale,\n",
381
+ " features=ref_s, # reference from the same speaker as the embedding\n",
382
+ " num_steps=diffusion_steps).squeeze(1)\n",
383
+ "\n",
384
+ "\n",
385
+ " s = s_pred[:, 128:]\n",
386
+ " ref = s_pred[:, :128]\n",
387
+ "\n",
388
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
389
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
390
+ "\n",
391
+ " d = model.predictor.text_encoder(d_en,\n",
392
+ " s, input_lengths, text_mask)\n",
393
+ "\n",
394
+ " x, _ = model.predictor.lstm(d)\n",
395
+ " duration = model.predictor.duration_proj(x)\n",
396
+ "\n",
397
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
398
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
399
+ "\n",
400
+ "\n",
401
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
402
+ " c_frame = 0\n",
403
+ " for i in range(pred_aln_trg.size(0)):\n",
404
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
405
+ " c_frame += int(pred_dur[i].data)\n",
406
+ "\n",
407
+ " # encode prosody\n",
408
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
409
+ " if model_params.decoder.type == \"hifigan\":\n",
410
+ " asr_new = torch.zeros_like(en)\n",
411
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
412
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
413
+ " en = asr_new\n",
414
+ "\n",
415
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
416
+ "\n",
417
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
418
+ " if model_params.decoder.type == \"hifigan\":\n",
419
+ " asr_new = torch.zeros_like(asr)\n",
420
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
421
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
422
+ " asr = asr_new\n",
423
+ "\n",
424
+ " out = model.decoder(asr,\n",
425
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
426
+ "\n",
427
+ "\n",
428
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later\n"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "markdown",
433
+ "metadata": {
434
+ "id": "32S6U0LyJbCA"
435
+ },
436
+ "source": [
437
+ "### Synthesize speech"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "metadata": {
443
+ "id": "ehK_0daMJdk_"
444
+ },
445
+ "source": [
446
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "metadata": {
453
+ "id": "SJs2x41MJhM-"
454
+ },
455
+ "outputs": [],
456
+ "source": [
457
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": null,
463
+ "metadata": {
464
+ "id": "xuqIJe-IJb7A"
465
+ },
466
+ "outputs": [],
467
+ "source": [
468
+ "reference_dicts = {}\n",
469
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
470
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "metadata": {
477
+ "id": "H3ra3IxJJmF0"
478
+ },
479
+ "outputs": [],
480
+ "source": [
481
+ "noise = torch.randn(1,1,256).to(device)\n",
482
+ "for k, path in reference_dicts.items():\n",
483
+ " ref_s = compute_style(path)\n",
484
+ " start = time.time()\n",
485
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
486
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
487
+ " print(f\"RTF = {rtf:5f}\")\n",
488
+ " import IPython.display as ipd\n",
489
+ " print(k + ' Synthesized:')\n",
490
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
491
+ " print('Reference:')\n",
492
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "markdown",
497
+ "metadata": {
498
+ "id": "aB3wUz6yJ-P_"
499
+ },
500
+ "source": [
501
+ "#### With higher diffusion steps (more diverse)\n",
502
+ "\n",
503
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": null,
509
+ "metadata": {
510
+ "id": "lF27XUo4JrKk"
511
+ },
512
+ "outputs": [],
513
+ "source": [
514
+ "noise = torch.randn(1,1,256).to(device)\n",
515
+ "for k, path in reference_dicts.items():\n",
516
+ " ref_s = compute_style(path)\n",
517
+ " start = time.time()\n",
518
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
519
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
520
+ " print(f\"RTF = {rtf:5f}\")\n",
521
+ " import IPython.display as ipd\n",
522
+ " print(k + ' Synthesized:')\n",
523
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
524
+ " print(k + ' Reference:')\n",
525
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "metadata": {
531
+ "id": "pFT_vmJcKDs1"
532
+ },
533
+ "source": [
534
+ "#### Basic synthesis (5 diffusion steps, unseen speakers)\n",
535
+ "The following samples are to reproduce samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All spsakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "metadata": {
542
+ "id": "HvNAeGPEKAWN"
543
+ },
544
+ "outputs": [],
545
+ "source": [
546
+ "reference_dicts = {}\n",
547
+ "# format: (path, text)\n",
548
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
549
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
550
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
551
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "execution_count": null,
557
+ "metadata": {
558
+ "id": "mFnyvYp5KAYN"
559
+ },
560
+ "outputs": [],
561
+ "source": [
562
+ "noise = torch.randn(1,1,256).to(device)\n",
563
+ "for k, v in reference_dicts.items():\n",
564
+ " path, text = v\n",
565
+ " ref_s = compute_style(path)\n",
566
+ " start = time.time()\n",
567
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
568
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
569
+ " print(f\"RTF = {rtf:5f}\")\n",
570
+ " import IPython.display as ipd\n",
571
+ " print(k + ' Synthesized: ' + text)\n",
572
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
573
+ " print(k + ' Reference:')\n",
574
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "markdown",
579
+ "metadata": {
580
+ "id": "QBZ53BQtKNQ6"
581
+ },
582
+ "source": [
583
+ "### Speech expressiveness\n",
584
+ "\n",
585
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training.\n",
586
+ "\n",
587
+ "#### With `embedding_scale=1`\n",
588
+ "This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional."
589
+ ]
590
+ },
591
+ {
592
+ "cell_type": "code",
593
+ "execution_count": null,
594
+ "metadata": {
595
+ "id": "5FwE9CefKQk6"
596
+ },
597
+ "outputs": [],
598
+ "source": [
599
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": null,
605
+ "metadata": {
606
+ "id": "0CKMI0ZsKUDh"
607
+ },
608
+ "outputs": [],
609
+ "source": [
610
+ "texts = {}\n",
611
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
612
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
613
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
614
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
615
+ "\n",
616
+ "for k,v in texts.items():\n",
617
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
618
+ " print(k + \": \")\n",
619
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "markdown",
624
+ "metadata": {
625
+ "id": "reemQKVEKWAZ"
626
+ },
627
+ "source": [
628
+ "#### With `embedding_scale=2`"
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": null,
634
+ "metadata": {
635
+ "id": "npIAiAUvKYGv"
636
+ },
637
+ "outputs": [],
638
+ "source": [
639
+ "texts = {}\n",
640
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
641
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
642
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
643
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
644
+ "\n",
645
+ "for k,v in texts.items():\n",
646
+ " noise = torch.randn(1,1,256).to(device)\n",
647
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
648
+ " print(k + \": \")\n",
649
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {
655
+ "id": "lqKZaXeYKbrH"
656
+ },
657
+ "source": [
658
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
659
+ "`alpha` and `beta` is the factor to determine much we use the style sampled based on the text instead of the reference. The higher the value of `alpha` and `beta`, the more suitable the style it is to the text but less similar to the reference. Using higher beta makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker while `beta` determines the prosody."
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": null,
665
+ "metadata": {
666
+ "id": "VjXuRCCWKcdN"
667
+ },
668
+ "outputs": [],
669
+ "source": [
670
+ "texts = {}\n",
671
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
672
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
673
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
674
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
675
+ "\n",
676
+ "for k,v in texts.items():\n",
677
+ " noise = torch.randn(1,1,256).to(device)\n",
678
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
679
+ " print(k + \": \")\n",
680
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "markdown",
685
+ "metadata": {
686
+ "id": "xrwYXGh0KiIW"
687
+ },
688
+ "source": [
689
+ "### Zero-shot speaker adaptation\n",
690
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demo in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, which is about 250 times fewer data compared to those used to trian Vall-E with similar or better effect for these maintainance."
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "markdown",
695
+ "metadata": {
696
+ "id": "ETUywHHmKimE"
697
+ },
698
+ "source": [
699
+ "#### Acoustic Environment Maintenance\n",
700
+ "\n",
701
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to make the speaker as close to the reference as possible while only changing the prosody according to the text. "
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": null,
707
+ "metadata": {
708
+ "id": "yvjBK3syKnZL"
709
+ },
710
+ "outputs": [],
711
+ "source": [
712
+ "reference_dicts = {}\n",
713
+ "# format: (path, text)\n",
714
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
715
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
716
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": null,
722
+ "metadata": {
723
+ "id": "jclowWp4KomJ"
724
+ },
725
+ "outputs": [],
726
+ "source": [
727
+ "noise = torch.randn(1,1,256).to(device)\n",
728
+ "for k, v in reference_dicts.items():\n",
729
+ " path, text = v\n",
730
+ " ref_s = compute_style(path)\n",
731
+ " start = time.time()\n",
732
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
733
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
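+ " # real-time factor: values below 1 mean synthesis runs faster than real time\n",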
734
+ " print(f\"RTF = {rtf:5f}\")\n",
735
+ " import IPython.display as ipd\n",
736
+ " print('Synthesized: ' + text)\n",
737
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
738
+ " print('Reference:')\n",
739
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
740
+ ]
741
+ },
742
+ {
743
+ "cell_type": "markdown",
744
+ "metadata": {
745
+ "id": "LgIm7M93KqVZ"
746
+ },
747
+ "source": [
748
+ "#### Speaker’s Emotion Maintenance\n",
749
+ "\n",
750
+ "Since we want to maintain the emotion of the speaker (prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change."
751
+ ]
752
+ },
753
+ {
754
+ "cell_type": "code",
755
+ "execution_count": null,
756
+ "metadata": {
757
+ "id": "yzsNoP6oKulL"
758
+ },
759
+ "outputs": [],
760
+ "source": [
761
+ "reference_dicts = {}\n",
762
+ "# format: (path, text)\n",
763
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
764
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
765
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
766
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
767
+ ]
768
+ },
769
+ {
770
+ "cell_type": "code",
771
+ "execution_count": null,
772
+ "metadata": {
773
+ "id": "7h2-9cpfKwr4"
774
+ },
775
+ "outputs": [],
776
+ "source": [
777
+ "noise = torch.randn(1,1,256).to(device)\n",
778
+ "for k, v in reference_dicts.items():\n",
779
+ " path, text = v\n",
780
+ " ref_s = compute_style(path)\n",
781
+ " start = time.time()\n",
782
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
783
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
784
+ " print(f\"RTF = {rtf:5f}\")\n",
785
+ " import IPython.display as ipd\n",
786
+ " print(k + ' Synthesized: ' + text)\n",
787
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
788
+ " print(k + ' Reference:')\n",
789
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "markdown",
794
+ "metadata": {
795
+ "id": "aNS82PGwKzgg"
796
+ },
797
+ "source": [
798
+ "### Longform Narration\n",
799
+ "\n",
800
+ "This section includes a basic implementation of Algorithm 1 in the paper for consistent long-form audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
801
+ ]
802
+ },
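+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `t` argument passed to `LFinference` below controls how much of the previous sentence's style is carried over to the current one. Assuming the helper follows Algorithm 1 and blends the two styles linearly, the idea is roughly:\n",
+ "```python\n",
+ "# illustrative sketch, not the exact implementation\n",
+ "if s_prev is not None:\n",
+ "    s_pred = t * s_prev + (1 - t) * s_pred  # t = 0.7 keeps 70% of the previous style\n",
+ "```\n",
+ "Higher `t` gives smoother, more consistent narration across sentences; lower `t` lets each sentence follow its own text more freely."
+ ]
+ },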
803
+ {
804
+ "cell_type": "code",
805
+ "execution_count": null,
806
+ "metadata": {
807
+ "cellView": "form",
808
+ "id": "qs97nL5HK5DH"
809
+ },
810
+ "outputs": [],
811
+ "source": [
812
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:\"string\"}"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "metadata": {
819
+ "colab": {
820
+ "background_save": true
821
+ },
822
+ "id": "8Mu9whHYK_1b"
823
+ },
824
+ "outputs": [],
825
+ "source": [
826
+ "# seen speaker\n",
827
+ "path = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
828
+ "s_ref = compute_style(path)\n",
829
+ "sentences = passage.split('.') # simple split by period\n",
830
+ "wavs = []\n",
831
+ "s_prev = None\n",
832
+ "for text in sentences:\n",
833
+ " if text.strip() == \"\": continue\n",
834
+ " text += '.' # add it back\n",
835
+ "\n",
836
+ " wav, s_prev = LFinference(text,\n",
837
+ " s_prev,\n",
838
+ " s_ref,\n",
839
+ " alpha = 0.3,\n",
840
+ " beta = 0.9, # make it more suitable for the text\n",
841
+ " t = 0.7,\n",
842
+ " diffusion_steps=10, embedding_scale=1.5)\n",
843
+ " wavs.append(wav)\n",
844
+ "print('Synthesized: ')\n",
845
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
846
+ "print('Reference: ')\n",
847
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
848
+ ]
849
+ },
850
+ {
851
+ "cell_type": "markdown",
852
+ "metadata": {
853
+ "id": "81Rh-lgWLB2i"
854
+ },
855
+ "source": [
856
+ "### Style Transfer\n",
857
+ "\n",
858
+ "The following section demonstrates the style transfer capability for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style)."
859
+ ]
860
+ },
861
+ {
862
+ "cell_type": "code",
863
+ "execution_count": null,
864
+ "metadata": {
865
+ "id": "CtIgr5kOLE9a"
866
+ },
867
+ "outputs": [],
868
+ "source": [
869
+ "# reference texts to sample styles\n",
870
+ "\n",
871
+ "ref_texts = {}\n",
872
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
873
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
874
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
875
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": null,
881
+ "metadata": {
882
+ "id": "MlA1CbhzLIoI"
883
+ },
884
+ "outputs": [],
885
+ "source": [
886
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
887
+ "s_ref = compute_style(path)\n",
888
+ "\n",
889
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
890
+ "for k,v in ref_texts.items():\n",
891
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
892
+ " print(k + \": \")\n",
893
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "markdown",
898
+ "metadata": {
899
+ "id": "2M0iaXlkLJUQ"
900
+ },
901
+ "source": [
902
+ "### Speech diversity\n",
903
+ "\n",
904
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page.\n",
905
+ "\n",
906
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
907
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize, the speech will be totally different).\n",
908
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most similar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis).\n"
909
+ ]
910
+ },
911
+ {
912
+ "cell_type": "markdown",
913
+ "metadata": {
914
+ "id": "tSxZDvF2LNu4"
915
+ },
916
+ "source": [
917
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
918
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody, and uses the diffusion model to sample the remaining portions based on the text."
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": null,
924
+ "metadata": {
925
+ "id": "AAomGCDZLIt5"
926
+ },
927
+ "outputs": [],
928
+ "source": [
929
+ "# unseen speaker\n",
930
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
931
+ "ref_s = compute_style(path)\n",
932
+ "\n",
933
+ "text = \"How much variation is there?\"\n",
934
+ "for _ in range(5):\n",
935
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
936
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
937
+ ]
938
+ },
939
+ {
940
+ "cell_type": "markdown",
941
+ "metadata": {
942
+ "id": "BKrSMdgcLQRP"
943
+ },
944
+ "source": [
945
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
946
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker, at the cost of less diverse samples."
947
+ ]
948
+ },
949
+ {
950
+ "cell_type": "code",
951
+ "execution_count": null,
952
+ "metadata": {
953
+ "id": "Uo7gVmFoLRfm"
954
+ },
955
+ "outputs": [],
956
+ "source": [
957
+ "# unseen speaker\n",
958
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
959
+ "ref_s = compute_style(path)\n",
960
+ "\n",
961
+ "text = \"How much variation is there?\"\n",
962
+ "for _ in range(5):\n",
963
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
964
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
965
+ ]
966
+ },
967
+ {
968
+ "cell_type": "markdown",
969
+ "metadata": {
970
+ "id": "nfQ0Xrg9LStd"
971
+ },
972
+ "source": [
973
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
974
+ "This setting uses 50% of the reference timbre and only 5% of the reference prosody (so 95% of the prosody is sampled, which makes it more diverse), but this also makes it less similar to the reference speaker."
975
+ ]
976
+ },
977
+ {
978
+ "cell_type": "code",
979
+ "execution_count": null,
980
+ "metadata": {
981
+ "id": "cPHz4BzVLT_u"
982
+ },
983
+ "outputs": [],
984
+ "source": [
985
+ "# unseen speaker\n",
986
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
987
+ "ref_s = compute_style(path)\n",
988
+ "\n",
989
+ "text = \"How much variation is there?\"\n",
990
+ "for _ in range(5):\n",
991
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
992
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
993
+ ]
994
+ },
995
+ {
996
+ "cell_type": "markdown",
997
+ "source": [
998
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
999
+ "This setting uses 0% of the reference timbre and prosody and uses the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker."
1000
+ ],
1001
+ "metadata": {
1002
+ "id": "hPKg9eYpL00f"
1003
+ }
1004
+ },
1005
+ {
1006
+ "cell_type": "code",
1007
+ "source": [
1008
+ "# unseen speaker\n",
1009
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1010
+ "ref_s = compute_style(path)\n",
1011
+ "\n",
1012
+ "text = \"How much variation is there?\"\n",
1013
+ "for _ in range(5):\n",
1014
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1015
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1016
+ ],
1017
+ "metadata": {
1018
+ "id": "Ei-7JOccL0bF"
1019
+ },
1020
+ "execution_count": null,
1021
+ "outputs": []
1022
+ },
1023
+ {
1024
+ "cell_type": "markdown",
1025
+ "source": [
1026
+ "#### No variation (`alpha = 0, beta=0`)\n",
1027
+ "This setting uses 100% of the reference timbre and prosody and does not use the diffusion model at all. This makes the speaker very similar to the reference speaker, but there is no variation."
1028
+ ],
1029
+ "metadata": {
1030
+ "id": "FVMPc3bhL3eL"
1031
+ }
1032
+ },
1033
+ {
1034
+ "cell_type": "code",
1035
+ "source": [
1036
+ "# unseen speaker\n",
1037
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1038
+ "ref_s = compute_style(path)\n",
1039
+ "\n",
1040
+ "text = \"How much variation is there?\"\n",
1041
+ "for _ in range(5):\n",
1042
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1043
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1044
+ ],
1045
+ "metadata": {
1046
+ "id": "yh1QZ7uhL4wM"
1047
+ },
1048
+ "execution_count": null,
1049
+ "outputs": []
1050
+ },
1051
+ {
1052
+ "cell_type": "markdown",
1053
+ "source": [
1054
+ "### Extra fun!\n",
1055
+ "\n",
1056
+ "You can record your own voice and clone it with the pre-trained StyleTTS 2 model here."
1057
+ ],
1058
+ "metadata": {
1059
+ "id": "T0EvkWrAMBDB"
1060
+ }
1061
+ },
1062
+ {
1063
+ "cell_type": "markdown",
1064
+ "source": [
1065
+ "#### Run the following cell to record your voice for 5 seconds. Keep speaking for the full five seconds for the best result."
1066
+ ],
1067
+ "metadata": {
1068
+ "id": "R985j5QONY8I"
1069
+ }
1070
+ },
1071
+ {
1072
+ "cell_type": "code",
1073
+ "source": [
1074
+ "# all imports\n",
1075
+ "from IPython.display import Javascript\n",
1076
+ "from google.colab import output\n",
1077
+ "from base64 import b64decode\n",
1078
+ "\n",
1079
+ "RECORD = \"\"\"\n",
1080
+ "const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n",
1081
+ "const b2text = blob => new Promise(resolve => {\n",
1082
+ " const reader = new FileReader()\n",
1083
+ " reader.onloadend = e => resolve(e.srcElement.result)\n",
1084
+ " reader.readAsDataURL(blob)\n",
1085
+ "})\n",
1086
+ "var record = time => new Promise(async resolve => {\n",
1087
+ " stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n",
1088
+ " recorder = new MediaRecorder(stream)\n",
1089
+ " chunks = []\n",
1090
+ " recorder.ondataavailable = e => chunks.push(e.data)\n",
1091
+ " recorder.start()\n",
1092
+ " await sleep(time)\n",
1093
+ " recorder.onstop = async ()=>{\n",
1094
+ " blob = new Blob(chunks)\n",
1095
+ " text = await b2text(blob)\n",
1096
+ " resolve(text)\n",
1097
+ " }\n",
1098
+ " recorder.stop()\n",
1099
+ "})\n",
1100
+ "\"\"\"\n",
1101
+ "\n",
1102
+ "def record(sec=3):\n",
1103
+ " display(Javascript(RECORD))\n",
1104
+ " s = output.eval_js('record(%d)' % (sec*1000))\n",
1105
+ " b = b64decode(s.split(',')[1])\n",
1106
+ " with open('audio.wav','wb') as f:\n",
1107
+ " f.write(b)\n",
1108
+ " return 'audio.wav' # or webm ?"
1109
+ ],
1110
+ "metadata": {
1111
+ "id": "MWrFs0KWMBpz"
1112
+ },
1113
+ "execution_count": null,
1114
+ "outputs": []
1115
+ },
1116
+ {
1117
+ "cell_type": "markdown",
1118
+ "source": [
1119
+ "#### Please run this cell and speak:"
1120
+ ],
1121
+ "metadata": {
1122
+ "id": "z35qXwM0Nhx1"
1123
+ }
1124
+ },
1125
+ {
1126
+ "cell_type": "code",
1127
+ "source": [
1128
+ "print('Speak now for 5 seconds.')\n",
1129
+ "audio = record(sec=5)\n",
1130
+ "import IPython.display as ipd\n",
1131
+ "display(ipd.Audio(audio, rate=24000, normalize=False))"
1132
+ ],
1133
+ "metadata": {
1134
+ "id": "KUEoFyQBMR-8"
1135
+ },
1136
+ "execution_count": null,
1137
+ "outputs": []
1138
+ },
1139
+ {
1140
+ "cell_type": "markdown",
1141
+ "source": [
1142
+ "#### Synthesize in your own voice"
1143
+ ],
1144
+ "metadata": {
1145
+ "id": "OQS_7IBpNmM1"
1146
+ }
1147
+ },
1148
+ {
1149
+ "cell_type": "code",
1150
+ "source": [
1151
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. ''' # @param {type:\"string\"}\n"
1152
+ ],
1153
+ "metadata": {
1154
+ "cellView": "form",
1155
+ "id": "c0I3LY7vM8Ta"
1156
+ },
1157
+ "execution_count": null,
1158
+ "outputs": []
1159
+ },
1160
+ {
1161
+ "cell_type": "code",
1162
+ "source": [
1163
+ "reference_dicts = {}\n",
1164
+ "reference_dicts['You'] = audio"
1165
+ ],
1166
+ "metadata": {
1167
+ "id": "80eW-pwxNCxu"
1168
+ },
1169
+ "execution_count": null,
1170
+ "outputs": []
1171
+ },
1172
+ {
1173
+ "cell_type": "code",
1174
+ "source": [
1175
+ "start = time.time()\n",
1176
+ "noise = torch.randn(1,1,256).to(device)\n",
1177
+ "for k, path in reference_dicts.items():\n",
1178
+ " ref_s = compute_style(path)\n",
1179
+ "\n",
1180
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1181
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1182
+ " print('Speaker: ' + k)\n",
1183
+ " import IPython.display as ipd\n",
1184
+ " print('Synthesized:')\n",
1185
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1186
+ " print('Reference:')\n",
1187
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1188
+ ],
1189
+ "metadata": {
1190
+ "id": "yIga6MTuNJaN"
1191
+ },
1192
+ "execution_count": null,
1193
+ "outputs": []
1194
+ }
1195
+ ],
1196
+ "metadata": {
1197
+ "accelerator": "GPU",
1198
+ "colab": {
1199
+ "provenance": [],
1200
+ "collapsed_sections": [
1201
+ "aAGQPfgYIR23",
1202
+ "eJdB_nCOIVIN",
1203
+ "R985j5QONY8I"
1204
+ ],
1205
+ "authorship_tag": "ABX9TyPQdFTqqVEknEG/ma/HMfU+",
1206
+ "include_colab_link": true
1207
+ },
1208
+ "kernelspec": {
1209
+ "display_name": "Python 3",
1210
+ "name": "python3"
1211
+ },
1212
+ "language_info": {
1213
+ "name": "python"
1214
+ }
1215
+ },
1216
+ "nbformat": 4,
1217
+ "nbformat_minor": 0
1218
+ }
stts_48khz/StyleTTS2_48khz/Colab/StyleTTS2_Finetune_Demo.ipynb ADDED
@@ -0,0 +1,480 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "authorship_tag": "ABX9TyNiDU9ykIeYxO86Lmuid+ph",
9
+ "include_colab_link": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ },
15
+ "language_info": {
16
+ "name": "python"
17
+ },
18
+ "accelerator": "GPU"
19
+ },
20
+ "cells": [
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "view-in-github",
25
+ "colab_type": "text"
26
+ },
27
+ "source": [
28
+ "<a href=\"https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Finetune_Demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "source": [
34
+ "### Install packages and download models"
35
+ ],
36
+ "metadata": {
37
+ "id": "yLqBa4uYPrqE"
38
+ }
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "source": [
43
+ "%%shell\n",
44
+ "git clone https://github.com/yl4579/StyleTTS2.git\n",
45
+ "cd StyleTTS2\n",
46
+ "pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git\n",
47
+ "sudo apt-get install espeak-ng\n",
48
+ "git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS\n",
49
+ "mv StyleTTS2-LibriTTS/Models ."
50
+ ],
51
+ "metadata": {
52
+ "id": "H72WF06ZPrTF"
53
+ },
54
+ "execution_count": null,
55
+ "outputs": []
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "source": [
60
+ "### Download dataset (LJSpeech, 200 samples, ~15 minutes of data)\n",
61
+ "\n",
62
+ "You can definitely do it with fewer samples. This is just a proof of concept with 200 samples."
63
+ ],
64
+ "metadata": {
65
+ "id": "G398sL8wPzTB"
66
+ }
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "source": [
71
+ "%cd StyleTTS2\n",
72
+ "!rm -rf Data"
73
+ ],
74
+ "metadata": {
75
+ "id": "kJuQUBrEPy5C"
76
+ },
77
+ "execution_count": null,
78
+ "outputs": []
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "source": [
83
+ "!gdown --id 1vqz26D3yn7OXS2vbfYxfSnpLS6m6tOFP\n",
84
+ "!unzip Data.zip"
85
+ ],
86
+ "metadata": {
87
+ "id": "mDXW8ZZePuSb"
88
+ },
89
+ "execution_count": null,
90
+ "outputs": []
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "source": [
95
+ "### Change the finetuning config\n",
96
+ "\n",
97
+ "Depending on the GPU you have, you may want to change the batch size, maximum audio length, number of epochs, and so on."
98
+ ],
99
+ "metadata": {
100
+ "id": "_AlBQREWU8ud"
101
+ }
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "source": [
106
+ "config_path = \"Configs/config_ft.yml\"\n",
107
+ "\n",
108
+ "import yaml\n",
109
+ "config = yaml.safe_load(open(config_path))"
110
+ ],
111
+ "metadata": {
112
+ "id": "7uEITi0hU4I2"
113
+ },
114
+ "execution_count": null,
115
+ "outputs": []
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "source": [
120
+ "config['data_params']['root_path'] = \"Data/wavs\"\n",
121
+ "\n",
122
+ "config['batch_size'] = 2 # not enough RAM\n",
123
+ "config['max_len'] = 100 # not enough RAM\n",
124
+ "config['loss_params']['joint_epoch'] = 110 # we do not do SLM adversarial training due to not enough RAM\n",
125
+ "\n",
126
+ "with open(config_path, 'w') as outfile:\n",
127
+ " yaml.dump(config, outfile, default_flow_style=True)"
128
+ ],
129
+ "metadata": {
130
+ "id": "TPTRgOKSVT4K"
131
+ },
132
+ "execution_count": null,
133
+ "outputs": []
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "source": [
138
+ "### Start finetuning\n"
139
+ ],
140
+ "metadata": {
141
+ "id": "uUuB_19NWj2Y"
142
+ }
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "source": [
147
+ "!python train_finetune.py --config_path ./Configs/config_ft.yml"
148
+ ],
149
+ "metadata": {
150
+ "id": "HZVAD5GKWm-O"
151
+ },
152
+ "execution_count": null,
153
+ "outputs": []
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "source": [
158
+ "### Test the model quality\n",
159
+ "\n",
160
+ "Note that this mainly serves as a proof of concept because of the RAM limitations of free Colab instances, so a lot of settings are suboptimal. In the future, when DDP works for train_second.py, we will also add mixed-precision finetuning to save time and RAM. You can also add an SLM adversarial training run if you have a paid Colab tier (such as an A100 with 40G of RAM)."
161
+ ],
162
+ "metadata": {
163
+ "id": "I0_7wsGkXGfc"
164
+ }
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "source": [
169
+ "import nltk\n",
170
+ "nltk.download('punkt')"
171
+ ],
172
+ "metadata": {
173
+ "id": "OPLphjbncE7p"
174
+ },
175
+ "execution_count": null,
176
+ "outputs": []
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "source": [
181
+ "import torch\n",
182
+ "torch.manual_seed(0)\n",
183
+ "torch.backends.cudnn.benchmark = False\n",
184
+ "torch.backends.cudnn.deterministic = True\n",
185
+ "\n",
186
+ "import random\n",
187
+ "random.seed(0)\n",
188
+ "\n",
189
+ "import numpy as np\n",
190
+ "np.random.seed(0)\n",
191
+ "\n",
192
+ "# load packages\n",
193
+ "import time\n",
194
+ "import random\n",
195
+ "import yaml\n",
196
+ "from munch import Munch\n",
197
+ "import numpy as np\n",
198
+ "import torch\n",
199
+ "from torch import nn\n",
200
+ "import torch.nn.functional as F\n",
201
+ "import torchaudio\n",
202
+ "import librosa\n",
203
+ "from nltk.tokenize import word_tokenize\n",
204
+ "\n",
205
+ "from models import *\n",
206
+ "from utils import *\n",
207
+ "from text_utils import TextCleaner\n",
208
+ "textclenaer = TextCleaner()\n",
209
+ "\n",
210
+ "%matplotlib inline\n",
211
+ "\n",
212
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
213
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
214
+ "mean, std = -4, 4\n",
215
+ "\n",
216
+ "def length_to_mask(lengths):\n",
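+ " # returns a boolean mask that is True at padded positions (index >= sequence length)\n",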
217
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
218
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
219
+ " return mask\n",
220
+ "\n",
221
+ "def preprocess(wave):\n",
222
+ " wave_tensor = torch.from_numpy(wave).float()\n",
223
+ " mel_tensor = to_mel(wave_tensor)\n",
224
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
225
+ " return mel_tensor\n",
226
+ "\n",
227
+ "def compute_style(path):\n",
228
+ " wave, sr = librosa.load(path, sr=24000)\n",
229
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
230
+ " if sr != 24000:\n",
231
+ " audio = librosa.resample(audio, sr, 24000)\n",
232
+ " mel_tensor = preprocess(audio).to(device)\n",
233
+ "\n",
234
+ " with torch.no_grad():\n",
235
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
236
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
237
+ "\n",
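+ " # concatenate the acoustic (timbre) and prosodic style vectors into one 256-dim reference\n",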
238
+ " return torch.cat([ref_s, ref_p], dim=1)\n",
239
+ "\n",
240
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
241
+ "\n",
242
+ "# load phonemizer\n",
243
+ "import phonemizer\n",
244
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)\n",
245
+ "\n",
246
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config_ft.yml\"))\n",
247
+ "\n",
248
+ "# load pretrained ASR model\n",
249
+ "ASR_config = config.get('ASR_config', False)\n",
250
+ "ASR_path = config.get('ASR_path', False)\n",
251
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
252
+ "\n",
253
+ "# load pretrained F0 model\n",
254
+ "F0_path = config.get('F0_path', False)\n",
255
+ "pitch_extractor = load_F0_models(F0_path)\n",
256
+ "\n",
257
+ "# load BERT model\n",
258
+ "from Utils.PLBERT.util import load_plbert\n",
259
+ "BERT_path = config.get('PLBERT_dir', False)\n",
260
+ "plbert = load_plbert(BERT_path)\n",
261
+ "\n",
262
+ "model_params = recursive_munch(config['model_params'])\n",
263
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
264
+ "_ = [model[key].eval() for key in model]\n",
265
+ "_ = [model[key].to(device) for key in model]"
266
+ ],
267
+ "metadata": {
268
+ "id": "jIIAoDACXJL0"
269
+ },
270
+ "execution_count": null,
271
+ "outputs": []
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "source": [
276
+ "import os\n",
+ "files = [f for f in os.listdir(\"Models/LJSpeech/\") if f.endswith('.pth')]\n",
277
+ "sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))"
278
+ ],
279
+ "metadata": {
280
+ "id": "eKXRAyyzcMpQ"
281
+ },
282
+ "execution_count": null,
283
+ "outputs": []
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "source": [
288
+ "params_whole = torch.load(\"Models/LJSpeech/\" + sorted_files[-1], map_location='cpu')\n",
289
+ "params = params_whole['net']"
290
+ ],
291
+ "metadata": {
292
+ "id": "ULuU9-VDb9Pk"
293
+ },
294
+ "execution_count": null,
295
+ "outputs": []
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "source": [
300
+ "for key in model:\n",
301
+ " if key in params:\n",
302
+ " print('%s loaded' % key)\n",
303
+ " try:\n",
304
+ " model[key].load_state_dict(params[key])\n",
305
+ " except:\n",
306
+ " from collections import OrderedDict\n",
307
+ " state_dict = params[key]\n",
308
+ " new_state_dict = OrderedDict()\n",
309
+ " for k, v in state_dict.items():\n",
310
+ " name = k[7:] # remove `module.`\n",
311
+ " new_state_dict[name] = v\n",
312
+ " # load params\n",
313
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
314
+ "# except:\n",
315
+ "# _load(params[key], model[key])\n",
316
+ "_ = [model[key].eval() for key in model]"
317
+ ],
318
+ "metadata": {
319
+ "id": "J-U29yIYc2ea"
320
+ },
321
+ "execution_count": null,
322
+ "outputs": []
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "source": [
327
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
328
+ ],
329
+ "metadata": {
330
+ "id": "jrPQ_Yrwc3n6"
331
+ },
332
+ "execution_count": null,
333
+ "outputs": []
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "source": [
338
+ "sampler = DiffusionSampler(\n",
339
+ " model.diffusion.diffusion,\n",
340
+ " sampler=ADPM2Sampler(),\n",
341
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
342
+ " clamp=False\n",
343
+ ")"
344
+ ],
345
+ "metadata": {
346
+ "id": "n2CWYNoqc455"
347
+ },
348
+ "execution_count": null,
349
+ "outputs": []
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "source": [
354
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
355
+ " text = text.strip()\n",
356
+ " ps = global_phonemizer.phonemize([text])\n",
357
+ " ps = word_tokenize(ps[0])\n",
358
+ " ps = ' '.join(ps)\n",
359
+ " tokens = textclenaer(ps)\n",
360
+ " tokens.insert(0, 0)\n",
361
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
362
+ "\n",
363
+ " with torch.no_grad():\n",
364
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
365
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
366
+ "\n",
367
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
368
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
369
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2)\n",
370
+ "\n",
371
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),\n",
372
+ " embedding=bert_dur,\n",
373
+ " embedding_scale=embedding_scale,\n",
374
+ " features=ref_s, # reference from the same speaker as the embedding\n",
375
+ " num_steps=diffusion_steps).squeeze(1)\n",
376
+ "\n",
377
+ "\n",
378
+ " s = s_pred[:, 128:]\n",
379
+ " ref = s_pred[:, :128]\n",
380
+ "\n",
381
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
382
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
383
+ "\n",
384
+ " d = model.predictor.text_encoder(d_en,\n",
385
+ " s, input_lengths, text_mask)\n",
386
+ "\n",
387
+ " x, _ = model.predictor.lstm(d)\n",
388
+ " duration = model.predictor.duration_proj(x)\n",
389
+ "\n",
390
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
391
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
392
+ "\n",
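+ " # expand each phoneme by its predicted duration to build a hard alignment matrix\n",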
393
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
394
+ " c_frame = 0\n",
395
+ " for i in range(pred_aln_trg.size(0)):\n",
396
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
397
+ " c_frame += int(pred_dur[i].data)\n",
398
+ "\n",
399
+ " # encode prosody\n",
400
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
401
+ " if model_params.decoder.type == \"hifigan\":\n",
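+ " # for the hifigan decoder, shift features right by one frame (duplicating the first frame)\n",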
402
+ " asr_new = torch.zeros_like(en)\n",
403
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
404
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
405
+ " en = asr_new\n",
406
+ "\n",
407
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
408
+ "\n",
409
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
410
+ " if model_params.decoder.type == \"hifigan\":\n",
411
+ " asr_new = torch.zeros_like(asr)\n",
412
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
413
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
414
+ " asr = asr_new\n",
415
+ "\n",
416
+ " out = model.decoder(asr,\n",
417
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
418
+ "\n",
419
+ "\n",
420
+ " return out.squeeze().cpu().numpy()[..., :-50] # trim a weird pulse at the end of the model output; needs to be fixed later"
421
+ ],
422
+ "metadata": {
423
+ "id": "2x5kVb3nc_eY"
424
+ },
425
+ "execution_count": null,
426
+ "outputs": []
427
+ },
428
+ {
429
+ "cell_type": "markdown",
430
+ "source": [
431
+ "### Synthesize speech"
432
+ ],
433
+ "metadata": {
434
+ "id": "O159JnwCc6CC"
435
+ }
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "source": [
440
+ "text = '''Maltby and Company would issue warrants on them deliverable to the importer, and the goods were then passed to be stored in neighboring warehouses.\n",
441
+ "'''"
442
+ ],
443
+ "metadata": {
444
+ "id": "ThciXQ6rc9Eq"
445
+ },
446
+ "execution_count": null,
447
+ "outputs": []
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "source": [
452
+ "# get a random reference in the training set, note that it doesn't matter which one you use\n",
453
+ "path = \"Data/wavs/LJ001-0110.wav\"\n",
454
+ "# this style vector ref_s can be saved as a parameter together with the model weights\n",
455
+ "ref_s = compute_style(path)"
456
+ ],
457
+ "metadata": {
458
+ "id": "jldPkJyCc83a"
459
+ },
460
+ "execution_count": null,
461
+ "outputs": []
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "source": [
466
+ "start = time.time()\n",
467
+ "wav = inference(text, ref_s, alpha=0.9, beta=0.9, diffusion_steps=10, embedding_scale=1)\n",
468
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
469
+ "print(f\"RTF = {rtf:5f}\")\n",
470
+ "import IPython.display as ipd\n",
471
+ "display(ipd.Audio(wav, rate=24000, normalize=False))"
472
+ ],
473
+ "metadata": {
474
+ "id": "_mIU0jqDdQ-c"
475
+ },
476
+ "execution_count": null,
477
+ "outputs": []
478
+ }
479
+ ]
480
+ }
stts_48khz/StyleTTS2_48khz/Configs/config.yml ADDED
@@ -0,0 +1,116 @@
1
+ log_dir: "Models/LJSpeech"
2
+ first_stage_path: "first_stage.pth"
3
+ save_freq: 2
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 200 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 100 # number of epochs for second stage training (joint training)
8
+ batch_size: 16
9
+ max_len: 325 # maximum number of frames
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
13
+
14
+ F0_path: "Utils/JDC/bst.t7"
15
+ ASR_config: "Utils/ASR/config.yml"
16
+ ASR_path: "Utils/ASR/epoch_00080.pth"
17
+ PLBERT_dir: 'Utils/PLBERT/'
18
+
19
+ data_params:
20
+ train_data: "Data/train_list.txt"
21
+ val_data: "Data/val_list.txt"
22
+ root_path: "/local/LJSpeech-1.1/wavs"
23
+ OOD_data: "Data/OOD_texts.txt"
24
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
25
+
26
+ preprocess_params:
27
+ sr: 24000
28
+ spect_params:
29
+ n_fft: 2048
30
+ win_length: 1200
31
+ hop_length: 300
32
+
33
+ model_params:
34
+ multispeaker: false
35
+
36
+ dim_in: 64
37
+ hidden_dim: 512
38
+ max_conv_dim: 512
39
+ n_layer: 3
40
+ n_mels: 80
41
+
42
+ n_token: 178 # number of phoneme tokens
43
+ max_dur: 50 # maximum duration of a single phoneme
44
+ style_dim: 128 # style vector size
45
+
46
+ dropout: 0.2
47
+
48
+ # config for decoder
49
+ decoder:
50
+ type: 'istftnet' # either hifigan or istftnet
51
+ resblock_kernel_sizes: [3,7,11]
52
+ upsample_rates : [10, 6]
53
+ upsample_initial_channel: 512
54
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
55
+ upsample_kernel_sizes: [20, 12]
56
+ gen_istft_n_fft: 20
57
+ gen_istft_hop_size: 5
58
+
59
+ # speech language model config
60
+ slm:
61
+ model: 'microsoft/wavlm-base-plus'
62
+ sr: 16000 # sampling rate of SLM
63
+ hidden: 768 # hidden size of SLM
64
+ nlayers: 13 # number of layers of SLM
65
+ initial_channel: 64 # initial channels of SLM discriminator head
66
+
67
+ # style diffusion model config
68
+ diffusion:
69
+ embedding_mask_proba: 0.1
70
+ # transformer config
71
+ transformer:
72
+ num_layers: 3
73
+ num_heads: 8
74
+ head_features: 64
75
+ multiplier: 2
76
+
77
+ # diffusion distribution config
78
+ dist:
79
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
80
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
81
+ mean: -3.0
82
+ std: 1.0
83
+
84
+ loss_params:
85
+ lambda_mel: 5. # mel reconstruction loss
86
+ lambda_gen: 1. # generator loss
87
+ lambda_slm: 1. # slm feature matching loss
88
+
89
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
90
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
91
+ TMA_epoch: 50 # TMA starting epoch (1st stage)
92
+
93
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
94
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
95
+ lambda_dur: 1. # duration loss (2nd stage)
96
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
97
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
98
+ lambda_diff: 1. # score matching loss (2nd stage)
99
+
100
+ diff_epoch: 20 # style diffusion starting epoch (2nd stage)
101
+ joint_epoch: 50 # joint training starting epoch (2nd stage)
102
+
103
+ optimizer_params:
104
+ lr: 0.0001 # general learning rate
105
+ bert_lr: 0.00001 # learning rate for PLBERT
106
+ ft_lr: 0.00001 # learning rate for acoustic modules
107
+
108
+ slmadv_params:
109
+ min_len: 400 # minimum length of samples
110
+ max_len: 500 # maximum length of samples
111
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
112
+ iter: 10 # update the discriminator every this iterations of generator update
113
+ thresh: 5 # gradient norm above which the gradient is scaled
114
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
115
+ sig: 1.5 # sigma for differentiable duration modeling
116
+
stts_48khz/StyleTTS2_48khz/Configs/config_ft.yml ADDED
@@ -0,0 +1,118 @@
1
+ log_dir: "Models/Style_Kanade_48khz_fn"
2
+ save_freq: 1
3
+ log_interval: 10
4
+ device: "cuda"
5
+ epochs: 2
6
+ batch_size: 14
7
+ max_len: 2250 # maximum number of frames
8
+ pretrained_model: "/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz_fn/finetune_phase_13999.pth"
9
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
10
+ load_only_params: true # set to true if do not want to load epoch numbers and optimizer parameters
11
+
12
+
13
+ F0_path: "/home/austin/disk1/stts-zs_cleaning/F0_extractor/PitchExtractor/Checkpoint_200k/PE_48khz_epoch_00060.pth"
14
+ ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
15
+ ASR_path: "/home/austin/disk2/llmvcs/tt/AuxiliaryASR/Checkpoint_new_plus/epoch_00070.pth"
16
+
17
+ PLBERT_dir: '/home/austin/disk2/llmvcs/tt/stylekan/Utils/PLBERT'
18
+
19
+ data_params:
20
+ train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/train_48_pure.csv"
21
+ val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/val_48_pure.csv"
22
+ root_path: ""
23
+ OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
24
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
25
+
26
+ #CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 accelerate launch accelerate_train_finetune.py -config_path ./Configs/config_ft.yml
27
+ #CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 python train_second.py --config_path ./Configs/config_kanade.yml
28
+ preprocess_params:
29
+ sr: 48000
30
+ spect_params:
31
+ n_fft: 2048
32
+ win_length: 2048
33
+ hop_length: 512
34
+
35
+ model_params:
36
+ multispeaker: true
37
+
38
+ dim_in: 64
39
+ hidden_dim: 512
40
+ max_conv_dim: 512
41
+ n_layer: 3
42
+ n_mels: 80
43
+
44
+ n_token: 178 # number of phoneme tokens
45
+ max_dur: 50 # maximum duration of a single phoneme
46
+ style_dim: 128 # style vector size
47
+
48
+ dropout: 0.2
49
+
50
+ # config for decoder
51
+ decoder:
52
+ type: 'istftnet' # either hifigan or istftnet
53
+ resblock_kernel_sizes: [3,7,11]
54
+ upsample_rates : [16, 8]
55
+ upsample_initial_channel: 512
56
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
57
+ upsample_kernel_sizes: [32, 16]
58
+ gen_istft_n_fft: 32
59
+ gen_istft_hop_size: 4
60
+
61
+
62
+ # speech language model config
63
+ slm:
64
+ model: 'Respair/Whisper_Large_v2_Encoder_Block' # The model itself is hardcoded, change it through -> losses.py
65
+ sr: 16000 # sampling rate of SLM
66
+ hidden: 1280 # hidden size of SLM
67
+ nlayers: 33 # number of layers of SLM
68
+ initial_channel: 64 # initial channels of SLM discriminator head
69
+
70
+ # style diffusion model config
71
+ diffusion:
72
+ embedding_mask_proba: 0.1
73
+ # transformer config
74
+ transformer:
75
+ num_layers: 3
76
+ num_heads: 8
77
+ head_features: 64
78
+ multiplier: 2
79
+
80
+ # diffusion distribution config
81
+ dist:
82
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
83
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
84
+ mean: -3.0
85
+ std: 1.0
86
+
87
+ loss_params:
88
+ lambda_mel: 5. # mel reconstruction loss
89
+ lambda_gen: 1. # generator loss
90
+ lambda_slm: 1. # slm feature matching loss
91
+
92
+ lambda_mono: 1. # monotonic alignment loss (TMA)
93
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
94
+
95
+ lambda_F0: 1. # F0 reconstruction loss
96
+ lambda_norm: 1. # norm reconstruction loss
97
+ lambda_dur: 1. # duration loss
98
+ lambda_ce: 20. # duration predictor probability output CE loss
99
+ lambda_sty: 1. # style reconstruction loss
100
+ lambda_diff: 1. # score matching loss
101
+
102
+ diff_epoch: 0 # style diffusion starting epoch
103
+ joint_epoch: 30 # joint training starting epoch
104
+
105
+ optimizer_params:
106
+ lr: 0.0001 # general learning rate
107
+ bert_lr: 0.00001 # learning rate for PLBERT
108
+ ft_lr: 0.0001 # learning rate for acoustic modules
109
+
110
+ slmadv_params:
111
+ min_len: 400 # minimum length of samples
112
+ max_len: 500 # maximum length of samples
113
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
114
+ iter: 10 # update the discriminator every this iterations of generator update
115
+ thresh: 5 # gradient norm above which the gradient is scaled
116
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
117
+ sig: 1.5 # sigma for differentiable duration modeling
118
+
stts_48khz/StyleTTS2_48khz/Configs/config_kanade_48khz.yml ADDED
@@ -0,0 +1,125 @@
1
+ log_dir: "Models/Style_Kanade_48khz"
2
+ first_stage_path: ""
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 25 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 15 # number of epochs for second stage training (joint training)
8
+ batch_size: 35
9
+
10
+ max_len: 560 # approximately 15 seconds -> (512 / 48000) × (2812 // 2) = 14.997 sec
11
+
12
+ pretrained_model: "/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/Top_ckpt.pth"
13
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
14
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
15
+
16
+ # F0_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/JDC/bst_rmvpe_48k.t7"
17
+ # ASR_config: "Utils/ASR/config.yml"
18
+ # ASR_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/ASR/epoch_00050_48K.pth"
19
+
20
+ F0_path: "/home/austin/disk1/stts-zs_cleaning/F0_extractor/PitchExtractor/Checkpoint_200k/PE_48khz_epoch_00060.pth"
21
+ ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
22
+ ASR_path: "/home/austin/disk2/llmvcs/tt/AuxiliaryASR/Checkpoint_new_plus/epoch_00070.pth"
23
+
24
+ PLBERT_dir: '/home/austin/disk2/llmvcs/tt/stylekan/Utils/PLBERT'
25
+
26
+ data_params:
27
+ train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/train_48_pure.csv"
28
+ val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/val_48_pure.csv"
29
+ root_path: ""
30
+ OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
31
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
32
+
33
+ #CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 accelerate launch accelerate_train_second.py --config_path ./Configs/config_kanade_48khz.yml
34
+ #CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6 python train_second.py --config_path ./Configs/config_kanade.yml
35
+ #CUDA_VISIBLE_DEVICES=7 accelerate launch accelerate_train_second.py --config_path ./Configs/config_ft.yml
36
+ preprocess_params:
37
+ sr: 48000
38
+ spect_params:
39
+ n_fft: 2048
40
+ win_length: 2048
41
+ hop_length: 512
42
+
43
+ model_params:
44
+ multispeaker: true
45
+
46
+ dim_in: 64
47
+ hidden_dim: 512
48
+ max_conv_dim: 512
49
+ n_layer: 3
50
+ n_mels: 80
51
+
52
+ n_token: 178 # number of phoneme tokens
53
+ max_dur: 50 # maximum duration of a single phoneme
54
+ style_dim: 128 # style vector size
55
+
56
+ dropout: 0.2
57
+
58
+ decoder:
59
+ type: 'istftnet' # either hifigan or istftnet
60
+ resblock_kernel_sizes: [3,7,11]
61
+ upsample_rates : [16, 8]
62
+ upsample_initial_channel: 512
63
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
64
+ upsample_kernel_sizes: [32, 16]
65
+ gen_istft_n_fft: 32
66
+ gen_istft_hop_size: 4
67
+
68
+
69
+ # speech language model config
70
+ slm:
71
+ model: 'Respair/Whisper_Large_v2_Encoder_Block' # The model itself is hardcoded, change it through -> losses.py
72
+ sr: 16000 # sampling rate of SLM
73
+ hidden: 1280 # hidden size of SLM
74
+ nlayers: 33 # number of layers of SLM
75
+ initial_channel: 64 # initial channels of SLM discriminator head
76
+
77
+ # style diffusion model config
78
+ diffusion:
79
+ embedding_mask_proba: 0.1
80
+ # transformer config
81
+ transformer:
82
+ num_layers: 3
83
+ num_heads: 8
84
+ head_features: 64
85
+ multiplier: 2
86
+
87
+ # diffusion distribution config
88
+ dist:
89
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
90
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
91
+ mean: -3.0
92
+ std: 1.0
93
+
94
+ loss_params:
95
+ lambda_mel: 10. # mel reconstruction loss
96
+ lambda_gen: 1. # generator loss
97
+ lambda_slm: 1. # slm feature matching loss
98
+
99
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
100
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
101
+ TMA_epoch: 3 # TMA starting epoch (1st stage)
102
+
103
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
104
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
105
+ lambda_dur: 1. # duration loss (2nd stage)
106
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
107
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
108
+ lambda_diff: 1. # score matching loss (2nd stage)
109
+
110
+ diff_epoch: 4 # style diffusion starting epoch (2nd stage)
111
+ joint_epoch: 999 # joint training starting epoch (2nd stage)
112
+
113
+ optimizer_params:
114
+ lr: 0.0001 # general learning rate
115
+ bert_lr: 0.00001 # learning rate for PLBERT
116
+ ft_lr: 0.00001 # learning rate for acoustic modules
117
+
118
+ slmadv_params:
119
+ min_len: 400 # minimum length of samples
120
+ max_len: 500 # maximum length of samples
121
+ batch_percentage: .5 # to prevent out of memory, only use 1/2 of the original batch size
122
+ iter: 20 # update the discriminator every this iterations of generator update
123
+ thresh: 5 # gradient norm above which the gradient is scaled
124
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
125
+ sig: 1.5 # sigma for differentiable duration modeling
stts_48khz/StyleTTS2_48khz/Configs/config_kanade_48khz_copy.yml ADDED
@@ -0,0 +1,124 @@
1
+ log_dir: "Models/Style_Kanade_48khz_test"
2
+ first_stage_path: "/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/epoch_1st_00020.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 25 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 15 # number of epochs for second stage training (joint training)
8
+ batch_size: 2
9
+
10
+ max_len: 2812 # approximately 15 seconds -> (512 / 48000) × (2812 // 2) = 14.999 sec
11
+
12
+ pretrained_model: "/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/epoch_2nd_00004.pth"
13
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
14
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
15
+
16
+ # F0_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/JDC/bst_rmvpe_48k.t7"
17
+ # ASR_config: "Utils/ASR/config.yml"
18
+ # ASR_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/ASR/epoch_00050_48K.pth"
19
+
20
+ F0_path: "/home/austin/disk1/stts-zs_cleaning/F0_extractor/PitchExtractor/Checkpoint_200k/PE_48khz_epoch_00060.pth"
21
+ ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
22
+ ASR_path: "/home/austin/disk2/llmvcs/tt/AuxiliaryASR/Checkpoint_new_plus/epoch_00070.pth"
23
+
24
+ PLBERT_dir: '/home/austin/disk2/llmvcs/tt/stylekan/Utils/PLBERT'
25
+
26
+ data_params:
27
+ train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/val_48_pure.txt"
28
+ val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/val_48_pure.csv"
29
+ root_path: ""
30
+ OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
31
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
32
+
33
+ #CUDA_VISIBLE_DEVICES=5,6,7 accelerate launch accelerate_train_second.py --config_path ./Configs/config_kanade_48khz_copy.yml
34
+ #CUDA_VISIBLE_DEVICES=6,7 python train_second.py --config_path ./Configs/config_kanade_48khz_copy.yml
35
+ preprocess_params:
36
+ sr: 48000
37
+ spect_params:
38
+ n_fft: 2048
39
+ win_length: 2048
40
+ hop_length: 512
41
+
42
+ model_params:
43
+ multispeaker: true
44
+
45
+ dim_in: 64
46
+ hidden_dim: 512
47
+ max_conv_dim: 512
48
+ n_layer: 3
49
+ n_mels: 80
50
+
51
+ n_token: 178 # number of phoneme tokens
52
+ max_dur: 50 # maximum duration of a single phoneme
53
+ style_dim: 128 # style vector size
54
+
55
+ dropout: 0.2
56
+
57
+ decoder:
58
+ type: 'istftnet' # either hifigan or istftnet
59
+ resblock_kernel_sizes: [3,7,11]
60
+ upsample_rates : [16, 8]
61
+ upsample_initial_channel: 512
62
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
63
+ upsample_kernel_sizes: [32, 16]
64
+ gen_istft_n_fft: 32
65
+ gen_istft_hop_size: 4
66
+
67
+
68
+ # speech language model config
69
+ slm:
70
+ model: 'Respair/Whisper_Large_v2_Encoder_Block' # The model itself is hardcoded, change it through -> losses.py
71
+ sr: 16000 # sampling rate of SLM
72
+ hidden: 1280 # hidden size of SLM
73
+ nlayers: 33 # number of layers of SLM
74
+ initial_channel: 64 # initial channels of SLM discriminator head
75
+
76
+ # style diffusion model config
77
+ diffusion:
78
+ embedding_mask_proba: 0.1
79
+ # transformer config
80
+ transformer:
81
+ num_layers: 3
82
+ num_heads: 8
83
+ head_features: 64
84
+ multiplier: 2
85
+
86
+ # diffusion distribution config
87
+ dist:
88
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
89
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
90
+ mean: -3.0
91
+ std: 1.0
92
+
93
+ loss_params:
94
+ lambda_mel: 10. # mel reconstruction loss
95
+ lambda_gen: 1. # generator loss
96
+ lambda_slm: 1. # slm feature matching loss
97
+
98
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
99
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
100
+ TMA_epoch: 3 # TMA starting epoch (1st stage)
101
+
102
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
103
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
104
+ lambda_dur: 1. # duration loss (2nd stage)
105
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
106
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
107
+ lambda_diff: 1. # score matching loss (2nd stage)
108
+
109
+ diff_epoch: 4 # style diffusion starting epoch (2nd stage)
110
+ joint_epoch: 12 # joint training starting epoch (2nd stage)
111
+
112
+ optimizer_params:
113
+ lr: 0.0001 # general learning rate
114
+ bert_lr: 0.00001 # learning rate for PLBERT
115
+ ft_lr: 0.00001 # learning rate for acoustic modules
116
+
117
+ slmadv_params:
118
+ min_len: 400 # minimum length of samples
119
+ max_len: 500 # maximum length of samples
120
+ batch_percentage: 1 # fraction of the original batch size to use (1 = full batch; lower it to prevent running out of memory)
121
+ iter: 20 # update the discriminator every this iterations of generator update
122
+ thresh: 5 # gradient norm above which the gradient is scaled
123
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
124
+ sig: 1.5 # sigma for differentiable duration modeling
stts_48khz/StyleTTS2_48khz/Demo/Inference_LJSpeech.ipynb ADDED
@@ -0,0 +1,554 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LJSpeech)\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6108384d",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Utils"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "96e173bf",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "import torch\n",
27
+ "torch.manual_seed(0)\n",
28
+ "torch.backends.cudnn.benchmark = False\n",
29
+ "torch.backends.cudnn.deterministic = True\n",
30
+ "\n",
31
+ "import random\n",
32
+ "random.seed(0)\n",
33
+ "\n",
34
+ "import numpy as np\n",
35
+ "np.random.seed(0)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "da84c60f",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "%cd .."
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "5a3ddcc8",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# load packages\n",
56
+ "import time\n",
57
+ "import random\n",
58
+ "import yaml\n",
59
+ "from munch import Munch\n",
60
+ "import numpy as np\n",
61
+ "import torch\n",
62
+ "from torch import nn\n",
63
+ "import torch.nn.functional as F\n",
64
+ "import torchaudio\n",
65
+ "import librosa\n",
66
+ "from nltk.tokenize import word_tokenize\n",
67
+ "\n",
68
+ "from models import *\n",
69
+ "from utils import *\n",
70
+ "from text_utils import TextCleaner\n",
71
+ "textclenaer = TextCleaner()\n",
72
+ "\n",
73
+ "%matplotlib inline"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "bbdc04c0",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "00ee05e1",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
94
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
95
+ "mean, std = -4, 4\n",
96
+ "\n",
97
+ "def length_to_mask(lengths):\n",
98
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
99
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
100
+ " return mask\n",
101
+ "\n",
102
+ "def preprocess(wave):\n",
103
+ " wave_tensor = torch.from_numpy(wave).float()\n",
104
+ " mel_tensor = to_mel(wave_tensor)\n",
105
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
106
+ " return mel_tensor\n",
107
+ "\n",
108
+ "def compute_style(ref_dicts):\n",
109
+ " reference_embeddings = {}\n",
110
+ " for key, path in ref_dicts.items():\n",
111
+ " wave, sr = librosa.load(path, sr=24000)\n",
112
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
113
+ " if sr != 24000:\n",
114
+ " audio = librosa.resample(audio, sr, 24000)\n",
115
+ " mel_tensor = preprocess(audio).to(device)\n",
116
+ "\n",
117
+ " with torch.no_grad():\n",
118
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
119
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
120
+ " \n",
121
+ " return reference_embeddings"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "7b9cecbe",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Load models"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "64fc4c0f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# load phonemizer\n",
140
+ "import phonemizer\n",
141
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "48e7b644",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "config = yaml.safe_load(open(\"Models/LJSpeech/config.yml\"))\n",
152
+ "\n",
153
+ "# load pretrained ASR model\n",
154
+ "ASR_config = config.get('ASR_config', False)\n",
155
+ "ASR_path = config.get('ASR_path', False)\n",
156
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
157
+ "\n",
158
+ "# load pretrained F0 model\n",
159
+ "F0_path = config.get('F0_path', False)\n",
160
+ "pitch_extractor = load_F0_models(F0_path)\n",
161
+ "\n",
162
+ "# load BERT model\n",
163
+ "from Utils.PLBERT.util import load_plbert\n",
164
+ "BERT_path = config.get('PLBERT_dir', False)\n",
165
+ "plbert = load_plbert(BERT_path)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "ffc18cf7",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)\n",
176
+ "_ = [model[key].eval() for key in model]\n",
177
+ "_ = [model[key].to(device) for key in model]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "id": "64529d5c",
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "params_whole = torch.load(\"Models/LJSpeech/epoch_2nd_00100.pth\", map_location='cpu')\n",
188
+ "params = params_whole['net']"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "895d9706",
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "for key in model:\n",
199
+ " if key in params:\n",
200
+ " print('%s loaded' % key)\n",
201
+ " try:\n",
202
+ " model[key].load_state_dict(params[key])\n",
203
+ " except:\n",
204
+ " from collections import OrderedDict\n",
205
+ " state_dict = params[key]\n",
206
+ " new_state_dict = OrderedDict()\n",
207
+ " for k, v in state_dict.items():\n",
208
+ " name = k[7:] # remove `module.`\n",
209
+ " new_state_dict[name] = v\n",
210
+ " # load params\n",
211
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
212
+ "# except:\n",
213
+ "# _load(params[key], model[key])\n",
214
+ "_ = [model[key].eval() for key in model]"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "id": "c1a59db2",
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "id": "e30985ab",
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "sampler = DiffusionSampler(\n",
235
+ " model.diffusion.diffusion,\n",
236
+ " sampler=ADPM2Sampler(),\n",
237
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
238
+ " clamp=False\n",
239
+ ")"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "id": "b803110e",
245
+ "metadata": {},
246
+ "source": [
247
+ "### Synthesize speech"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "id": "24655f46",
254
+ "metadata": {},
255
+ "outputs": [],
256
+ "source": [
257
+ "# synthesize a text\n",
258
+ "text = ''' StyleTTS 2 is a text-to-speech model that leverages style diffusion and adversarial training with large speech language models to achieve human-level text-to-speech synthesis. '''"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": null,
264
+ "id": "ca57469c",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "def inference(text, noise, diffusion_steps=5, embedding_scale=1):\n",
269
+ " text = text.strip()\n",
270
+ " text = text.replace('\"', '')\n",
271
+ " ps = global_phonemizer.phonemize([text])\n",
272
+ " ps = word_tokenize(ps[0])\n",
273
+ " ps = ' '.join(ps)\n",
274
+ "\n",
275
+ " tokens = textclenaer(ps)\n",
276
+ " tokens.insert(0, 0)\n",
277
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
278
+ " \n",
279
+ " with torch.no_grad():\n",
280
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
281
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
282
+ "\n",
283
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
284
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
285
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
286
+ "\n",
287
+ " s_pred = sampler(noise, \n",
288
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
289
+ " embedding_scale=embedding_scale).squeeze(0)\n",
290
+ "\n",
291
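+ " # the sampled 256-dim style splits into acoustic style for the decoder (ref, first 128 dims) and prosody style for the predictor (s, last 128 dims)\n",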
+ " s = s_pred[:, 128:]\n",
292
+ " ref = s_pred[:, :128]\n",
293
+ "\n",
294
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
295
+ "\n",
296
+ " x, _ = model.predictor.lstm(d)\n",
297
+ " duration = model.predictor.duration_proj(x)\n",
298
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
299
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
300
+ "\n",
301
+ " pred_dur[-1] += 5\n",
302
+ "\n",
303
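+ " # expand phonemes to frames: build a hard alignment matrix from the predicted durations\n",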
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
304
+ " c_frame = 0\n",
305
+ " for i in range(pred_aln_trg.size(0)):\n",
306
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
307
+ " c_frame += int(pred_dur[i].data)\n",
308
+ "\n",
309
+ " # encode prosody\n",
310
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
311
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
312
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
313
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
314
+ " \n",
315
+ " return out.squeeze().cpu().numpy()"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "markdown",
320
+ "id": "d438ef4f",
321
+ "metadata": {},
322
+ "source": [
323
+ "#### Basic synthesis (5 diffusion steps)"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": null,
329
+ "id": "d3d7f7d5",
330
+ "metadata": {
331
+ "scrolled": true
332
+ },
333
+ "outputs": [],
334
+ "source": [
335
+ "start = time.time()\n",
336
+ "noise = torch.randn(1,1,256).to(device)\n",
337
+ "wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)\n",
338
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
339
+ "print(f\"RTF = {rtf:5f}\")\n",
340
+ "import IPython.display as ipd\n",
341
+ "display(ipd.Audio(wav, rate=24000))"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "markdown",
346
+ "id": "2d5d9df0",
347
+ "metadata": {},
348
+ "source": [
349
+ "#### With higher diffusion steps (more diverse)\n",
350
+ "Since the sampler is ancestral, the higher the number of steps, the more diverse the samples are, at the cost of slower synthesis speed."
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "a10129fd",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "start = time.time()\n",
361
+ "noise = torch.randn(1,1,256).to(device)\n",
362
+ "wav = inference(text, noise, diffusion_steps=10, embedding_scale=1)\n",
363
+ "rtf = (time.time() - start) / (len(wav) / 24000)\n",
364
+ "print(f\"RTF = {rtf:5f}\")\n",
365
+ "import IPython.display as ipd\n",
366
+ "display(ipd.Audio(wav, rate=24000))"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "markdown",
371
+ "id": "1877ea15",
372
+ "metadata": {},
373
+ "source": [
374
+ "### Speech expressiveness\n",
375
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page."
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "id": "4c4777b7",
381
+ "metadata": {},
382
+ "source": [
383
+ "#### With embedding_scale=1\n",
384
+ "This is the classifier-free guidance scale. The higher the scale, the more the sampled style is conditioned on the input text, and hence the more emotional the speech. "
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "id": "c29ea2f0",
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "texts = {}\n",
395
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
396
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
397
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
398
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
399
+ "\n",
400
+ "for k,v in texts.items():\n",
401
+ " noise = torch.randn(1,1,256).to(device)\n",
402
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=1)\n",
403
+ " print(k + \": \")\n",
404
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "3c89499f",
410
+ "metadata": {},
411
+ "source": [
412
+ "#### With embedding_scale=2"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "f73be3aa",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "texts = {}\n",
423
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
424
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
425
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
426
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
427
+ "\n",
428
+ "for k,v in texts.items():\n",
429
+ " noise = torch.randn(1,1,256).to(device)\n",
430
+ " wav = inference(v, noise, diffusion_steps=10, embedding_scale=2) # embedding_scale=2 for more pronounced emotion\n",
431
+ " print(k + \": \")\n",
432
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "markdown",
437
+ "id": "9320da63",
438
+ "metadata": {},
439
+ "source": [
440
+ "### Long-form generation\n",
441
+ "This section includes a basic implementation of Algorithm 1 in the paper for consistent long-form audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page. "
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "id": "cdd4db51",
448
+ "metadata": {},
449
+ "outputs": [],
450
+ "source": [
451
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first-class homemade products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": null,
457
+ "id": "ebb941c8",
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):\n",
462
+ " text = text.strip()\n",
463
+ " text = text.replace('\"', '')\n",
464
+ " ps = global_phonemizer.phonemize([text])\n",
465
+ " ps = word_tokenize(ps[0])\n",
466
+ " ps = ' '.join(ps)\n",
467
+ "\n",
468
+ " tokens = textclenaer(ps)\n",
469
+ " tokens.insert(0, 0)\n",
470
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
471
+ " \n",
472
+ " with torch.no_grad():\n",
473
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)\n",
474
+ " text_mask = length_to_mask(input_lengths).to(tokens.device)\n",
475
+ "\n",
476
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
477
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
478
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
479
+ "\n",
480
+ " s_pred = sampler(noise, \n",
481
+ " embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,\n",
482
+ " embedding_scale=embedding_scale).squeeze(0)\n",
483
+ " \n",
484
+ " if s_prev is not None:\n",
485
+ " # convex combination of previous and current style\n",
486
+ " s_pred = alpha * s_prev + (1 - alpha) * s_pred\n",
487
+ " \n",
488
+ " s = s_pred[:, 128:]\n",
489
+ " ref = s_pred[:, :128]\n",
490
+ "\n",
491
+ " d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)\n",
492
+ "\n",
493
+ " x, _ = model.predictor.lstm(d)\n",
494
+ " duration = model.predictor.duration_proj(x)\n",
495
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
496
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
497
+ "\n",
498
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
499
+ " c_frame = 0\n",
500
+ " for i in range(pred_aln_trg.size(0)):\n",
501
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
502
+ " c_frame += int(pred_dur[i].data)\n",
503
+ "\n",
504
+ " # encode prosody\n",
505
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
506
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
507
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
508
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
509
+ " \n",
510
+ " return out.squeeze().cpu().numpy(), s_pred"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "code",
515
+ "execution_count": null,
516
+ "id": "7ca0ef2e",
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "sentences = passage.split('.') # naive split on sentence-ending periods\n",
521
+ "wavs = []\n",
522
+ "s_prev = None\n",
523
+ "for text in sentences:\n",
524
+ " if text.strip() == \"\": continue\n",
525
+ " text += '.' # add it back\n",
526
+ " noise = torch.randn(1,1,256).to(device)\n",
527
+ " wav, s_prev = LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=10, embedding_scale=1.5)\n",
528
+ " wavs.append(wav)\n",
529
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))"
530
+ ]
531
+ }
532
+ ],
533
+ "metadata": {
534
+ "kernelspec": {
535
+ "display_name": "NLP",
536
+ "language": "python",
537
+ "name": "nlp"
538
+ },
539
+ "language_info": {
540
+ "codemirror_mode": {
541
+ "name": "ipython",
542
+ "version": 3
543
+ },
544
+ "file_extension": ".py",
545
+ "mimetype": "text/x-python",
546
+ "name": "python",
547
+ "nbconvert_exporter": "python",
548
+ "pygments_lexer": "ipython3",
549
+ "version": "3.9.7"
550
+ }
551
+ },
552
+ "nbformat": 4,
553
+ "nbformat_minor": 5
554
+ }
stts_48khz/StyleTTS2_48khz/Demo/Inference_LibriTTS.ipynb ADDED
@@ -0,0 +1,1242 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LibriTTS) 48khz\n",
9
+ "\n",
10
+ "Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `demo` folder."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "id": "6108384d",
16
+ "metadata": {},
17
+ "source": [
18
+ "### Utils"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 8,
24
+ "id": "96e173bf",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "1\n"
32
+ ]
33
+ }
34
+ ],
35
+ "source": [
36
+ "import os\n",
37
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"6\"\n",
38
+ "import torch\n",
39
+ "print(torch.cuda.device_count()) # should print: 1\n",
40
+ "\n",
41
+ "import torch\n",
42
+ "torch.manual_seed(0)\n",
43
+ "torch.backends.cudnn.benchmark = False\n",
44
+ "torch.backends.cudnn.deterministic = True\n",
45
+ "\n",
46
+ "import random\n",
47
+ "random.seed(0)\n",
48
+ "\n",
49
+ "import numpy as np\n",
50
+ "np.random.seed(0)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 10,
56
+ "id": "da84c60f",
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "name": "stdout",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz\n"
64
+ ]
65
+ }
66
+ ],
67
+ "source": [
68
+ "%cd /home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 13,
74
+ "id": "5a3ddcc8",
75
+ "metadata": {},
76
+ "outputs": [
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "177\n"
82
+ ]
83
+ }
84
+ ],
85
+ "source": [
86
+ "# load packages\n",
87
+ "import time\n",
88
+ "import random\n",
89
+ "import yaml\n",
90
+ "from munch import Munch\n",
91
+ "import numpy as np\n",
92
+ "import torch\n",
93
+ "from torch import nn\n",
94
+ "import torch.nn.functional as F\n",
95
+ "import torchaudio\n",
96
+ "import librosa\n",
97
+ "from nltk.tokenize import word_tokenize\n",
98
+ "\n",
99
+ "from models import *\n",
100
+ "from utils import *\n",
101
+ "from text_utils import TextCleaner\n",
102
+ "textclenaer = TextCleaner()\n",
103
+ "\n",
104
+ "%matplotlib inline"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 14,
110
+ "id": "00ee05e1",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
115
+ " n_mels=80, n_fft=2048, win_length=2048, hop_length=512)\n",
116
+ "mean, std = -4, 4\n",
117
+ "\n",
118
+ "def length_to_mask(lengths):\n",
119
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
120
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
121
+ " return mask\n",
122
+ "\n",
123
+ "def preprocess(wave):\n",
124
+ " wave_tensor = torch.from_numpy(wave).float()\n",
125
+ " mel_tensor = to_mel(wave_tensor)\n",
126
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
127
+ " return mel_tensor\n",
128
+ "\n",
129
+ "def compute_style(path):\n",
130
+ " wave, sr = librosa.load(path, sr=48000)\n",
131
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
132
+ " if sr != 48000:\n",
133
+ " audio = librosa.resample(audio, sr, 48000)\n",
134
+ " mel_tensor = preprocess(audio).to(device)\n",
135
+ "\n",
136
+ " with torch.no_grad():\n",
137
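+ " # acoustic style (style_encoder) and prosodic style (predictor_encoder) are concatenated into one 256-dim reference\n",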
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
138
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
139
+ "\n",
140
+ " return torch.cat([ref_s, ref_p], dim=1)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 15,
146
+ "id": "bbdc04c0",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "id": "7b9cecbe",
156
+ "metadata": {},
157
+ "source": [
158
+ "### Load models"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "64fc4c0f",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "# load phonemizer\n",
169
+ "import phonemizer\n",
170
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 17,
176
+ "id": "48e7b644",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "config = yaml.safe_load(open(\"/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/config_kanade_48khz.yml\"))\n",
181
+ "\n",
182
+ "# load pretrained ASR model\n",
183
+ "ASR_config = config.get('ASR_config', False)\n",
184
+ "ASR_path = config.get('ASR_path', False)\n",
185
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
186
+ "\n",
187
+ "# load pretrained F0 model\n",
188
+ "F0_path = config.get('F0_path', False)\n",
189
+ "pitch_extractor = load_F0_models(F0_path)\n",
190
+ "\n",
191
+ "# load BERT model\n",
192
+ "from Utils.PLBERT.util import load_plbert\n",
193
+ "BERT_path = config.get('PLBERT_dir', False)\n",
194
+ "plbert = load_plbert(BERT_path)"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 18,
200
+ "id": "ffc18cf7",
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "model_params = recursive_munch(config['model_params'])\n",
205
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
206
+ "_ = [model[key].eval() for key in model]\n",
207
+ "_ = [model[key].to(device) for key in model]"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 19,
213
+ "id": "64529d5c",
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "params_whole = torch.load(\"/home/austin/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/epoch_2nd_00006.pth\", map_location='cpu')\n",
218
+ "params = params_whole['net']"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 20,
224
+ "id": "895d9706",
225
+ "metadata": {},
226
+ "outputs": [
227
+ {
228
+ "name": "stdout",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "bert loaded\n",
232
+ "bert_encoder loaded\n",
233
+ "predictor loaded\n"
234
+ ]
235
+ },
236
+ {
237
+ "name": "stdout",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "decoder loaded\n",
241
+ "text_encoder loaded\n",
242
+ "predictor_encoder loaded\n",
243
+ "style_encoder loaded\n",
244
+ "diffusion loaded\n",
245
+ "text_aligner loaded\n",
246
+ "pitch_extractor loaded\n",
247
+ "mpd loaded\n",
248
+ "msd loaded\n",
249
+ "wd loaded\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "for key in model:\n",
255
+ " if key in params:\n",
256
+ " print('%s loaded' % key)\n",
257
+ " try:\n",
258
+ " model[key].load_state_dict(params[key])\n",
259
+ " except:\n",
260
+ " from collections import OrderedDict\n",
261
+ " state_dict = params[key]\n",
262
+ " new_state_dict = OrderedDict()\n",
263
+ " for k, v in state_dict.items():\n",
264
+ " name = k[7:] # remove `module.`\n",
265
+ " new_state_dict[name] = v\n",
266
+ " # load params\n",
267
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
268
+ "# except:\n",
269
+ "# _load(params[key], model[key])\n",
270
+ "_ = [model[key].eval() for key in model]"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 22,
276
+ "id": "e30985ab",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule\n",
281
+ "sampler = DiffusionSampler(\n",
282
+ " model.diffusion.diffusion,\n",
283
+ " sampler=ADPM2Sampler(),\n",
284
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
285
+ " clamp=False\n",
286
+ ")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "id": "b803110e",
292
+ "metadata": {},
293
+ "source": [
294
+ "### Synthesize speech"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 3,
300
+ "id": "ca57469c",
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
305
+ " # text = text.strip()\n",
306
+ " # ps = global_phonemizer.phonemize([text])\n",
307
+ " # ps = word_tokenize(ps[0])\n",
308
+ " # ps = ' '.join(ps)\n",
309
+ " tokens = textclenaer(text)\n",
310
+ " tokens.insert(0, 0)\n",
311
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
312
+ " \n",
313
+ " with torch.no_grad():\n",
314
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
315
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
316
+ "\n",
317
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
318
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
319
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
320
+ " \n",
321
+ "\n",
322
+ "\n",
323
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
324
+ " embedding=bert_dur,\n",
325
+ " embedding_scale=embedding_scale,\n",
326
+ " features=ref_s, # reference from the same speaker as the embedding\n",
327
+ " num_steps=diffusion_steps).squeeze(1)\n",
328
+ "\n",
329
+ "\n",
330
+ " s = s_pred[:, 128:]\n",
331
+ " ref = s_pred[:, :128]\n",
332
+ "\n",
333
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
334
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
335
+ "\n",
336
+ " d = model.predictor.text_encoder(d_en, \n",
337
+ " s, input_lengths, text_mask)\n",
338
+ "\n",
339
+ " x = model.predictor.lstm(d)\n",
340
+ " x_mod = model.predictor.prepare_projection(x) # 640 -> 512\n",
341
+ " duration = model.predictor.duration_proj(x_mod)\n",
342
+ "\n",
343
+ "\n",
344
+ " duration = torch.sigmoid(duration).sum(axis=-1) \n",
345
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
346
+ "\n",
347
+ "\n",
348
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
349
+ " c_frame = 0\n",
350
+ " for i in range(pred_aln_trg.size(0)):\n",
351
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
352
+ " c_frame += int(pred_dur[i].data)\n",
353
+ "\n",
354
+ " # encode prosody\n",
355
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
356
+ " if model_params.decoder.type == \"hifigan\":\n",
357
+ " asr_new = torch.zeros_like(en)\n",
358
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
359
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
360
+ " en = asr_new\n",
361
+ "\n",
362
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
363
+ "\n",
364
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
365
+ " if model_params.decoder.type == \"hifigan\":\n",
366
+ " asr_new = torch.zeros_like(asr)\n",
367
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
368
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
369
+ " asr = asr_new\n",
370
+ "\n",
371
+ " out = model.decoder(asr, \n",
372
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
373
+ " \n",
374
+ " \n",
375
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "id": "d438ef4f",
381
+ "metadata": {},
382
+ "source": [
383
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 4,
389
+ "id": "cace9787",
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "text = '''soɯ iɯ imi de wa, sendʑoɯgawaɽa çitagi wa, kawaʔta no de mo, koɯsei ɕita no de mo, modoʔta no de mo toɽikaeɕita no de mo nakɯ, maɕite, deɽeta no de mo doɽota no de mo nakɯ.'''\n",
394
+ "reference_dicts = {}\n",
395
+ "# reference_dicts['696_92939'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/imas_split/Syuuko/Syuuko_Events_and_Card/Event/saite_jewel/saite_jewel_chunk90.wav\"\n",
396
+ "# reference_dicts['1789_142896'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/imas_split/Kanade/Kanade_Events_and_Card/Kanade_Events/KanLipps/Kanade_lipps_02/Kanade_lipps_02_chunk9.wav\"\n",
397
+ "# reference_dicts['1789_14289w'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/sakura_moyu/01/01102220.wav\"\n",
398
+ "reference_dicts['1789_14289w'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/monogatari/monogatari_voices/monogatari_split/sakamoto_maya/Sakamoto_Maya_01/Sakamoto_Maya_01_chunk1709.wav\""
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": 5,
404
+ "id": "16e8ac60",
405
+ "metadata": {},
406
+ "outputs": [
407
+ {
408
+ "ename": "NameError",
409
+ "evalue": "name 'time' is not defined",
410
+ "output_type": "error",
411
+ "traceback": [
412
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
413
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
414
+ "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m start \u001b[38;5;241m=\u001b[39m \u001b[43mtime\u001b[49m\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 2\u001b[0m noise \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m256\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m k, path \u001b[38;5;129;01min\u001b[39;00m reference_dicts\u001b[38;5;241m.\u001b[39mitems():\n",
415
+ "\u001b[0;31mNameError\u001b[0m: name 'time' is not defined"
416
+ ]
417
+ }
418
+ ],
419
+ "source": [
420
+ "\n",
421
+ "\n",
422
+ "start = time.time()\n",
423
+ "noise = torch.randn(1,1,256).to(device)\n",
424
+ "for k, path in reference_dicts.items():\n",
425
+ " ref_s = compute_style(path)\n",
426
+ " \n",
427
+ " wav = inference(text, ref_s, alpha=0., beta=0.5, diffusion_steps=10, embedding_scale=2)\n",
428
+ " rtf = (time.time() - start) / (len(wav) / 48000)\n",
429
+ " print(f\"RTF = {rtf:5f}\")\n",
430
+ " import IPython.display as ipd\n",
431
+ " print(k + ' Synthesized:')\n",
432
+ " display(ipd.Audio(wav, rate=48000, normalize=False))\n",
433
+ " print('Reference:')\n",
434
+ " display(ipd.Audio(path, rate=48000, normalize=False))"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "markdown",
439
+ "id": "14838708",
440
+ "metadata": {},
441
+ "source": [
442
+ "#### With higher diffusion steps (more diverse)\n",
443
+ "\n",
444
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": null,
450
+ "id": "6fbff03b",
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "noise = torch.randn(1,1,256).to(device)\n",
455
+ "for k, path in reference_dicts.items():\n",
456
+ " ref_s = compute_style(path)\n",
457
+ " start = time.time()\n",
458
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
459
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
460
+ " print(f\"RTF = {rtf:5f}\")\n",
461
+ " import IPython.display as ipd\n",
462
+ " print(k + ' Synthesized:')\n",
463
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
464
+ " print(k + ' Reference:')\n",
465
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "markdown",
470
+ "id": "7e6867fd",
471
+ "metadata": {},
472
+ "source": [
473
+ "#### Basic synthesis (5 diffusion steps, umseen speakers)\n",
474
+ "The following samples are to reproduce samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All spsakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": null,
480
+ "id": "f4e8faa0",
481
+ "metadata": {},
482
+ "outputs": [],
483
+ "source": [
484
+ "reference_dicts = {}\n",
485
+ "# format: (path, text)\n",
486
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
487
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
488
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
489
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "id": "653f1406",
496
+ "metadata": {},
497
+ "outputs": [],
498
+ "source": [
499
+ "noise = torch.randn(1,1,256).to(device)\n",
500
+ "for k, v in reference_dicts.items():\n",
501
+ " path, text = v\n",
502
+ " ref_s = compute_style(path)\n",
503
+ " start = time.time()\n",
504
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
505
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
506
+ " print(f\"RTF = {rtf:5f}\")\n",
507
+ " import IPython.display as ipd\n",
508
+ " print(k + ' Synthesized: ' + text)\n",
509
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
510
+ " print(k + ' Reference:')\n",
511
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "markdown",
516
+ "id": "141e91b3",
517
+ "metadata": {},
518
+ "source": [
519
+ "### Speech expressiveness\n",
520
+ "\n",
521
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. \n",
522
+ "\n",
523
+ "#### With `embedding_scale=1`\n",
524
+ "This is the classifier-free guidance scale. The higher the scale, the more the sampled style is conditioned on the input text, and hence the more emotional the speech.\n",
525
+ "\n"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": null,
531
+ "id": "81addda4",
532
+ "metadata": {},
533
+ "outputs": [],
534
+ "source": [
535
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "id": "be1b2a11",
542
+ "metadata": {},
543
+ "outputs": [],
544
+ "source": [
545
+ "texts = {}\n",
546
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
547
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
548
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
549
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
550
+ "\n",
551
+ "for k,v in texts.items():\n",
552
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
553
+ " print(k + \": \")\n",
554
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "id": "96d262b8",
560
+ "metadata": {},
561
+ "source": [
562
+ "#### With `embedding_scale=2`"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "id": "3e7d40b4",
569
+ "metadata": {},
570
+ "outputs": [],
571
+ "source": [
572
+ "texts = {}\n",
573
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
574
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
575
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
576
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
577
+ "\n",
578
+ "for k,v in texts.items():\n",
579
+ " noise = torch.randn(1,1,256).to(device)\n",
580
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
581
+ " print(k + \": \")\n",
582
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "markdown",
587
+ "id": "402b2bd6",
588
+ "metadata": {},
589
+ "source": [
590
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
591
+ "`alpha` and `beta` control how much of the style sampled from the text is used instead of the reference style. The higher `alpha` and `beta`, the better the style suits the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker, while `beta` determines the prosody. "
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": null,
597
+ "id": "599de5d5",
598
+ "metadata": {},
599
+ "outputs": [],
600
+ "source": [
601
+ "texts = {}\n",
602
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
603
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
604
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
605
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
606
+ "\n",
607
+ "for k,v in texts.items():\n",
608
+ " noise = torch.randn(1,1,256).to(device)\n",
609
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
610
+ " print(k + \": \")\n",
611
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "markdown",
616
+ "id": "48548866",
617
+ "metadata": {},
618
+ "source": [
619
+ "### Zero-shot speaker adaptation\n",
620
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, about 250 times less data than was used to train Vall-E, yet it maintains these properties as well or better. "
621
+ ]
622
+ },
623
+ {
624
+ "cell_type": "markdown",
625
+ "id": "23e81572",
626
+ "metadata": {},
627
+ "source": [
628
+ "#### Acoustic Environment Maintenance\n",
629
+ "\n",
630
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to keep the speaker as close to the reference as possible while only changing the prosody according to the text. "
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": null,
636
+ "id": "8087bccb",
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "reference_dicts = {}\n",
641
+ "# format: (path, text)\n",
642
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
643
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
644
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "id": "1e99c200",
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": [
654
+ "noise = torch.randn(1,1,256).to(device)\n",
655
+ "for k, v in reference_dicts.items():\n",
656
+ " path, text = v\n",
657
+ " ref_s = compute_style(path)\n",
658
+ " start = time.time()\n",
659
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
660
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
661
+ " print(f\"RTF = {rtf:5f}\")\n",
662
+ " import IPython.display as ipd\n",
663
+ " print('Synthesized: ' + text)\n",
664
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
665
+ " print('Reference:')\n",
666
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "markdown",
671
+ "id": "7d56505d",
672
+ "metadata": {},
673
+ "source": [
674
+ "#### Speaker’s Emotion Maintenance\n",
675
+ "\n",
676
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change."
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "id": "f90179e7",
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "reference_dicts = {}\n",
687
+ "# format: (path, text)\n",
688
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
689
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
690
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
691
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
692
+ ]
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": null,
697
+ "id": "2e6bdfed",
698
+ "metadata": {},
699
+ "outputs": [],
700
+ "source": [
701
+ "noise = torch.randn(1,1,256).to(device)\n",
702
+ "for k, v in reference_dicts.items():\n",
703
+ " path, text = v\n",
704
+ " ref_s = compute_style(path)\n",
705
+ " start = time.time()\n",
706
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
707
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
708
+ " print(f\"RTF = {rtf:5f}\")\n",
709
+ " import IPython.display as ipd\n",
710
+ " print(k + ' Synthesized: ' + text)\n",
711
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
712
+ " print(k + ' Reference:')\n",
713
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "markdown",
718
+ "id": "37ae3963",
719
+ "metadata": {},
720
+ "source": [
721
+ "### Longform Narration\n",
722
+ "\n",
723
+ "This section includes a basic implementation of Algorithm 1 in the paper for consistent long-form audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "code",
728
+ "execution_count": null,
729
+ "id": "f12a716b",
730
+ "metadata": {},
731
+ "outputs": [],
732
+ "source": []
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 19,
737
+ "id": "a1a38079",
738
+ "metadata": {},
739
+ "outputs": [],
740
+ "source": [
741
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
742
+ " # text = text.strip()\n",
743
+ " # ps = global_phonemizer.phonemize([text])\n",
744
+ " # ps = word_tokenize(ps[0])\n",
745
+ " # ps = ' '.join(ps)\n",
746
+ " # ps = ps.replace('``', '\"')\n",
747
+ " # ps = ps.replace(\"''\", '\"')\n",
748
+ "\n",
749
+ " tokens = textclenaer(text)\n",
750
+ " tokens.insert(0, 0)\n",
751
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
752
+ " \n",
753
+ " with torch.no_grad():\n",
754
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
755
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
756
+ "\n",
757
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
758
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
759
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
760
+ "\n",
761
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
762
+ " embedding=bert_dur,\n",
763
+ " embedding_scale=embedding_scale,\n",
764
+ " features=ref_s, # reference from the same speaker as the embedding\n",
765
+ " num_steps=diffusion_steps).squeeze(1)\n",
766
+ " \n",
767
+ " if s_prev is not None:\n",
768
+ " # convex combination of previous and current style\n",
769
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
770
+ " \n",
771
+ " s = s_pred[:, 128:]\n",
772
+ " ref = s_pred[:, :128]\n",
773
+ " \n",
774
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
775
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
776
+ "\n",
777
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
778
+ "\n",
779
+ " d = model.predictor.text_encoder(d_en, \n",
780
+ " s, input_lengths, text_mask)\n",
781
+ "\n",
782
+ " x = model.predictor.lstm(d)\n",
783
+ " x_mod = model.predictor.prepare_projection(x) # 640 -> 512\n",
784
+ " duration = model.predictor.duration_proj(x_mod)\n",
785
+ "\n",
786
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
787
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
788
+ "\n",
789
+ "\n",
790
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
791
+ " c_frame = 0\n",
792
+ " for i in range(pred_aln_trg.size(0)):\n",
793
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
794
+ " c_frame += int(pred_dur[i].data)\n",
795
+ "\n",
796
+ " # encode prosody\n",
797
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
798
+ " if model_params.decoder.type == \"hifigan\":\n",
799
+ " asr_new = torch.zeros_like(en)\n",
800
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
801
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
802
+ " en = asr_new\n",
803
+ "\n",
804
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
805
+ "\n",
806
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
807
+ " if model_params.decoder.type == \"hifigan\":\n",
808
+ " asr_new = torch.zeros_like(asr)\n",
809
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
810
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
811
+ " asr = asr_new\n",
812
+ "\n",
813
+ " out = model.decoder(asr, \n",
814
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
815
+ " \n",
816
+ " \n",
817
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # trim a spurious pulse at the end of the output; to be fixed later"
818
+ ]
819
+ },
820
+ {
821
+ "cell_type": "code",
822
+ "execution_count": 75,
823
+ "id": "e9088f7a",
824
+ "metadata": {},
825
+ "outputs": [
826
+ {
827
+ "ename": "RuntimeError",
828
+ "evalue": "The size of tensor a (512) must match the size of tensor b (531) at non-singleton dimension 3",
829
+ "output_type": "error",
830
+ "traceback": [
831
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
832
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
833
+ "Cell \u001b[0;32mIn[75], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m text\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 13\u001b[0m text \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# add it back\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m wav, s_prev \u001b[38;5;241m=\u001b[39m \u001b[43mLFinference\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43ms_prev\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43ms_ref\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# make it more suitable for the text\u001b[39;49;00m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0.7\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mdiffusion_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1.5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m wavs\u001b[38;5;241m.\u001b[39mappend(wav)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSynthesized: \u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
834
+ "Cell \u001b[0;32mIn[19], line 64\u001b[0m, in \u001b[0;36mLFinference\u001b[0;34m(text, s_prev, ref_s, alpha, beta, t, diffusion_steps, embedding_scale)\u001b[0m\n\u001b[1;32m 61\u001b[0m asr_new[:, :, \u001b[38;5;241m1\u001b[39m:] \u001b[38;5;241m=\u001b[39m en[:, :, \u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 62\u001b[0m en \u001b[38;5;241m=\u001b[39m asr_new\n\u001b[0;32m---> 64\u001b[0m F0_pred, N_pred \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredictor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mF0Ntrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43men\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 66\u001b[0m asr \u001b[38;5;241m=\u001b[39m (t_en \u001b[38;5;241m@\u001b[39m pred_aln_trg\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device))\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_params\u001b[38;5;241m.\u001b[39mdecoder\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhifigan\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
835
+ "File \u001b[0;32m~/disk2/llmvcs/tt/stts_48khz/StyleTTS2_48khz/models.py:1651\u001b[0m, in \u001b[0;36mProsodyPredictor.F0Ntrain\u001b[0;34m(self, x, s)\u001b[0m\n\u001b[1;32m 1647\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mF0Ntrain\u001b[39m(\u001b[38;5;28mself\u001b[39m, x, s):\n\u001b[0;32m-> 1651\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshared\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1652\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_projection(x)\n\u001b[1;32m 1655\u001b[0m F0 \u001b[38;5;241m=\u001b[39m x\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m)\n",
836
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
837
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
838
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/xlstm/xlstm_block_stack.py:120\u001b[0m, in \u001b[0;36mxLSTMBlockStack.forward\u001b[0;34m(self, x, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x: torch\u001b[38;5;241m.\u001b[39mTensor, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m block \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocks:\n\u001b[0;32m--> 120\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mblock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_blocks_norm(x)\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n",
839
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
840
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
841
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/xlstm/blocks/xlstm_block.py:77\u001b[0m, in \u001b[0;36mxLSTMBlock.forward\u001b[0;34m(self, x, **kwargs)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x: torch\u001b[38;5;241m.\u001b[39mTensor, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[0;32m---> 77\u001b[0m x \u001b[38;5;241m=\u001b[39m x \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mxlstm\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mxlstm_norm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mffn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 79\u001b[0m x \u001b[38;5;241m=\u001b[39m x \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mffn(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mffn_norm(x), \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
842
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
843
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
844
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/xlstm/blocks/mlstm/layer.py:116\u001b[0m, in \u001b[0;36mmLSTMLayer.forward\u001b[0;34m(self, x, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mk_proj(x_mlstm_conv_act)\n\u001b[1;32m 114\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mv_proj(x_mlstm)\n\u001b[0;32m--> 116\u001b[0m h_tilde_state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmlstm_cell\u001b[49m\u001b[43m(\u001b[49m\u001b[43mq\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 118\u001b[0m h_tilde_state_skip \u001b[38;5;241m=\u001b[39m h_tilde_state \u001b[38;5;241m+\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlearnable_skip \u001b[38;5;241m*\u001b[39m x_mlstm_conv_act)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# output / z branch\u001b[39;00m\n",
845
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
846
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
847
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/xlstm/blocks/mlstm/cell.py:61\u001b[0m, in \u001b[0;36mmLSTMCell.forward\u001b[0;34m(self, q, k, v, **kwargs)\u001b[0m\n\u001b[1;32m 58\u001b[0m fgate_preact \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfgate(if_gate_input) \u001b[38;5;66;03m# (B, S, NH)\u001b[39;00m\n\u001b[1;32m 59\u001b[0m fgate_preact \u001b[38;5;241m=\u001b[39m fgate_preact\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# (B, NH, S, 1)#\u001b[39;00m\n\u001b[0;32m---> 61\u001b[0m h_state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_fn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 62\u001b[0m \u001b[43m \u001b[49m\u001b[43mqueries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 64\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[43m \u001b[49m\u001b[43migate_preact\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43migate_preact\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 66\u001b[0m \u001b[43m \u001b[49m\u001b[43mfgate_preact\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfgate_preact\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[43m \u001b[49m\u001b[43mlower_triangular_matrix\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcausal_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 68\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# (B, NH, S, DH)\u001b[39;00m\n\u001b[1;32m 70\u001b[0m h_state_norm \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutnorm(h_state) \u001b[38;5;66;03m# (B, NH, S, DH)\u001b[39;00m\n\u001b[1;32m 71\u001b[0m h_state_norm \u001b[38;5;241m=\u001b[39m h_state_norm\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(B, S, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# (B, NH, S, DH) -> (B, S, NH, DH) -> (B, S, H)\u001b[39;00m\n",
848
+ "File \u001b[0;32m~/disk2/micromamba/envs/sttszs/lib/python3.11/site-packages/xlstm/blocks/mlstm/backends.py:64\u001b[0m, in \u001b[0;36mparallel_stabilized_simple\u001b[0;34m(queries, keys, values, igate_preact, fgate_preact, lower_triangular_matrix, stabilize_rowwise, eps, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m _log_fg_matrix \u001b[38;5;241m=\u001b[39m rep_log_fgates_cumsum \u001b[38;5;241m-\u001b[39m rep_log_fgates_cumsum\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# (B, NH, S+1, S+1)\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# Causal masking & selection of the correct submatrix, such that forgetgate at timestep t is not applied\u001b[39;00m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# to the input at timestep t\u001b[39;00m\n\u001b[0;32m---> 64\u001b[0m log_fg_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m(\u001b[49m\u001b[43mltr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_log_fg_matrix\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# (B, NH, S, S)\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;66;03m# gate decay matrix D (combination of forget gate and input gate)\u001b[39;00m\n\u001b[1;32m 67\u001b[0m log_D_matrix \u001b[38;5;241m=\u001b[39m log_fg_matrix \u001b[38;5;241m+\u001b[39m igate_preact\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# (B, NH, S, S)\u001b[39;00m\n",
849
+ "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (512) must match the size of tensor b (531) at non-singleton dimension 3"
850
+ ]
851
+ }
852
+ ],
853
+ "source": [
854
+ "# unseen speaker\n",
855
+ "passage = '''ohoɕisama, doko kaɽa kita no? kiʔto, oːkina ɯtɕɯɯ no jɯme o miterɯɴ da ne. sono jɯme wa, wataɕitatɕi no kokoɽo ni mo todokɯ joɯ na, totemo fɯkai jɯme naɴ daɽoɯ ne. ohoɕisama ga çikarɯ jorɯ wa, wataɕi mo sono jɯme o mite, aɕita no boɯkeɴ o soɯzoɯ sɯrɯɴ da. dakaɽa, ohoɕisama, zɯʔto iʔɕo ni ite ne.'''\n",
856
+ "# reference_dicts = {}\n",
857
+ "# reference_dicts['696_92939'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/imas_split/Syuuko/Syuuko_Events_and_Card/Event/saite_jewel/saite_jewel_chunk90.wav\"\n",
858
+ "# reference_dicts['1789_142896'] = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/imas_split/Kanade/Kanade_Events_and_Card/Kanade_Events/KanLipps/Kanade_lipps_02/Kanade_lipps_02_chunk9.wav\"\n",
859
+ "path = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/sakura_moyu/01/01001290.wav\"\n",
860
+ "s_ref = compute_style(path)\n",
861
+ "sentences = passage.split('.') # simple split by comma\n",
862
+ "wavs = []\n",
863
+ "s_prev = None\n",
864
+ "for text in sentences:\n",
865
+ " if text.strip() == \"\": continue\n",
866
+ " text += ',' # add it back\n",
867
+ " \n",
868
+ " wav, s_prev = LFinference(text, \n",
869
+ " s_prev, \n",
870
+ " s_ref, \n",
871
+ " alpha = 0.1, \n",
872
+ " beta = 0.3, # make it more suitable for the text\n",
873
+ " t = 0.7, \n",
874
+ " diffusion_steps=10, embedding_scale=1.5)\n",
875
+ " wavs.append(wav)\n",
876
+ "print('Synthesized: ')\n",
877
+ "display(ipd.Audio(np.concatenate(wavs), rate=48000, normalize=False))\n",
878
+ "print('Reference: ')\n",
879
+ "display(ipd.Audio(path, rate=48000, normalize=False))"
880
+ ]
881
+ },
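The `RuntimeError` captured above ("The size of tensor a (512) must match the size of tensor b (531)") is raised inside the xLSTM prosody predictor: `parallel_stabilized_simple` applies a causal mask that appears to have been built for a fixed context length of 512 positions, while this sentence expands to 531 predicted frames. A minimal workaround sketch, not part of the repo (the `split_long` helper and the `max_chars` bound are hypothetical, and the character bound is only a rough proxy for the real 512-frame limit): break over-long sentences into smaller chunks before handing them to `LFinference`.

```python
# Hypothetical helper (not in the repo): split a long sentence into smaller
# chunks so the predicted frame count stays under the xLSTM context length.
def split_long(sentence, max_chars=120):
    if len(sentence) <= max_chars:
        return [sentence]
    chunks, current = [], ""
    for token in sentence.split(" "):
        candidate = (current + " " + token).strip()
        if len(candidate) > max_chars and current:
            chunks.append(current)
            current = token
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks

# Usage sketch: feed the shorter chunks through the same loop as above.
# for text in sentences:
#     for chunk in split_long(text):
#         wav, s_prev = LFinference(chunk, s_prev, s_ref, alpha=0.1, beta=0.3,
#                                   t=0.7, diffusion_steps=10, embedding_scale=1.5)
#         wavs.append(wav)
```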
882
+ {
883
+ "cell_type": "markdown",
884
+ "id": "7517b657",
885
+ "metadata": {},
886
+ "source": [
887
+ "### Style Transfer\n",
888
+ "\n",
889
+ "The following section demostrates the style transfer capacity for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style). "
890
+ ]
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "execution_count": null,
895
+ "id": "ed95d0f7",
896
+ "metadata": {},
897
+ "outputs": [],
898
+ "source": [
899
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
900
+ " text = text.strip()\n",
901
+ " ps = global_phonemizer.phonemize([text])\n",
902
+ " ps = word_tokenize(ps[0])\n",
903
+ " ps = ' '.join(ps)\n",
904
+ "\n",
905
+ " tokens = textclenaer(ps)\n",
906
+ " tokens.insert(0, 0)\n",
907
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
908
+ " \n",
909
+ " ref_text = ref_text.strip()\n",
910
+ " ps = global_phonemizer.phonemize([ref_text])\n",
911
+ " ps = word_tokenize(ps[0])\n",
912
+ " ps = ' '.join(ps)\n",
913
+ "\n",
914
+ " ref_tokens = textclenaer(ps)\n",
915
+ " ref_tokens.insert(0, 0)\n",
916
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
917
+ " \n",
918
+ " \n",
919
+ " with torch.no_grad():\n",
920
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
921
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
922
+ "\n",
923
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
924
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
925
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
926
+ " \n",
927
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
928
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
929
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
930
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
931
+ " embedding=bert_dur,\n",
932
+ " embedding_scale=embedding_scale,\n",
933
+ " features=ref_s, # reference from the same speaker as the embedding\n",
934
+ " num_steps=diffusion_steps).squeeze(1)\n",
935
+ "\n",
936
+ "\n",
937
+ " s = s_pred[:, 128:]\n",
938
+ " ref = s_pred[:, :128]\n",
939
+ "\n",
940
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
941
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
942
+ "\n",
943
+ " d = model.predictor.text_encoder(d_en, \n",
944
+ " s, input_lengths, text_mask)\n",
945
+ "\n",
946
+ " x, _ = model.predictor.lstm(d)\n",
947
+ " duration = model.predictor.duration_proj(x)\n",
948
+ "\n",
949
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
950
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
951
+ "\n",
952
+ "\n",
953
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
954
+ " c_frame = 0\n",
955
+ " for i in range(pred_aln_trg.size(0)):\n",
956
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
957
+ " c_frame += int(pred_dur[i].data)\n",
958
+ "\n",
959
+ " # encode prosody\n",
960
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
961
+ " if model_params.decoder.type == \"hifigan\":\n",
962
+ " asr_new = torch.zeros_like(en)\n",
963
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
964
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
965
+ " en = asr_new\n",
966
+ "\n",
967
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
968
+ "\n",
969
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
970
+ " if model_params.decoder.type == \"hifigan\":\n",
971
+ " asr_new = torch.zeros_like(asr)\n",
972
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
973
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
974
+ " asr = asr_new\n",
975
+ "\n",
976
+ " out = model.decoder(asr, \n",
977
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
978
+ " \n",
979
+ " \n",
980
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
981
+ ]
982
+ },
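For readers skimming `STinference` above: the two `alpha`/`beta` lines are plain convex blends between the diffusion-sampled style `s_pred` and the style computed from the reference audio `ref_s`, applied separately to the acoustic (timbre) half and the prosodic half of the 256-dimensional style vector. A small sketch of that mixing step (the `blend_style` helper is ours, not part of the codebase):

```python
import torch

def blend_style(sampled: torch.Tensor, reference: torch.Tensor, weight: float) -> torch.Tensor:
    """weight = 1.0 -> fully sampled (most diverse); weight = 0.0 -> fully reference."""
    return weight * sampled + (1.0 - weight) * reference

# In STinference: dims 0:128 are the acoustic/timbre style (blended with alpha),
# dims 128:256 are the prosodic style (blended with beta), i.e.
# ref = blend_style(s_pred[:, :128], ref_s[:, :128], alpha)
# s   = blend_style(s_pred[:, 128:], ref_s[:, 128:], beta)
```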
983
+ {
984
+ "cell_type": "code",
985
+ "execution_count": null,
986
+ "id": "ec3f0da4",
987
+ "metadata": {},
988
+ "outputs": [],
989
+ "source": [
990
+ "# reference texts to sample styles\n",
991
+ "\n",
992
+ "ref_texts = {}\n",
993
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
994
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
995
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
996
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
997
+ ]
998
+ },
999
+ {
1000
+ "cell_type": "code",
1001
+ "execution_count": null,
1002
+ "id": "6d0a3825",
1003
+ "metadata": {
1004
+ "scrolled": false
1005
+ },
1006
+ "outputs": [],
1007
+ "source": [
1008
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1009
+ "s_ref = compute_style(path)\n",
1010
+ "\n",
1011
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
1012
+ "for k,v in ref_texts.items():\n",
1013
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
1014
+ " print(k + \": \")\n",
1015
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "markdown",
1020
+ "id": "6750aed9",
1021
+ "metadata": {},
1022
+ "source": [
1023
+ "### Speech diversity\n",
1024
+ "\n",
1025
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. \n",
1026
+ "\n",
1027
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
1028
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize a speech it will be totally different). \n",
1029
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most siimlar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis). \n"
1030
+ ]
1031
+ },
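The five cells below differ only in their `alpha`/`beta` values; if you prefer the whole comparison in one place, a compact sweep such as the following (a sketch that reuses the `inference`, `compute_style`, and `ipd` objects already defined in this notebook) produces the same set of samples:

```python
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)
text = "How much variation is there?"

# (alpha, beta) from most reference-faithful (no variation) to fully sampled style
settings = [(0.0, 0.0), (0.1, 0.3), (0.3, 0.7), (0.5, 0.95), (1.0, 1.0)]
for alpha, beta in settings:
    print(f"alpha={alpha}, beta={beta}")
    wav = inference(text, ref_s, diffusion_steps=10,
                    alpha=alpha, beta=beta, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))
```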
1032
+ {
1033
+ "cell_type": "markdown",
1034
+ "id": "f6ae0aa5",
1035
+ "metadata": {},
1036
+ "source": [
1037
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
1038
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody and use the diffusion model to sample them based on the text. "
1039
+ ]
1040
+ },
1041
+ {
1042
+ "cell_type": "code",
1043
+ "execution_count": null,
1044
+ "id": "36dc0148",
1045
+ "metadata": {},
1046
+ "outputs": [],
1047
+ "source": [
1048
+ "# unseen speaker\n",
1049
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1050
+ "ref_s = compute_style(path)\n",
1051
+ "\n",
1052
+ "text = \"How much variation is there?\"\n",
1053
+ "for _ in range(5):\n",
1054
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
1055
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1056
+ ]
1057
+ },
1058
+ {
1059
+ "cell_type": "markdown",
1060
+ "id": "bf9ef421",
1061
+ "metadata": {},
1062
+ "source": [
1063
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
1064
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at cost of less diverse samples. "
1065
+ ]
1066
+ },
1067
+ {
1068
+ "cell_type": "code",
1069
+ "execution_count": null,
1070
+ "id": "9ba406bd",
1071
+ "metadata": {},
1072
+ "outputs": [],
1073
+ "source": [
1074
+ "# unseen speaker\n",
1075
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1076
+ "ref_s = compute_style(path)\n",
1077
+ "\n",
1078
+ "text = \"How much variation is there?\"\n",
1079
+ "for _ in range(5):\n",
1080
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
1081
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1082
+ ]
1083
+ },
1084
+ {
1085
+ "cell_type": "markdown",
1086
+ "id": "a38fe464",
1087
+ "metadata": {},
1088
+ "source": [
1089
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
1090
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 100% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker. "
1091
+ ]
1092
+ },
1093
+ {
1094
+ "cell_type": "code",
1095
+ "execution_count": null,
1096
+ "id": "5f25bf94",
1097
+ "metadata": {},
1098
+ "outputs": [],
1099
+ "source": [
1100
+ "# unseen speaker\n",
1101
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1102
+ "ref_s = compute_style(path)\n",
1103
+ "\n",
1104
+ "text = \"How much variation is there?\"\n",
1105
+ "for _ in range(5):\n",
1106
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
1107
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1108
+ ]
1109
+ },
1110
+ {
1111
+ "cell_type": "markdown",
1112
+ "id": "21c3a071",
1113
+ "metadata": {},
1114
+ "source": [
1115
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
1116
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker. "
1117
+ ]
1118
+ },
1119
+ {
1120
+ "cell_type": "code",
1121
+ "execution_count": null,
1122
+ "id": "fff8bab1",
1123
+ "metadata": {},
1124
+ "outputs": [],
1125
+ "source": [
1126
+ "# unseen speaker\n",
1127
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1128
+ "ref_s = compute_style(path)\n",
1129
+ "\n",
1130
+ "text = \"How much variation is there?\"\n",
1131
+ "for _ in range(5):\n",
1132
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1133
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1134
+ ]
1135
+ },
1136
+ {
1137
+ "cell_type": "markdown",
1138
+ "id": "a8741e5a",
1139
+ "metadata": {},
1140
+ "source": [
1141
+ "#### No variation (`alpha = 0, beta=0`)\n",
1142
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very similar to the reference speaker, but there is no variation. "
1143
+ ]
1144
+ },
1145
+ {
1146
+ "cell_type": "code",
1147
+ "execution_count": null,
1148
+ "id": "e55dd281",
1149
+ "metadata": {},
1150
+ "outputs": [],
1151
+ "source": [
1152
+ "# unseen speaker\n",
1153
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1154
+ "ref_s = compute_style(path)\n",
1155
+ "\n",
1156
+ "text = \"How much variation is there?\"\n",
1157
+ "for _ in range(5):\n",
1158
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1159
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1160
+ ]
1161
+ },
1162
+ {
1163
+ "cell_type": "markdown",
1164
+ "id": "d5e86423",
1165
+ "metadata": {},
1166
+ "source": [
1167
+ "### Extra fun!\n",
1168
+ "\n",
1169
+ "Here we clone some of the authors' voice of the StyleTTS 2 papers with a few seconds of the recording in the wild. None of the voices is in the dataset and all authors agreed to have their voices cloned here."
1170
+ ]
1171
+ },
1172
+ {
1173
+ "cell_type": "code",
1174
+ "execution_count": null,
1175
+ "id": "6f558314",
1176
+ "metadata": {},
1177
+ "outputs": [],
1178
+ "source": [
1179
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
1180
+ ]
1181
+ },
1182
+ {
1183
+ "cell_type": "code",
1184
+ "execution_count": null,
1185
+ "id": "caa5747c",
1186
+ "metadata": {},
1187
+ "outputs": [],
1188
+ "source": [
1189
+ "reference_dicts = {}\n",
1190
+ "reference_dicts['Yinghao'] = \"Demo/reference_audio/Yinghao.wav\"\n",
1191
+ "reference_dicts['Gavin'] = \"Demo/reference_audio/Gavin.wav\"\n",
1192
+ "reference_dicts['Vinay'] = \"Demo/reference_audio/Vinay.wav\"\n",
1193
+ "reference_dicts['Nima'] = \"Demo/reference_audio/Nima.wav\""
1194
+ ]
1195
+ },
1196
+ {
1197
+ "cell_type": "code",
1198
+ "execution_count": null,
1199
+ "id": "44a4cea1",
1200
+ "metadata": {
1201
+ "scrolled": false
1202
+ },
1203
+ "outputs": [],
1204
+ "source": [
1205
+ "start = time.time()\n",
1206
+ "noise = torch.randn(1,1,256).to(device)\n",
1207
+ "for k, path in reference_dicts.items():\n",
1208
+ " ref_s = compute_style(path)\n",
1209
+ " \n",
1210
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1211
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1212
+ " print('Speaker: ' + k)\n",
1213
+ " import IPython.display as ipd\n",
1214
+ " print('Synthesized:')\n",
1215
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1216
+ " print('Reference:')\n",
1217
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1218
+ ]
1219
+ }
1220
+ ],
1221
+ "metadata": {
1222
+ "kernelspec": {
1223
+ "display_name": "Python 3",
1224
+ "language": "python",
1225
+ "name": "python3"
1226
+ },
1227
+ "language_info": {
1228
+ "codemirror_mode": {
1229
+ "name": "ipython",
1230
+ "version": 3
1231
+ },
1232
+ "file_extension": ".py",
1233
+ "mimetype": "text/x-python",
1234
+ "name": "python",
1235
+ "nbconvert_exporter": "python",
1236
+ "pygments_lexer": "ipython3",
1237
+ "version": "3.11.10"
1238
+ }
1239
+ },
1240
+ "nbformat": 4,
1241
+ "nbformat_minor": 5
1242
+ }
stts_48khz/StyleTTS2_48khz/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/config_kanade.yml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Models/Style_Kanade"
2
+ first_stage_path: "/home/austin/disk2/llmvcs/tt/stylekan/Models/Style_Kanade/epoch_1st_00013.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 25 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 15 # number of epochs for second stage training (joint training)
8
+ batch_size: 24
9
+ max_len: 560 # maximum number of frames
10
+ pretrained_model: "/home/austin/disk2/llmvcs/tt/stylekan/Models/Style_Kanade/succ_epoch_2nd_00002.pth"
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ # F0_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/JDC/bst_rmvpe_48k.t7"
15
+ # ASR_config: "Utils/ASR/config.yml"
16
+ # ASR_path: "/home/ubuntu/STTS_48khz/StyleTTS2-48khz/Utils/ASR/epoch_00050_48K.pth"
17
+
18
+
19
+
20
+ F0_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/JDC/bst.t7"
21
+ ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
22
+ ASR_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/bst_00080.pth"
23
+
24
+ PLBERT_dir: 'Utils/PLBERT/'
25
+
26
+ data_params:
27
+ train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/filtered_train_list.csv"
28
+ val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/mg_valid.txt"
29
+ root_path: ""
30
+ OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
31
+ min_length: 50 # keep sampling OOD texts until one of at least this length is obtained
32
+
33
+
34
+ preprocess_params:
35
+ sr: 24000
36
+ spect_params:
37
+ n_fft: 2048
38
+ win_length: 1200
39
+ hop_length: 300
40
+
41
+ model_params:
42
+ multispeaker: true
43
+
44
+ dim_in: 64
45
+ hidden_dim: 512
46
+ max_conv_dim: 512
47
+ n_layer: 3
48
+ n_mels: 80
49
+
50
+ n_token: 178 # number of phoneme tokens
51
+ max_dur: 50 # maximum duration of a single phoneme
52
+ style_dim: 128 # style vector size
53
+
54
+ dropout: 0.2
55
+
56
+ decoder:
57
+ type: 'istftnet' # either hifigan or istftnet
58
+ resblock_kernel_sizes: [3,7,11]
59
+ upsample_rates : [10, 6]
60
+ upsample_initial_channel: 512
61
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
62
+ upsample_kernel_sizes: [20, 12]
63
+ gen_istft_n_fft: 20
64
+ gen_istft_hop_size: 5
65
+
66
+
67
+
68
+ # speech language model config
69
+ slm:
70
+ model: 'Respair/Whisper_Large_v2_Encoder_Block' # the SLM model itself is hardcoded; change it in losses.py
71
+ sr: 16000 # sampling rate of SLM
72
+ hidden: 1280 # hidden size of SLM
73
+ nlayers: 33 # number of layers of SLM
74
+ initial_channel: 64 # initial channels of SLM discriminator head
75
+
76
+ # style diffusion model config
77
+ diffusion:
78
+ embedding_mask_proba: 0.1
79
+ # transformer config
80
+ transformer:
81
+ num_layers: 3
82
+ num_heads: 8
83
+ head_features: 64
84
+ multiplier: 2
85
+
86
+ # diffusion distribution config
87
+ dist:
88
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
89
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
90
+ mean: -3.0
91
+ std: 1.0
92
+
93
+ loss_params:
94
+ lambda_mel: 10. # mel reconstruction loss
95
+ lambda_gen: 1. # generator loss
96
+ lambda_slm: 1. # slm feature matching loss
97
+
98
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
99
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
100
+ TMA_epoch: 9 # TMA starting epoch (1st stage)
101
+
102
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
103
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
104
+ lambda_dur: 1. # duration loss (2nd stage)
105
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
106
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
107
+ lambda_diff: 1. # score matching loss (2nd stage)
108
+
109
+ diff_epoch: 2 # style diffusion starting epoch (2nd stage)
110
+ joint_epoch: 6 # joint training starting epoch (2nd stage)
111
+
112
+ optimizer_params:
113
+ lr: 0.0001 # general learning rate
114
+ bert_lr: 0.00001 # learning rate for PLBERT
115
+ ft_lr: 0.00001 # learning rate for acoustic modules
116
+
117
+ slmadv_params:
118
+ min_len: 400 # minimum length of samples
119
+ max_len: 500 # maximum length of samples
120
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
121
+ iter: 20 # update the discriminator once every this many generator updates
122
+ thresh: 5 # gradient norm above which the gradient is scaled
123
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
124
+ sig: 1.5 # sigma for differentiable duration modeling
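For completeness, this is a plain YAML file; a minimal loading sketch is shown below. The `recursive_munch` helper mirrors the one used elsewhere in the StyleTTS2 codebase, but treat the exact path (relative to `stts_48khz/StyleTTS2_48khz/`) and the helper itself as assumptions rather than the repo's official loader.

```python
import yaml
from munch import Munch

def recursive_munch(d):
    # Convert nested dicts/lists into attribute-accessible Munch objects.
    if isinstance(d, dict):
        return Munch((k, recursive_munch(v)) for k, v in d.items())
    if isinstance(d, list):
        return [recursive_munch(v) for v in d]
    return d

config = yaml.safe_load(open("Models/Style_Kanade/config_kanade.yml"))
model_params = recursive_munch(config["model_params"])
print(model_params.decoder.type)          # 'istftnet'
print(config["preprocess_params"]["sr"])  # 24000
```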
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/tensorboard/events.out.tfevents.1728511195.node-1.1421403.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b345ee45d60a91882d9371bbc7e7f2e516d966f6ea5b1c9152a12fb4b3c5e7f2
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade/train.log ADDED
File without changes
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_165885.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d88946c2ad6951102438d4377a0de03e8e932cdb921a96491c8302d2ccee062
3
+ size 2634096094
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_65527.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0234f6214df09eb84fccc4da8a32abbcafa66e538443f50ab0248547f7fb536
3
+ size 2056542305
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/2nd_phase_last.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:837323ec6d6a23319c3e80c169595a13010a4ed783015c4b727e7e26bae2928e
3
+ size 2056545874
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/DP_epoch_2nd_00004.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebade56e9e2aab901b8c80811a807201b34a64eb56dfc267e84e71a275f09f48
3
+ size 1522665000
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/NO_SLM_epoch_2nd_00009.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:845b43f224edc0ad3a57d3a5bfaa2c293a28cbf3caa8c80a7bb9fa1bb08d0470
3
+ size 2056549032
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/NO_SLM_epoch_2nd_00010.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79aa6cd825f51004625f7ba8806da65c6fcc95484dacfb7f862982b81b4a0308
3
+ size 2056549032
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462180.node-1.1003680.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f7b7ad158fbb253067e0aac2b0dd0d55aedce480f843a0da1df65c621cba037
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462294.node-1.1004682.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fe150cc2d1132dd2eb70b41c013159ef0d7f025f66c81a404da755a4535b8cb
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462472.node-1.1005638.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851ea96b4c47b93d43be6f80acff6230535e6bfbe99c36667589b124441c38cb
3
+ size 2560
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728462951.node-1.1007312.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f59ad7609f349f7396199de23168c05c873f722309d0b0f17830b46c430bb07
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463094.node-1.1008219.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5df01fc68dfb3b019689711260ed2a9321af4c95fdddbb2a96229d5978396625
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463336.node-1.1010823.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d76a0888871f133e42fc11450319a6561a6495a789b7ae87d47ecd65b4f97a
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463388.node-1.1011249.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2545722cbdece2a028da4a0a5b270adc17334aab1fbae91501816d21a7464ba3
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463515.node-1.1013548.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d20009b085d787b22a11e53fae855158377954e651cafc1f47cbb9a00b724ad7
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728463957.node-1.1016238.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487baec629ff2b35ef5a5dee52804320ff18926173581103b1e8707b68bd0065
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464009.node-1.1016738.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31294dbd8a51f7d1bcc38516e988fb01106b7e3a95b221746b2ef8265e6f2041
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464233.node-1.1019060.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c1c0bf4b9a6b7df5234a63b7a053f3566cdc36ff8f767eb38f0385df5adc12
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464354.node-1.1019744.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5120ccb9594416d07241b12993c9062eaef365a5143f138487b5142f3651fbf
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464586.node-1.1020751.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2f9e3f52c93ab01722243e4b44521e9f56eb045b9a0fc310dbe4451933ab70f
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464707.node-1.1021516.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30a01b19998be89e637bd8a7c708043a48040b16666113b33b220bc75b7a1e8
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464831.node-1.1022361.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21a04faa35f646804b73e9dbb99f9959c273c80ee8adeeb8b3d31120575db24
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728464900.node-1.1022907.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff57ace3d453046db3a73d86ca648eb0d41e380919dadc739ad58ef6fab3a82
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465001.node-1.1025007.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe554f916ef302f23d632f05cba6257659a3f4690450ab700e91f6da91439a80
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465067.node-1.1026980.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5def090452ecadea26774681015c59eb6156b32b32c18043ee1c9f5a463649a0
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465130.node-1.1028957.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36986ddd50b5ac71411f0d8c23dd32b29f955e1eb687b626db75c7db9d454b58
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465526.node-1.1031919.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd287d2ae4069eef2ce60bb1f8e4d82ce6f0aefba2ae8b500d40986446cb0357
3
+ size 1324
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465719.node-1.1034258.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a3b4ae4ac7f2b657745d442f650035264c9bd73bcc6946c55c3ad68736b31a
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465773.node-1.1034708.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7a3dedca2882cc634dfc00ec09320fb229b606405d02dd7028fcefc78c10d9f
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728465956.node-1.1037028.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:142f9797269c386eae3311cb4c463edbe4db04085b3211ab6cd2e25774125efd
3
+ size 4414
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466151.node-1.1039623.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6ec80c55b1ac401c4154cec4959b4a6b580413479e9c39ad6527889bbea685
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466402.node-1.1042081.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2212f2ad84e8300d7c193b214dae7021fba8336b9e5a8bed28a95d63e67de435
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466494.node-1.1044247.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eae22e2ba3ce18c37779e09b95aa6c6b05c73c9884fd725a412b5d35ff73e18a
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466592.node-1.1046287.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28504f4d924fd17bd6c5ad2a5622bf78a16dc81a71ad9cb9832a9c8b6ec72e3b
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466648.node-1.1048190.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29375abbff4881e86b811dccec6d07e7bb4c45395e9be31210d760248689cb95
3
+ size 88
stts_48khz/StyleTTS2_48khz/Models/Style_Kanade_48khz/StyleTTS2-Second-Stage/events.out.tfevents.1728466722.node-1.1050256.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a0a5169b6144f9da5dad9b9cde28cdb7f582da193f133b540c778d9959793c
3
+ size 88