jbetker committed on
Commit
56b10cc
1 Parent(s): 54a946d

Add colab notebook

Browse files
Files changed (1) hide show
  1. tortoise_tts.ipynb +248 -0
tortoise_tts.ipynb ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "name": "tortoise-tts.ipynb",
7
+ "provenance": [],
8
+ "collapsed_sections": []
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU"
18
+ },
19
+ "cells": [
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "metadata": {
24
+ "id": "JrK20I32grP6"
25
+ },
26
+ "outputs": [],
27
+ "source": [
28
+ "!git clone https://github.com/neonbjb/tortoise-tts.git\n",
29
+ "%cd tortoise-tts\n",
30
+ "!pip install -r requirements.txt"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "source": [
36
+ "# Imports used through the rest of the notebook.\n",
37
+ "import torch\n",
38
+ "import torchaudio\n",
39
+ "import torch.nn as nn\n",
40
+ "import torch.nn.functional as F\n",
41
+ "from tqdm import tqdm\n",
42
+ "\n",
43
+ "from utils.tokenizer import VoiceBpeTokenizer\n",
44
+ "from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder\n",
45
+ "from models.text_voice_clip import VoiceCLIP\n",
46
+ "from models.dvae import DiscreteVAE\n",
47
+ "from models.autoregressive import UnifiedVoice\n",
48
+ "\n",
49
+ "# These have some fairly interesting code that is hidden in the colab. Consider checking it out.\n",
50
+ "from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion"
51
+ ],
52
+ "metadata": {
53
+ "id": "Gen09NM4hONQ"
54
+ },
55
+ "execution_count": null,
56
+ "outputs": []
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "# Download pretrained models and set up pretrained voice bank. Feel free to upload and add your own voices here.\n",
62
+ "# To do so, upload two WAV files cropped to 5-10 seconds of someone speaking.\n",
63
+ "download_models()\n",
64
+ "preselected_cond_voices = {\n",
65
+ " # Male voices\n",
66
+ " 'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],\n",
67
+ " 'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],\n",
68
+ " 'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],\n",
69
+ " 'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],\n",
70
+ " # Female voices\n",
71
+ " 'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],\n",
72
+ " 'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],\n",
73
+ " 'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],\n",
74
+ " 'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],\n",
75
+ " }"
76
+ ],
77
+ "metadata": {
78
+ "id": "SSleVnRAiEE2"
79
+ },
80
+ "execution_count": null,
81
+ "outputs": []
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "source": [
86
+ "# This is the text that will be spoken.\n",
87
+ "text = \"And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear.\"\n",
88
+ "# This is the voice that will speak it.\n",
89
+ "voice = 'atkins'\n",
90
+ "# This is the number of samples we will generate from the DALLE-style model. More will produce better results, but will take longer to produce.\n",
91
+ "# I don't recommend going below 128. Samples are generated in batches of 16, so use a multiple of 16.\n",
92
+ "num_autoregressive_samples = 128"
93
+ ],
94
+ "metadata": {
95
+ "id": "bt_aoxONjfL2"
96
+ },
97
+ "execution_count": null,
98
+ "outputs": []
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "source": [
103
+ "# Prepare data.\n",
104
+ "tokenizer = VoiceBpeTokenizer()\n",
105
+ "text = torch.IntTensor(tokenizer.encode(text)).unsqueeze(0).cuda()\n",
106
+ "text = F.pad(text, (0,1)) # This may not be necessary.\n",
107
+ "cond_paths = preselected_cond_voices[voice]\n",
108
+ "conds = []\n",
109
+ "for cond_path in cond_paths:\n",
110
+ " c, cond_wav = load_conditioning(cond_path)\n",
111
+ " conds.append(c)\n",
112
+ "conds = torch.stack(conds, dim=1) # Note: cond_wav from the last loop iteration is kept and reused later by the diffusion model."
113
+ ],
114
+ "metadata": {
115
+ "id": "KEXOKjIvn6NW"
116
+ },
117
+ "execution_count": null,
118
+ "outputs": []
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "source": [
123
+ "# Load the autoregressive model.\n",
124
+ "autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,\n",
125
+ " heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()\n",
126
+ "autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))\n",
127
+ "stop_mel_token = autoregressive.stop_mel_token"
128
+ ],
129
+ "metadata": {
130
+ "id": "Z15xFT_uhP8v"
131
+ },
132
+ "execution_count": null,
133
+ "outputs": []
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "source": [
138
+ "# Perform inference with the autoregressive model, generating num_autoregressive_samples candidates in batches of 16 (rounded down to a multiple of 16).\n",
139
+ "with torch.no_grad():\n",
140
+ " samples = []\n",
141
+ " for b in tqdm(range(num_autoregressive_samples // 16)):\n",
142
+ " codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,\n",
143
+ " temperature=.9, num_return_sequences=16, length_penalty=1)\n",
144
+ " padding_needed = 250 - codes.shape[1]\n",
145
+ " codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)\n",
146
+ " samples.append(codes)\n",
147
+ "\n",
148
+ "# Delete model weights to conserve memory.\n",
149
+ "del autoregressive"
150
+ ],
151
+ "metadata": {
152
+ "id": "xajqWiEik-j0"
153
+ },
154
+ "execution_count": null,
155
+ "outputs": []
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "source": [
160
+ "# Load the CLIP model.\n",
161
+ "clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,\n",
162
+ " num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()\n",
163
+ "clip.load_state_dict(torch.load('.models/clip.pth'))"
164
+ ],
165
+ "metadata": {
166
+ "id": "KNgYSyuyliMs"
167
+ },
168
+ "execution_count": null,
169
+ "outputs": []
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "source": [
174
+ "# Use the CLIP model to select the best autoregressive output to match the given text.\n",
175
+ "clip_results = []\n",
176
+ "with torch.no_grad():\n",
177
+ " for batch in samples:\n",
178
+ " for i in range(batch.shape[0]):\n",
179
+ " batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)\n",
180
+ " text = text[:, :120] # Ugly hack to fix the fact that I didn't train CLIP to handle long enough text.\n",
181
+ " clip_results.append(clip(text.repeat(batch.shape[0], 1),\n",
182
+ " torch.full((batch.shape[0],), fill_value=text.shape[1]-1, dtype=torch.long, device='cuda'),\n",
183
+ " batch, torch.full((batch.shape[0],), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda'),\n",
184
+ " return_loss=False))\n",
185
+ " clip_results = torch.cat(clip_results, dim=0)\n",
186
+ " samples = torch.cat(samples, dim=0)\n",
187
+ " best_results = samples[torch.topk(clip_results, k=1).indices]\n",
188
+ "\n",
189
+ "# Save samples to CPU memory, delete clip to conserve memory.\n",
190
+ "samples = samples.cpu()\n",
191
+ "del clip"
192
+ ],
193
+ "metadata": {
194
+ "id": "DDXkM0lclp4U"
195
+ },
196
+ "execution_count": null,
197
+ "outputs": []
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "source": [
202
+ "# Load the DVAE and diffusion model.\n",
203
+ "dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,\n",
204
+ " record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()\n",
205
+ "dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)\n",
206
+ "diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],\n",
207
+ " spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,\n",
208
+ " conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()\n",
209
+ "diffusion.load_state_dict(torch.load('.models/diffusion.pth'))\n",
210
+ "diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)"
211
+ ],
212
+ "metadata": {
213
+ "id": "97acSnBal8Q2"
214
+ },
215
+ "execution_count": null,
216
+ "outputs": []
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "source": [
221
+ "# Decode the (best) discrete sequence created by the autoregressive model.\n",
222
+ "with torch.no_grad():\n",
223
+ " for b in range(best_results.shape[0]):\n",
224
+ " code = best_results[b].unsqueeze(0)\n",
225
+ " wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav, spectrogram_compression_factor=256, mean=True)\n",
226
+ " torchaudio.save(f'{voice}_{b}.wav', wav.squeeze(0).cpu(), 22050)"
227
+ ],
228
+ "metadata": {
229
+ "id": "HEDABTrdl_kM"
230
+ },
231
+ "execution_count": null,
232
+ "outputs": []
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "source": [
237
+ "# Listen to your text! (told you that'd take a long time..)\n",
238
+ "from IPython.display import Audio\n",
239
+ "Audio(data=wav.squeeze(0).cpu().numpy(), rate=22050)"
240
+ ],
241
+ "metadata": {
242
+ "id": "EyHmcdqBmSvf"
243
+ },
244
+ "execution_count": null,
245
+ "outputs": []
246
+ }
247
+ ]
248
+ }