jbetker committed
Commit f96d866 • 1 Parent(s): f12b51f

fix notebook

Files changed (1)
  1. tortoise_tts.ipynb +11 -255
tortoise_tts.ipynb CHANGED
@@ -34,88 +34,9 @@
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
- "id": "JrK20I32grP6",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "outputId": "44f55dca-5d0a-405e-a4cc-54bc8e16b780"
+ "id": "JrK20I32grP6"
  },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Cloning into 'tortoise-tts'...\n",
- "remote: Enumerating objects: 736, done.\u001b[K\n",
- "remote: Counting objects: 100% (23/23), done.\u001b[K\n",
- "remote: Compressing objects: 100% (15/15), done.\u001b[K\n",
- "remote: Total 736 (delta 10), reused 20 (delta 8), pack-reused 713\u001b[K\n",
- "Receiving objects: 100% (736/736), 348.62 MiB | 24.08 MiB/s, done.\n",
- "Resolving deltas: 100% (161/161), done.\n",
- "/content/tortoise-tts\n",
- "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 1)) (1.10.0+cu111)\n",
- "Requirement already satisfied: torchaudio in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 2)) (0.10.0+cu111)\n",
- "Collecting rotary_embedding_torch\n",
- " Downloading rotary_embedding_torch-0.1.5-py3-none-any.whl (4.1 kB)\n",
- "Collecting transformers\n",
- " Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)\n",
- "\u001b[K |████████████████████████████████| 4.0 MB 5.3 MB/s \n",
- "\u001b[?25hCollecting tokenizers\n",
- " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n",
- "\u001b[K |████████████████████████████████| 6.6 MB 31.3 MB/s \n",
- "\u001b[?25hRequirement already satisfied: inflect in /usr/local/lib/python3.7/dist-packages (from -r requirements.txt (line 6)) (2.1.0)\n",
- "Collecting progressbar\n",
- " Downloading progressbar-2.5.tar.gz (10 kB)\n",
- "Collecting einops\n",
- " Downloading einops-0.4.1-py3-none-any.whl (28 kB)\n",
- "Collecting unidecode\n",
- " Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)\n",
- "\u001b[K |████████████████████████████████| 235 kB 44.3 MB/s \n",
- "\u001b[?25hCollecting entmax\n",
- " Downloading entmax-1.0.tar.gz (7.2 kB)\n",
- "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->-r requirements.txt (line 1)) (4.1.1)\n",
- "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (4.64.0)\n",
- "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (21.3)\n",
- "Collecting sacremoses\n",
- " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n",
- "\u001b[K |████████████████████████████████| 895 kB 36.6 MB/s \n",
- "\u001b[?25hCollecting huggingface-hub<1.0,>=0.1.0\n",
- " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n",
- "\u001b[K |████████████████████████████████| 77 kB 6.3 MB/s \n",
- "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (3.6.0)\n",
- "Collecting pyyaml>=5.1\n",
- " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
- "\u001b[K |████████████████████████████████| 596 kB 38.9 MB/s \n",
- "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (1.21.6)\n",
- "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (2.23.0)\n",
- "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (2019.12.20)\n",
- "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers->-r requirements.txt (line 4)) (4.11.3)\n",
- "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers->-r requirements.txt (line 4)) (3.0.8)\n",
- "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers->-r requirements.txt (line 4)) (3.8.0)\n",
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (1.24.3)\n",
- "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (3.0.4)\n",
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (2.10)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->-r requirements.txt (line 4)) (2021.10.8)\n",
- "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (1.15.0)\n",
- "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (1.1.0)\n",
- "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->-r requirements.txt (line 4)) (7.1.2)\n",
- "Building wheels for collected packages: progressbar, entmax\n",
- " Building wheel for progressbar (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for progressbar: filename=progressbar-2.5-py3-none-any.whl size=12082 sha256=bb7d90605d0bf4d89aedc46bd8ed39538f55e00ee70fa382c1af81f142f08fa8\n",
- " Stored in directory: /root/.cache/pip/wheels/f0/fd/1f/3e35ed57e94cd8ced38dd46771f1f0f94f65fec548659ed855\n",
- " Building wheel for entmax (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for entmax: filename=entmax-1.0-py3-none-any.whl size=11015 sha256=5e2cf723e790ec941984d2030eb3231e1ae3ce75231709391a13edcd2bfb4770\n",
- " Stored in directory: /root/.cache/pip/wheels/f7/e8/0d/acc29c2f66e69a1f42483347fa8545c293dec12325ee161716\n",
- "Successfully built progressbar entmax\n",
- "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, einops, unidecode, transformers, rotary-embedding-torch, progressbar, entmax\n",
- " Attempting uninstall: pyyaml\n",
- " Found existing installation: PyYAML 3.13\n",
- " Uninstalling PyYAML-3.13:\n",
- " Successfully uninstalled PyYAML-3.13\n",
- "Successfully installed einops-0.4.1 entmax-1.0 huggingface-hub-0.5.1 progressbar-2.5 pyyaml-6.0 rotary-embedding-torch-0.1.5 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.18.0 unidecode-1.3.4\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "!git clone https://github.com/neonbjb/tortoise-tts.git\n",
  "%cd tortoise-tts\n",
@@ -138,97 +59,10 @@
  "tts = TextToSpeech()"
  ],
  "metadata": {
- "id": "Gen09NM4hONQ",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "outputId": "35c1fb4b-5998-4e75-9ec9-29521b301db6"
+ "id": "Gen09NM4hONQ"
  },
  "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Downloading autoregressive.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth...\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Done.\n",
- "Downloading clvp.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth...\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Done.\n",
- "Downloading cvvp.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/cvvp.pth...\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Done.\n",
- "Downloading diffusion_decoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/diffusion_decoder.pth...\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Done.\n",
- "Downloading vocoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/vocoder.pth...\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Done.\n",
- "Removing weight norm...\n"
- ]
- }
- ]
+ "outputs": []
  },
  {
  "cell_type": "code",
@@ -239,28 +73,10 @@
  "%ls voices"
  ],
  "metadata": {
- "id": "SSleVnRAiEE2",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "outputId": "e1eb09e2-1b68-4f81-b679-edb97538da39"
+ "id": "SSleVnRAiEE2"
  },
  "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "\u001b[0m\u001b[01;34mangelina_jolie\u001b[0m/ \u001b[01;34mhalle_barry\u001b[0m/ \u001b[01;34mlj\u001b[0m/ \u001b[01;34msamuel_jackson\u001b[0m/\n",
- "\u001b[01;34matkins\u001b[0m/ \u001b[01;34mharris\u001b[0m/ \u001b[01;34mmol\u001b[0m/ \u001b[01;34msigourney_weaver\u001b[0m/\n",
- "\u001b[01;34mcarlin\u001b[0m/ \u001b[01;34mhenry_cavill\u001b[0m/ \u001b[01;34mmorgan_freeman\u001b[0m/ \u001b[01;34mtom_hanks\u001b[0m/\n",
- "\u001b[01;34mdaniel_craig\u001b[0m/ \u001b[01;34mjennifer_lawrence\u001b[0m/ \u001b[01;34mmyself\u001b[0m/ \u001b[01;34mwilliam_shatner\u001b[0m/\n",
- "\u001b[01;34mdotrice\u001b[0m/ \u001b[01;34mjohn_krasinski\u001b[0m/ \u001b[01;34motto\u001b[0m/\n",
- "\u001b[01;34memma_stone\u001b[0m/ \u001b[01;34mkennard\u001b[0m/ \u001b[01;34mpatrick_stewart\u001b[0m/\n",
- "\u001b[01;34mgrace\u001b[0m/ \u001b[01;34mlescault\u001b[0m/ \u001b[01;34mrobert_deniro\u001b[0m/\n"
- ]
- }
- ]
+ "outputs": []
  },
  {
  "cell_type": "code",
@@ -302,40 +118,10 @@
  "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)"
  ],
  "metadata": {
- "id": "KEXOKjIvn6NW",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "outputId": "7977bfd7-9fbc-41f7-d3ac-25fd4e350049"
+ "id": "KEXOKjIvn6NW"
  },
  "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 6/6 [01:18<00:00, 13.11s/it]\n",
- "/usr/local/lib/python3.7/dist-packages/torch/utils/checkpoint.py:25: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
- " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
- "/content/tortoise-tts/models/autoregressive.py:359: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
- " mel_lengths = wav_lengths // self.mel_length_compression\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Performing vocoding..\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 32/32 [00:16<00:00, 1.94it/s]\n"
- ]
- }
- ]
+ "outputs": []
  },
  {
  "cell_type": "code",
@@ -346,7 +132,7 @@
  "#\n",
  "# Lets see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n",
  "conds = []\n",
- "for v in ['patrick_stewart', 'william_shatner']:\n",
+ "for v in ['pat', 'william']:\n",
  " cond_paths = voices[v]\n",
  " for cond_path in cond_paths:\n",
  " c = load_audio(cond_path, 22050)\n",
@@ -356,40 +142,10 @@
  "torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)"
  ],
  "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "fYTk8KUezUr5",
- "outputId": "8a07f251-c90f-4e6a-c204-132b737dfff8"
+ "id": "fYTk8KUezUr5"
  },
  "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 6/6 [01:45<00:00, 17.62s/it]\n",
- "/usr/local/lib/python3.7/dist-packages/torch/utils/checkpoint.py:25: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
- " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
- "/content/tortoise-tts/models/autoregressive.py:359: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
- " mel_lengths = wav_lengths // self.mel_length_compression\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Performing vocoding..\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 32/32 [00:16<00:00, 2.00it/s]\n"
- ]
- }
- ]
+ "outputs": []
  }
  ]
 }
 