LearnItAnyway committed on
Commit
bd25455
·
1 Parent(s): 0980104

Upload 3 files

Browse files
Files changed (3) hide show
  1. tester.ipynb +510 -0
  2. unique_text_tokens.k2symbols +432 -0
  3. vall-e_ko_v0.pt +3 -0
tester.ipynb ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "cb5d0890-3f2d-4020-8270-f3a9bb9f63c6",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%%bash # install the vall-e and required libraries\n",
11
+ "# PyTorch\n",
12
+ "pip install torch==1.13.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116\n",
13
+ "pip install torchmetrics==0.11.1\n",
14
+ "# fbank\n",
15
+ "pip install librosa==0.8.1\n",
16
+ "\n",
17
+ "# phonemizer pypinyin\n",
18
+ "apt-get install espeak-ng\n",
19
+ "## OSX: brew install espeak\n",
20
+ "pip install phonemizer==3.2.1 pypinyin==0.48.0\n",
21
+ "\n",
22
+ "# lhotse update to newest version\n",
23
+ "# https://github.com/lhotse-speech/lhotse/pull/956\n",
24
+ "# https://github.com/lhotse-speech/lhotse/pull/960\n",
25
+ "pip uninstall lhotse\n",
26
+ "pip install lhotse\n",
27
+ "\n",
28
+ "# k2\n",
29
+ "# find the right version in https://huggingface.co/csukuangfj/k2\n",
30
+ "pip install https://huggingface.co/csukuangfj/k2/resolve/main/cuda/k2-1.23.4.dev20230224+cuda11.6.torch1.13.1-cp310-cp310-linux_x86_64.whl\n",
31
+ "\n",
32
+ "# icefall\n",
33
+ "git clone https://github.com/k2-fsa/icefall\n",
34
+ "cd icefall\n",
35
+ "pip install -r requirements.txt\n",
36
+ "export PYTHONPATH=`pwd`/../icefall:$PYTHONPATH\n",
37
+ "echo \"export PYTHONPATH=`pwd`/../icefall:\\$PYTHONPATH\" >> ~/.zshrc\n",
38
+ "echo \"export PYTHONPATH=`pwd`/../icefall:\\$PYTHONPATH\" >> ~/.bashrc\n",
39
+ "cd -\n",
40
+ "source ~/.zshrc\n",
41
+ "\n",
42
+ "# valle\n",
43
+ "git clone https://github.com/lifeiteng/valle.git\n",
44
+ "cd valle\n",
45
+ "pip install -e ."
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 1,
51
+ "id": "1b8a4af2-5851-4c41-96bb-bda4b259f857",
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stderr",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "/home/dongsun/.local/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/dongsun/.local/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
59
+ " warn(\n"
60
+ ]
61
+ },
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "[2023-09-21 14:36:33,978] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
67
+ "Use 8 cpu cores for computing\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "import argparse\n",
73
+ "import logging\n",
74
+ "import os\n",
75
+ "import pathlib\n",
76
+ "import time\n",
77
+ "import tempfile\n",
78
+ "import platform\n",
79
+ "import webbrowser\n",
80
+ "import sys\n",
81
+ "import torch, torchaudio\n",
82
+ "import random\n",
83
+ "\n",
84
+ "import numpy as np\n",
85
+ "\n",
86
+ "from valle.data import (\n",
87
+ " AudioTokenizer,\n",
88
+ " TextTokenizer,\n",
89
+ " tokenize_audio,\n",
90
+ " tokenize_text,\n",
91
+ ")\n",
92
+ "from icefall.utils import AttributeDict\n",
93
+ "from valle.data.collation import get_text_token_collater\n",
94
+ "from valle.models import get_model\n",
95
+ "\n",
96
+ "from vocos import Vocos\n",
97
+ "from encodec.utils import convert_audio\n",
98
+ "import multiprocessing\n",
99
+ "\n",
100
+ "thread_count = multiprocessing.cpu_count()\n",
101
+ "\n",
102
+ "print(\"Use\",thread_count,\"cpu cores for computing\")\n",
103
+ "\n",
104
+ "torch.set_num_threads(thread_count)\n",
105
+ "torch.set_num_interop_threads(thread_count)\n",
106
+ "torch._C._jit_set_profiling_executor(False)\n",
107
+ "torch._C._jit_set_profiling_mode(False)\n",
108
+ "torch._C._set_graph_executor_optimize(False)\n",
109
+ "\n",
110
+ "text_tokenizer = TextTokenizer(language='ko')\n",
111
+ "\n",
112
+ "device = torch.device(\"cpu\")\n",
113
+ "if torch.cuda.is_available():\n",
114
+ " device = torch.device(\"cuda\", 0)\n",
115
+ "\n",
116
+ "checkpoint = torch.load(\"./vall-e_ko_v0.pt\", map_location='cpu')\n",
117
+ "model = get_model(AttributeDict(checkpoint))\n",
118
+ "missing_keys, unexpected_keys = model.load_state_dict(\n",
119
+ " checkpoint[\"model\"], strict=True\n",
120
+ ")\n",
121
+ "assert not missing_keys\n",
122
+ "model.eval()\n",
123
+ "model.to(device)\n",
124
+ "text_collater = get_text_token_collater('./unique_text_tokens.k2symbols')\n",
125
+ "\n",
126
+ "# Encodec model\n",
127
+ "audio_tokenizer = AudioTokenizer(device)\n",
128
+ "\n",
129
+ "# Vocos decoder\n",
130
+ "vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)\n",
131
+ "\n",
132
+ "model.to(device)\n",
133
+ "@torch.no_grad()\n",
134
+ "def infer_from_prompt(text_prompt, audio_prompt, text):\n",
135
+ " ## text to token\n",
136
+ " text_tokens, text_tokens_lens = text_collater(\n",
137
+ " [\n",
138
+ " tokenize_text(\n",
139
+ " text_tokenizer, text=f\"{text_prompt} {text}\".strip()\n",
140
+ " )\n",
141
+ " ]\n",
142
+ " )\n",
143
+ " _, enroll_x_lens = text_collater(\n",
144
+ " [\n",
145
+ " tokenize_text(\n",
146
+ " text_tokenizer, text=f\"{text_prompt}\".strip()\n",
147
+ " )\n",
148
+ " ]\n",
149
+ " )\n",
150
+ " print('text_loaded')\n",
151
+ "\n",
152
+ " # load the audio prompt and encode it into codec tokens\n",
153
+ " wav_pr, sr = torchaudio.load(audio_prompt)\n",
154
+ " wav_pr = convert_audio(wav_pr, sr, audio_tokenizer.sample_rate, audio_tokenizer.channels)\n",
155
+ " audio_prompts = audio_tokenizer.encode(wav_pr.unsqueeze(0))[0][0].transpose(2, 1).to(device)\n",
156
+ " print('Audio encoded')\n",
157
+ "\n",
158
+ " encoded_frames = model.inference(\n",
159
+ " text_tokens.to(device), text_tokens_lens.to(device),\n",
160
+ " audio_prompts, enroll_x_lens=enroll_x_lens,\n",
161
+ " top_k=-100, temperature=1)\n",
162
+ " vocos_features = vocos.codes_to_features(encoded_frames.permute(2, 0, 1))\n",
163
+ " samples = vocos.decode(vocos_features, bandwidth_id=torch.tensor([2], device=device))\n",
164
+ " message = f\"sythesized text: {text}\"\n",
165
+ " return message, (24000, samples.squeeze(0).cpu().numpy())\n"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "markdown",
170
+ "id": "fa6e2e1d-7522-43f0-985c-e731047acd9c",
171
+ "metadata": {},
172
+ "source": [
173
+ "# Example"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 2,
179
+ "id": "41e40fe5-595e-4f9a-8dd7-dfda52944529",
180
+ "metadata": {},
181
+ "outputs": [
182
+ {
183
+ "data": {
184
+ "text/html": [
185
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
186
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">&lt;module&gt;</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
187
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
188
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1 </span>text_prompt = <span style=\"color: #808000; text-decoration-color: #808000\">''</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># text of the audio </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
189
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2 </span>audio_prompt = <span style=\"color: #808000; text-decoration-color: #808000\">''</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># path to the audio file</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
190
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3 </span>text = <span style=\"color: #808000; text-decoration-color: #808000\">''</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
191
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>4 message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
192
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">5 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
193
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
194
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/home/dongsun/.local/lib/python3.10/site-packages/torch/autograd/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">grad_mode.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">27</span> in <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
195
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decorate_context</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
196
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
197
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 24 │ │ </span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">@functools</span>.wraps(func) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
198
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 25 │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">decorate_context</span>(*args, **kwargs): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
199
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 26 │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">with</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.clone(): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
200
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 27 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> func(*args, **kwargs) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
201
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 28 │ │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> cast(F, decorate_context) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
202
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 29 │ </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
203
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 30 │ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">_wrap_generator</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, func): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
204
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
205
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">infer_from_prompt</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
206
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
207
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">64 │ ## text to token</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
208
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">65 │ </span>text_tokens, text_tokens_lens = text_collater( <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
209
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66 │ │ </span>[ <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
210
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>67 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ │ │ </span>tokenize_text( <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
211
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">68 │ │ │ │ </span>text_tokenizer, text=<span style=\"color: #808000; text-decoration-color: #808000\">f\"{</span>text_prompt<span style=\"color: #808000; text-decoration-color: #808000\">} {</span>text<span style=\"color: #808000; text-decoration-color: #808000\">}\"</span>.strip() <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
212
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">69 │ │ │ </span>) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
213
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">70 │ │ </span>] <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
214
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
215
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/home/dongsun/vall-e/valle/data/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">tokenizer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">178</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">tokenize_text</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
216
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
217
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">175 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
218
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">176 </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">tokenize_text</span>(tokenizer: TextTokenizer, text: <span style=\"color: #00ffff; text-decoration-color: #00ffff\">str</span>) -&gt; List[<span style=\"color: #00ffff; text-decoration-color: #00ffff\">str</span>]: <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
219
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">177 │ </span>phonemes = tokenizer([text.strip()]) <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
220
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>178 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│ </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> phonemes[<span style=\"color: #0000ff; text-decoration-color: #0000ff\">0</span>] <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># k2symbols</span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
221
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">179 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
222
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">180 </span> <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
223
+ "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">181 </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">remove_encodec_weight_norm</span>(model): <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
224
+ "<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
225
+ "<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">IndexError: </span>list index out of range\n",
226
+ "</pre>\n"
227
+ ],
228
+ "text/plain": [
229
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
230
+ "\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m \u001b[31m│\u001b[0m\n",
231
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
232
+ "\u001b[31m│\u001b[0m \u001b[2m1 \u001b[0mtext_prompt = \u001b[33m'\u001b[0m\u001b[33m'\u001b[0m \u001b[2m# text of the audio \u001b[0m \u001b[31m│\u001b[0m\n",
233
+ "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0maudio_prompt = \u001b[33m'\u001b[0m\u001b[33m'\u001b[0m \u001b[2m# path to the audio file\u001b[0m \u001b[31m│\u001b[0m\n",
234
+ "\u001b[31m│\u001b[0m \u001b[2m3 \u001b[0mtext = \u001b[33m'\u001b[0m\u001b[33m'\u001b[0m \u001b[2m# \u001b[0m \u001b[31m│\u001b[0m\n",
235
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m4 message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text) \u001b[31m│\u001b[0m\n",
236
+ "\u001b[31m│\u001b[0m \u001b[2m5 \u001b[0m \u001b[31m│\u001b[0m\n",
237
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
238
+ "\u001b[31m│\u001b[0m \u001b[2;33m/home/dongsun/.local/lib/python3.10/site-packages/torch/autograd/\u001b[0m\u001b[1;33mgrad_mode.py\u001b[0m:\u001b[94m27\u001b[0m in \u001b[31m│\u001b[0m\n",
239
+ "\u001b[31m│\u001b[0m \u001b[92mdecorate_context\u001b[0m \u001b[31m│\u001b[0m\n",
240
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
241
+ "\u001b[31m│\u001b[0m \u001b[2m 24 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[1;95m@functools\u001b[0m.wraps(func) \u001b[31m│\u001b[0m\n",
242
+ "\u001b[31m│\u001b[0m \u001b[2m 25 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecorate_context\u001b[0m(*args, **kwargs): \u001b[31m│\u001b[0m\n",
243
+ "\u001b[31m│\u001b[0m \u001b[2m 26 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mself\u001b[0m.clone(): \u001b[31m│\u001b[0m\n",
244
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 27 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[31m│\u001b[0m\n",
245
+ "\u001b[31m│\u001b[0m \u001b[2m 28 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m cast(F, decorate_context) \u001b[31m│\u001b[0m\n",
246
+ "\u001b[31m│\u001b[0m \u001b[2m 29 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
247
+ "\u001b[31m│\u001b[0m \u001b[2m 30 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_wrap_generator\u001b[0m(\u001b[96mself\u001b[0m, func): \u001b[31m│\u001b[0m\n",
248
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
249
+ "\u001b[31m│\u001b[0m in \u001b[92minfer_from_prompt\u001b[0m \u001b[31m│\u001b[0m\n",
250
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
251
+ "\u001b[31m│\u001b[0m \u001b[2m64 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m## text to token\u001b[0m \u001b[31m│\u001b[0m\n",
252
+ "\u001b[31m│\u001b[0m \u001b[2m65 \u001b[0m\u001b[2m│ \u001b[0mtext_tokens, text_tokens_lens = text_collater( \u001b[31m│\u001b[0m\n",
253
+ "\u001b[31m│\u001b[0m \u001b[2m66 \u001b[0m\u001b[2m│ │ \u001b[0m[ \u001b[31m│\u001b[0m\n",
254
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m67 \u001b[2m│ │ │ \u001b[0mtokenize_text( \u001b[31m│\u001b[0m\n",
255
+ "\u001b[31m│\u001b[0m \u001b[2m68 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mtext_tokenizer, text=\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m{\u001b[0mtext_prompt\u001b[33m}\u001b[0m\u001b[33m \u001b[0m\u001b[33m{\u001b[0mtext\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m.strip() \u001b[31m│\u001b[0m\n",
256
+ "\u001b[31m│\u001b[0m \u001b[2m69 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[31m│\u001b[0m\n",
257
+ "\u001b[31m│\u001b[0m \u001b[2m70 \u001b[0m\u001b[2m│ │ \u001b[0m] \u001b[31m│\u001b[0m\n",
258
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
259
+ "\u001b[31m│\u001b[0m \u001b[2;33m/home/dongsun/vall-e/valle/data/\u001b[0m\u001b[1;33mtokenizer.py\u001b[0m:\u001b[94m178\u001b[0m in \u001b[92mtokenize_text\u001b[0m \u001b[31m│\u001b[0m\n",
260
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
261
+ "\u001b[31m│\u001b[0m \u001b[2m175 \u001b[0m \u001b[31m│\u001b[0m\n",
262
+ "\u001b[31m│\u001b[0m \u001b[2m176 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mtokenize_text\u001b[0m(tokenizer: TextTokenizer, text: \u001b[96mstr\u001b[0m) -> List[\u001b[96mstr\u001b[0m]: \u001b[31m│\u001b[0m\n",
263
+ "\u001b[31m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ \u001b[0mphonemes = tokenizer([text.strip()]) \u001b[31m│\u001b[0m\n",
264
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m178 \u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m phonemes[\u001b[94m0\u001b[0m] \u001b[2m# k2symbols\u001b[0m \u001b[31m│\u001b[0m\n",
265
+ "\u001b[31m│\u001b[0m \u001b[2m179 \u001b[0m \u001b[31m│\u001b[0m\n",
266
+ "\u001b[31m│\u001b[0m \u001b[2m180 \u001b[0m \u001b[31m│\u001b[0m\n",
267
+ "\u001b[31m│\u001b[0m \u001b[2m181 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mremove_encodec_weight_norm\u001b[0m(model): \u001b[31m│\u001b[0m\n",
268
+ "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
269
+ "\u001b[1;91mIndexError: \u001b[0mlist index out of range\n"
270
+ ]
271
+ },
272
+ "metadata": {},
273
+ "output_type": "display_data"
274
+ }
275
+ ],
276
+ "source": [
277
+ "text_prompt = '' # transcript of the audio prompt\n",
278
+ "audio_prompt = '' # path to the audio file\n",
279
+ "text = '' # text to synthesize\n",
280
+ "message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text)"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": null,
286
+ "id": "1f97f088-74a4-4cbb-a18b-d884adf81546",
287
+ "metadata": {},
288
+ "outputs": [],
289
+ "source": [
290
+ "print(message)\n",
291
+ "from IPython.display import Audio\n",
292
+ "Audio(data, rate=sr)"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "markdown",
297
+ "id": "1cedb3cc-7486-4a3d-9dcd-1facffdb78ad",
298
+ "metadata": {},
299
+ "source": [
300
+ "# Simple Gradio App"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 3,
306
+ "id": "723c13c7-36f5-4af6-bc0b-bbf6d65c2e3a",
307
+ "metadata": {
308
+ "collapsed": true,
309
+ "jupyter": {
310
+ "outputs_hidden": true
311
+ },
312
+ "tags": []
313
+ },
314
+ "outputs": [
315
+ {
316
+ "name": "stdout",
317
+ "output_type": "stream",
318
+ "text": [
319
+ "Defaulting to user installation because normal site-packages is not writeable\n",
320
+ "\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
321
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
322
+ "\u001b[0mRequirement already satisfied: gradio in /home/dongsun/.local/lib/python3.10/site-packages (3.32.0)\n",
323
+ "Requirement already satisfied: markdown-it-py[linkify]>=2.0.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.1.0)\n",
324
+ "Requirement already satisfied: semantic-version in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.10.0)\n",
325
+ "Requirement already satisfied: pandas in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.0.3)\n",
326
+ "Requirement already satisfied: uvicorn>=0.14.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.19.0)\n",
327
+ "Requirement already satisfied: mdit-py-plugins<=0.3.3 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.3.1)\n",
328
+ "Requirement already satisfied: httpx in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.23.0)\n",
329
+ "Requirement already satisfied: orjson in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (3.8.0)\n",
330
+ "Requirement already satisfied: ffmpy in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.3.0)\n",
331
+ "Requirement already satisfied: pygments>=2.12.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.14.0)\n",
332
+ "Requirement already satisfied: pillow in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (9.5.0)\n",
333
+ "Requirement already satisfied: numpy in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (1.23.0)\n",
334
+ "Requirement already satisfied: python-multipart in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.0.5)\n",
335
+ "Requirement already satisfied: markupsafe in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.1.0)\n",
336
+ "Requirement already satisfied: pydantic in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (1.8.2)\n",
337
+ "Requirement already satisfied: aiohttp in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (3.8.1)\n",
338
+ "Requirement already satisfied: websockets>=10.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (10.3)\n",
339
+ "Requirement already satisfied: typing-extensions in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (4.5.0)\n",
340
+ "Requirement already satisfied: gradio-client>=0.2.4 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.2.5)\n",
341
+ "Requirement already satisfied: matplotlib in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (3.7.0)\n",
342
+ "Requirement already satisfied: huggingface-hub>=0.13.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.15.1)\n",
343
+ "Requirement already satisfied: pydub in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.25.1)\n",
344
+ "Requirement already satisfied: fastapi in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (0.94.0)\n",
345
+ "Requirement already satisfied: requests in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (2.31.0)\n",
346
+ "Requirement already satisfied: jinja2 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (3.0.3)\n",
347
+ "Requirement already satisfied: pyyaml in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (6.0)\n",
348
+ "Requirement already satisfied: aiofiles in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (23.1.0)\n",
349
+ "Requirement already satisfied: altair>=4.2.0 in /home/dongsun/.local/lib/python3.10/site-packages (from gradio) (4.2.2)\n",
350
+ "Requirement already satisfied: jsonschema>=3.0 in /home/dongsun/.local/lib/python3.10/site-packages (from altair>=4.2.0->gradio) (4.4.0)\n",
351
+ "Requirement already satisfied: entrypoints in /home/dongsun/.local/lib/python3.10/site-packages (from altair>=4.2.0->gradio) (0.4)\n",
352
+ "Requirement already satisfied: toolz in /home/dongsun/.local/lib/python3.10/site-packages (from altair>=4.2.0->gradio) (0.11.2)\n",
353
+ "Requirement already satisfied: fsspec in /home/dongsun/.local/lib/python3.10/site-packages (from gradio-client>=0.2.4->gradio) (2022.3.0)\n",
354
+ "Requirement already satisfied: packaging in /home/dongsun/.local/lib/python3.10/site-packages (from gradio-client>=0.2.4->gradio) (23.1)\n",
355
+ "Requirement already satisfied: filelock in /home/dongsun/.local/lib/python3.10/site-packages (from huggingface-hub>=0.13.0->gradio) (3.11.0)\n",
356
+ "Requirement already satisfied: tqdm>=4.42.1 in /home/dongsun/.local/lib/python3.10/site-packages (from huggingface-hub>=0.13.0->gradio) (4.65.0)\n",
357
+ "Requirement already satisfied: mdurl~=0.1 in /home/dongsun/.local/lib/python3.10/site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (0.1.2)\n",
358
+ "Requirement already satisfied: linkify-it-py~=1.0 in /home/dongsun/.local/lib/python3.10/site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (1.0.3)\n",
359
+ "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.10/site-packages (from pandas->gradio) (2022.7)\n",
360
+ "Requirement already satisfied: tzdata>=2022.1 in /home/dongsun/.local/lib/python3.10/site-packages (from pandas->gradio) (2022.7)\n",
361
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/lib/python3.10/site-packages (from pandas->gradio) (2.8.2)\n",
362
+ "Requirement already satisfied: h11>=0.8 in /home/dongsun/.local/lib/python3.10/site-packages (from uvicorn>=0.14.0->gradio) (0.12.0)\n",
363
+ "Requirement already satisfied: click>=7.0 in /usr/lib/python3.10/site-packages (from uvicorn>=0.14.0->gradio) (8.1.3)\n",
364
+ "Requirement already satisfied: frozenlist>=1.1.1 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (1.3.0)\n",
365
+ "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (4.0.2)\n",
366
+ "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (2.1.1)\n",
367
+ "Requirement already satisfied: attrs>=17.3.0 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (21.4.0)\n",
368
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (1.7.2)\n",
369
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (6.0.2)\n",
370
+ "Requirement already satisfied: aiosignal>=1.1.2 in /home/dongsun/.local/lib/python3.10/site-packages (from aiohttp->gradio) (1.2.0)\n",
371
+ "Requirement already satisfied: starlette<0.27.0,>=0.26.0 in /home/dongsun/.local/lib/python3.10/site-packages (from fastapi->gradio) (0.26.1)\n",
372
+ "Requirement already satisfied: certifi in /home/dongsun/.local/lib/python3.10/site-packages (from httpx->gradio) (2022.12.7)\n",
373
+ "Requirement already satisfied: sniffio in /home/dongsun/.local/lib/python3.10/site-packages (from httpx->gradio) (1.2.0)\n",
374
+ "Requirement already satisfied: rfc3986[idna2008]<2,>=1.3 in /home/dongsun/.local/lib/python3.10/site-packages (from httpx->gradio) (1.5.0)\n",
375
+ "Requirement already satisfied: httpcore<0.16.0,>=0.15.0 in /home/dongsun/.local/lib/python3.10/site-packages (from httpx->gradio) (0.15.0)\n",
376
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /home/dongsun/.local/lib/python3.10/site-packages (from matplotlib->gradio) (1.3.2)\n",
377
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/dongsun/.local/lib/python3.10/site-packages (from matplotlib->gradio) (4.29.1)\n",
378
+ "Requirement already satisfied: contourpy>=1.0.1 in /home/dongsun/.local/lib/python3.10/site-packages (from matplotlib->gradio) (1.0.7)\n",
379
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/lib/python3.10/site-packages (from matplotlib->gradio) (3.0.9)\n",
380
+ "Requirement already satisfied: cycler>=0.10 in /home/dongsun/.local/lib/python3.10/site-packages (from matplotlib->gradio) (0.11.0)\n",
381
+ "Requirement already satisfied: six>=1.4.0 in /usr/lib/python3.10/site-packages (from python-multipart->gradio) (1.16.0)\n",
382
+ "Requirement already satisfied: idna<4,>=2.5 in /home/dongsun/.local/lib/python3.10/site-packages (from requests->gradio) (2.10)\n",
383
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dongsun/.local/lib/python3.10/site-packages (from requests->gradio) (1.26.15)\n",
384
+ "Requirement already satisfied: anyio==3.* in /home/dongsun/.local/lib/python3.10/site-packages (from httpcore<0.16.0,>=0.15.0->httpx->gradio) (3.5.0)\n",
385
+ "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /home/dongsun/.local/lib/python3.10/site-packages (from jsonschema>=3.0->altair>=4.2.0->gradio) (0.18.1)\n",
386
+ "Requirement already satisfied: uc-micro-py in /home/dongsun/.local/lib/python3.10/site-packages (from linkify-it-py~=1.0->markdown-it-py[linkify]>=2.0.0->gradio) (1.0.1)\n",
387
+ "\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
388
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
389
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
390
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -orch (/home/dongsun/.local/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
391
+ "\u001b[0m\n",
392
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
393
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n"
394
+ ]
395
+ }
396
+ ],
397
+ "source": [
398
+ "!pip install gradio"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": 4,
404
+ "id": "6b59cf2b-2826-40be-a27f-e6dbfe0cc1c0",
405
+ "metadata": {},
406
+ "outputs": [
407
+ {
408
+ "name": "stdout",
409
+ "output_type": "stream",
410
+ "text": [
411
+ "Running on local URL: http://127.0.0.1:7860\n",
412
+ "Running on public URL: https://b3512daf295a0b63b1.gradio.live\n",
413
+ "\n",
414
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
415
+ ]
416
+ },
417
+ {
418
+ "data": {
419
+ "text/html": [
420
+ "<div><iframe src=\"https://b3512daf295a0b63b1.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
421
+ ],
422
+ "text/plain": [
423
+ "<IPython.core.display.HTML object>"
424
+ ]
425
+ },
426
+ "metadata": {},
427
+ "output_type": "display_data"
428
+ },
429
+ {
430
+ "data": {
431
+ "text/plain": []
432
+ },
433
+ "execution_count": 4,
434
+ "metadata": {},
435
+ "output_type": "execute_result"
436
+ },
437
+ {
438
+ "name": "stdout",
439
+ "output_type": "stream",
440
+ "text": [
441
+ "text_loaded\n",
442
+ "Audio encoded\n",
443
+ "VALL-E EOS [356 -> 899]\n"
444
+ ]
445
+ },
446
+ {
447
+ "name": "stderr",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "/home/dongsun/.local/lib/python3.10/site-packages/gradio/processing_utils.py:171: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
451
+ " warnings.warn(warning.format(data.dtype))\n"
452
+ ]
453
+ }
454
+ ],
455
+ "source": [
456
+ "import gradio as gr\n",
457
+ "app = gr.Blocks(title=\"VALL-E Korean\")\n",
458
+ "with app:\n",
459
+ " #gr.Markdown(top_md)\n",
460
+ " with gr.Tab(\"VALL-E Korean Demo\"):\n",
461
+ " #gr.Markdown(infer_from_prompt_md)\n",
462
+ " with gr.Row():\n",
463
+ " with gr.Column():\n",
464
+ " text_prompt = gr.TextArea(label=\"Input Text\",\n",
465
+ " placeholder=\"Type text in the audio file (Korean)\",)\n",
466
+ " audio_prompt= gr.Audio(label=\"Input Audio\", source='upload', interactive=True, type=\"filepath\")\n",
467
+ " text_input = gr.TextArea(label=\"Output Text\",\n",
468
+ " placeholder=\"Type text you want to generate (Korean)\",)\n",
469
+ " with gr.Column():\n",
470
+ " text_output = gr.Textbox(label=\"Message\")\n",
471
+ " audio_output= gr.Audio(label=\"Output Audio\")\n",
472
+ " btn = gr.Button(\"Generate!\")\n",
473
+ " btn.click(infer_from_prompt,\n",
474
+ " inputs=[text_prompt, audio_prompt, text_input],\n",
475
+ " outputs=[text_output, audio_output])\n",
476
+ "webbrowser.open(\"http://127.0.0.1:7860\")\n",
477
+ "app.launch(share=True)"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": null,
483
+ "id": "fafc648b-2165-45a1-b422-38ced5f4d8fa",
484
+ "metadata": {},
485
+ "outputs": [],
486
+ "source": []
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "Python 3 (ipykernel)",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.10.9"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 5
510
+ }
unique_text_tokens.k2symbols ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <eps> 0
2
+ ! 1
3
+ " 2
4
+ ( 3
5
+ ) 4
6
+ , 5
7
+ . 6
8
+ : 7
9
+ ; 8
10
+ ? 9
11
+ _ 10
12
+ a 11
13
+ aɪ 12
14
+ aɪə 13
15
+ aʊ 14
16
+ b 15
17
+ d 16
18
+ di 17
19
+ ds 18
20
+ dɯ 19
21
+ dʌ 20
22
+ dʑ 21
23
+ dʒ 22
24
+ e 23
25
+ ed 24
26
+ ee 25
27
+ eh 26
28
+ ei 27
29
+ ej 28
30
+ ekh 29
31
+ em 30
32
+ en 31
33
+ eo 32
34
+ ep 33
35
+ eph 34
36
+ eq 35
37
+ es 36
38
+ et 37
39
+ etɕ 38
40
+ etʃ 39
41
+ eu 40
42
+ ew 41
43
+ eɐ 42
44
+ eə 43
45
+ eɛ 44
46
+ eɡ 45
47
+ eɪ 46
48
+ eɯ 47
49
+ eɾ 48
50
+ eʌ 49
51
+ f 50
52
+ h 51
53
+ hʲ 52
54
+ i 53
55
+ id 54
56
+ ih 55
57
+ ij 56
58
+ ikh 57
59
+ im 58
60
+ ip 59
61
+ iph 60
62
+ iq 61
63
+ is 62
64
+ it 63
65
+ itɕ 64
66
+ itʃ 65
67
+ iw 66
68
+ iə 67
69
+ iɡ 68
70
+ iɾ 69
71
+ iː 70
72
+ j 71
73
+ jd 72
74
+ je 73
75
+ jh 74
76
+ ji 75
77
+ jj 76
78
+ jkh 77
79
+ jo 78
80
+ jp 79
81
+ jph 80
82
+ jq 81
83
+ js 82
84
+ jt 83
85
+ jtɕ 84
86
+ jtʃ 85
87
+ ju 86
88
+ jw 87
89
+ jɐ 88
90
+ jɛ 89
91
+ jɡ 90
92
+ jɯ 91
93
+ jɾ 92
94
+ jʌ 93
95
+ k 94
96
+ kd 95
97
+ kh 96
98
+ ki 97
99
+ ko 98
100
+ ktɕ 99
101
+ kʌ 100
102
+ l 101
103
+ m 102
104
+ md 103
105
+ me 104
106
+ mh 105
107
+ mi 106
108
+ mj 107
109
+ mkh 108
110
+ mo 109
111
+ mp 110
112
+ mph 111
113
+ mq 112
114
+ ms 113
115
+ mt 114
116
+ mtɕ 115
117
+ mtʃ 116
118
+ mu 117
119
+ mw 118
120
+ mɐ 119
121
+ mɛ 120
122
+ mɡ 121
123
+ mɯ 122
124
+ mɾ 123
125
+ mʌ 124
126
+ n 125
127
+ nd 126
128
+ ne 127
129
+ nh 128
130
+ ni 129
131
+ nj 130
132
+ nkh 131
133
+ nm 132
134
+ no 133
135
+ np 134
136
+ nph 135
137
+ nq 136
138
+ ns 137
139
+ nt 138
140
+ ntɕ 139
141
+ ntʃ 140
142
+ nu 141
143
+ nw 142
144
+ nɐ 143
145
+ nɛ 144
146
+ nɡ 145
147
+ nɯ 146
148
+ nɾ 147
149
+ nʌ 148
150
+ o 149
151
+ od 150
152
+ oe 151
153
+ oh 152
154
+ oi 153
155
+ oj 154
156
+ okh 155
157
+ oo 156
158
+ op 157
159
+ oph 158
160
+ oq 159
161
+ os 160
162
+ ot 161
163
+ otɕ 162
164
+ otʃ 163
165
+ ou 164
166
+ ow 165
167
+ oɐ 166
168
+ oɛ 167
169
+ oɡ 168
170
+ oɯ 169
171
+ oɾ 170
172
+ oʌ 171
173
+ p 172
174
+ pd 173
175
+ pe 174
176
+ ph 175
177
+ pi 176
178
+ pj 177
179
+ pkh 178
180
+ po 179
181
+ pp 180
182
+ pph 181
183
+ pq 182
184
+ ps 183
185
+ pt 184
186
+ ptɕ 185
187
+ ptʃ 186
188
+ pu 187
189
+ pw 188
190
+ pɐ 189
191
+ pɛ 190
192
+ pɡ 191
193
+ pɯ 192
194
+ pɾ 193
195
+ pʌ 194
196
+ q 195
197
+ qd 196
198
+ qe 197
199
+ qh 198
200
+ qi 199
201
+ qj 200
202
+ qkh 201
203
+ qo 202
204
+ qp 203
205
+ qph 204
206
+ qq 205
207
+ qs 206
208
+ qt 207
209
+ qtɕ 208
210
+ qtʃ 209
211
+ qu 210
212
+ qw 211
213
+ qɐ 212
214
+ qɛ 213
215
+ qɡ 214
216
+ qɯ 215
217
+ qɾ 216
218
+ qʌ 217
219
+ r 218
220
+ s 219
221
+ t 220
222
+ td 221
223
+ te 222
224
+ th 223
225
+ ti 224
226
+ tj 225
227
+ tkh 226
228
+ to 227
229
+ tp 228
230
+ tph 229
231
+ tq 230
232
+ ts 231
233
+ tt 232
234
+ ttɕ 233
235
+ ttʃ 234
236
+ tu 235
237
+ tw 236
238
+ tɐ 237
239
+ tɕ 238
240
+ tɡ 239
241
+ tɯ 240
242
+ tɾ 241
243
+ tʃ 242
244
+ tʌ 243
245
+ u 244
246
+ ud 245
247
+ ue 246
248
+ uh 247
249
+ ui 248
250
+ uj 249
251
+ ukh 250
252
+ uo 251
253
+ up 252
254
+ uph 253
255
+ uq 254
256
+ us 255
257
+ ut 256
258
+ utɕ 257
259
+ utʃ 258
260
+ uu 259
261
+ uw 260
262
+ uɐ 261
263
+ uɛ 262
264
+ uɡ 263
265
+ uɯ 264
266
+ uɾ 265
267
+ uʌ 266
268
+ uː 267
269
+ v 268
270
+ w 269
271
+ z 270
272
+ ð 271
273
+ ŋ 272
274
+ ŋd 273
275
+ ŋe 274
276
+ ŋh 275
277
+ ŋi 276
278
+ ŋj 277
279
+ ŋkh 278
280
+ ŋo 279
281
+ ŋp 280
282
+ ŋph 281
283
+ ŋq 282
284
+ ŋs 283
285
+ ŋt 284
286
+ ŋtɕ 285
287
+ ŋtʃ 286
288
+ ŋu 287
289
+ ŋw 288
290
+ ŋɐ 289
291
+ ŋɛ 290
292
+ ŋɡ 291
293
+ ŋɯ 292
294
+ ŋɾ 293
295
+ ŋʌ 294
296
+ ɐ 295
297
+ ɐd 296
298
+ ɐe 297
299
+ ɐh 298
300
+ ɐi 299
301
+ ɐj 300
302
+ ɐkh 301
303
+ ɐm 302
304
+ ɐo 303
305
+ ɐp 304
306
+ ɐph 305
307
+ ɐq 306
308
+ ɐs 307
309
+ ɐt 308
310
+ ɐtɕ 309
311
+ ɐtʃ 310
312
+ ɐu 311
313
+ ɐw 312
314
+ ɐɐ 313
315
+ ɐɛ 314
316
+ ɐɡ 315
317
+ ɐɯ 316
318
+ ɐɾ 317
319
+ ɐʌ 318
320
+ ɑ 319
321
+ ɑː 320
322
+ ɒ 321
323
+ ɔ 322
324
+ ɔɪ 323
325
+ ɔː 324
326
+ ə 325
327
+ əl 326
328
+ əʊ 327
329
+ ɛ 328
330
+ ɛd 329
331
+ ɛe 330
332
+ ɛh 331
333
+ ɛi 332
334
+ ɛj 333
335
+ ɛkh 334
336
+ ɛo 335
337
+ ɛp 336
338
+ ɛph 337
339
+ ɛq 338
340
+ ɛs 339
341
+ ɛt 340
342
+ ɛtɕ 341
343
+ ɛtʃ 342
344
+ ɛu 343
345
+ ɛw 344
346
+ ɛɐ 345
347
+ ɛɛ 346
348
+ ɛɡ 347
349
+ ɛɯ 348
350
+ ɛɾ 349
351
+ ɛʌ 350
352
+ ɜː 351
353
+ ɡ 352
354
+ ɪ 353
355
+ ɫ 354
356
+ ɫd 355
357
+ ɫe 356
358
+ ɫh 357
359
+ ɫi 358
360
+ ɫj 359
361
+ ɫkh 360
362
+ ɫm 361
363
+ ɫo 362
364
+ ɫp 363
365
+ ɫph 364
366
+ ɫq 365
367
+ ɫs 366
368
+ ɫt 367
369
+ ɫtɕ 368
370
+ ɫtʃ 369
371
+ ɫu 370
372
+ ɫw 371
373
+ ɫɐ 372
374
+ ɫɛ 373
375
+ ɫɡ 374
376
+ ɫɯ 375
377
+ ɫɾ 376
378
+ ɫʌ 377
379
+ ɯ 378
380
+ ɯd 379
381
+ ɯe 380
382
+ ɯh 381
383
+ ɯi 382
384
+ ɯj 383
385
+ ɯkh 384
386
+ ɯo 385
387
+ ɯp 386
388
+ ɯph 387
389
+ ɯq 388
390
+ ɯs 389
391
+ ɯt 390
392
+ ɯtɕ 391
393
+ ɯtʃ 392
394
+ ɯu 393
395
+ ɯw 394
396
+ ɯɐ 395
397
+ ɯɛ 396
398
+ ɯɡ 397
399
+ ɯɯ 398
400
+ ɯɾ 399
401
+ ɯʌ 400
402
+ ɹ 401
403
+ ɾ 402
404
+ ʃ 403
405
+ ʊ 404
406
+ ʌ 405
407
+ ʌd 406
408
+ ʌe 407
409
+ ʌh 408
410
+ ʌi 409
411
+ ʌj 410
412
+ ʌkh 411
413
+ ʌo 412
414
+ ʌp 413
415
+ ʌph 414
416
+ ʌq 415
417
+ ʌs 416
418
+ ʌt 417
419
+ ʌtɕ 418
420
+ ʌtʃ 419
421
+ ʌu 420
422
+ ʌw 421
423
+ ʌɐ 422
424
+ ʌɛ 423
425
+ ʌɡ 424
426
+ ʌɯ 425
427
+ ʌɾ 426
428
+ ʌʌ 427
429
+ ʒ 428
430
+ ̃ 429
431
+ θ 430
432
+ … 431
vall-e_ko_v0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2894775b160c24132f8f6d6d7df6cc8bf59a9b465778d08989e200859610560
3
+ size 2699959616