Xtiphyn committed on
Commit 6bf8a6e · verified · 1 Parent(s): 4b603ff

Upload xaven_audio.ipynb

Files changed (1)
  1. xaven_audio.ipynb +419 -0
xaven_audio.ipynb ADDED
@@ -0,0 +1,419 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "colab": {
8
+ "background_save": true
9
+ },
10
+ "id": "ASOVL50_1iP0"
11
+ },
12
+ "outputs": [],
13
+ "source": [
14
+ "%%capture\n",
15
+ "%%bash\n",
16
+ "# %%capture (first line above) hides any output or errors from this cell,\n",
17
+ "# and %%bash runs the rest of the cell as a bash script\n",
18
+ "# (cell magics must be the very first lines of the cell, before any comments)\n",
19
+ "# Purpose: set up a minimal environment by quietly installing 'snac' (-q suppresses pip output)\n",
20
+ "\n",
21
+ "pip install -q snac\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "colab": {
29
+ "background_save": true
30
+ },
31
+ "id": "ZcwcX6AW2TAf",
32
+ "outputId": "3a81e287-b29d-43b6-aa91-6c090557379d"
33
+ },
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "torch: 2.6.0+cu124\n",
40
+ "cuda available: True\n",
41
+ "cuda device count: 1\n",
42
+ "current device: 0\n",
43
+ "device name: Tesla T4\n",
44
+ "bfloat16 supported: True\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "# Import PyTorch library for deep learning tasks\n",
50
+ "import torch\n",
51
+ "\n",
52
+ "# Print the installed PyTorch version\n",
53
+ "print(\"torch:\", torch.__version__)\n",
54
+ "\n",
55
+ "# Check if CUDA (NVIDIA GPU acceleration) is available on this machine\n",
56
+ "print(\"cuda available:\", torch.cuda.is_available())\n",
57
+ "\n",
58
+ "# Print the number of CUDA-capable GPU devices detected\n",
59
+ "print(\"cuda device count:\", torch.cuda.device_count())\n",
60
+ "\n",
61
+ "# If a GPU is available, display details about the current GPU device\n",
62
+ "if torch.cuda.is_available():\n",
63
+ " print(\"current device:\", torch.cuda.current_device()) # GPU device index in use\n",
64
+ " print(\"device name:\", torch.cuda.get_device_name(torch.cuda.current_device())) # GPU model name\n",
65
+ " print(\"bfloat16 supported:\", torch.cuda.is_bf16_supported()) # Whether bfloat16 precision is supported (useful for efficient training)\n",
66
+ "else:\n",
67
+ " # If no GPU is found, notify that computation will be done on CPU, which is slower\n",
68
+ " print(\"No GPU detected — we'll run on CPU (slower).\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {
75
+ "colab": {
76
+ "background_save": true
77
+ },
78
+ "id": "eKMY8bdT2zoj"
79
+ },
80
+ "outputs": [],
81
+ "source": [
82
+ "# Import PyTorch for tensor computations and model handling\n",
83
+ "import torch\n",
84
+ "\n",
85
+ "# Import tokenizer and causal language model classes from Hugging Face transformers library\n",
86
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
87
+ "\n",
88
+ "from snac import SNAC\n",
89
+ "\n",
90
+ "# Define the pre-trained voice synthesis model name to load from Hugging Face Hub\n",
91
+ "voice_model_name = \"webbigdata/VoiceCore\"\n",
92
+ "\n",
93
+ "# Define the SNAC model name (possibly for audio feature extraction or conditioning) to load from Hugging Face Hub\n",
94
+ "snac_model_name = \"hubertsiuzdak/snac_24khz\"\n"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {
101
+ "colab": {
102
+ "background_save": true
103
+ },
104
+ "id": "ixiO7XRu21is",
105
+ "outputId": "b130d165-fae6-4b5d-a007-b3181131ef68"
106
+ },
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "Loading VoiceCore model...\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "# Choose data type for model tensors:\n",
118
+ "# Use bfloat16 precision if supported by the GPU for faster and more memory-efficient computation,\n",
119
+ "# otherwise fallback to float16 precision\n",
120
+ "dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16\n",
121
+ "\n",
122
+ "# Inform the user that the VoiceCore voice generation model is being loaded\n",
123
+ "print(\"Loading VoiceCore model...\")"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "id": "GiabMUtc3D3Z"
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "# Load the pre-trained VoiceCore causal language model for voice generation:\n",
135
+ "# - from the specified model repository (voice_model_name)\n",
136
+ "# - using the selected data type (bfloat16 or float16) for optimized GPU usage\n",
137
+ "# - device_map=\"auto\" to automatically distribute the model across available devices (GPU/CPU)\n",
138
+ "# - use_cache=True enables caching past key values to speed up autoregressive generation\n",
139
+ "\n",
140
+ "voice_model = AutoModelForCausalLM.from_pretrained(\n",
141
+ " voice_model_name,\n",
142
+ " torch_dtype=dtype,\n",
143
+ " device_map=\"auto\",\n",
144
+ " use_cache=True\n",
145
+ ")\n",
146
+ "\n",
147
+ "# Load the tokenizer associated with the VoiceCore model for converting text to tokens\n",
148
+ "voice_tokenizer = AutoTokenizer.from_pretrained(voice_model_name)"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {
155
+ "id": "gv7M0hlB3znv"
156
+ },
157
+ "outputs": [],
158
+ "source": [
159
+ "]print(\"Loading SNAC decoder...\")\n",
160
+ "\n",
161
+ "# Load the SNAC model from the specified repository for audio decoding or processing\n",
162
+ "snac_model = SNAC.from_pretrained(snac_model_name)\n",
163
+ "\n",
164
+ "# Move the SNAC model to CPU (assuming it may not require GPU or for compatibility)\n",
165
+ "snac_model.to(\"cpu\")\n",
166
+ "\n",
167
+ "# Confirm that all models have been loaded without issues\n",
168
+ "print(\"Models loaded successfully.\")"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {
175
+ "id": "aV72Fdh1-jYk"
176
+ },
177
+ "outputs": [],
178
+ "source": [
179
+ "import scipy.io.wavfile as wavfile\n",
180
+ "from IPython.display import Audio, display\n",
181
+ "import torchaudio # Added torchaudio for saving the waveform\n",
182
+ "\n",
183
+ "# Available voices\n",
184
+ "voices = [\n",
185
+ " \"matsukaze_male\", # Refreshing male\n",
186
+ " \"amitaro_female\", # Cheerful girl\n",
187
+ " \"naraku_female\", # Calm woman\n",
188
+ " \"shiguu_male\", # Mature boy\n",
189
+ " \"sayoko_female\", # Elderly woman\n",
190
+ " \"nekketsu_female\", # Hot-blooded heroine\n",
191
+ " \"dahara1_male\" # General male\n",
192
+ "]\n",
193
+ "\n",
194
+ "# The text to speak\n",
195
+ "text = \"what am i eating this night\"\n"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {
202
+ "id": "KkGFFMLw6xvu"
203
+ },
204
+ "outputs": [],
205
+ "source": [
206
+ "import random\n",
207
+ "\n",
208
+ "# Select a random voice from the predefined voices list to generate speech\n",
209
+ "voice_type = random.choice(voices)\n",
210
+ "\n",
211
+ "# Prepare the text prompt for the voice generation model:\n",
212
+ "# Append \"[neutral]\" emotion tag to the chosen voice for neutral tone synthesis\n",
213
+ "chosen_voice = voice_type + \"[neutral]\"\n",
214
+ "\n",
215
+ "# Format prompt by combining voice tag and input text to guide the model's output\n",
216
+ "prompt = f\"{chosen_voice}: {text}\"\n",
217
+ "\n",
218
+ "# Tokenize the prompt text to get input IDs for the model (PyTorch tensors)\n",
219
+ "input_ids = voice_tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
220
+ "\n",
221
+ "# Define special tokens used for voice generation control:\n",
222
+ "# start_token marks the beginning of human speech segment\n",
223
+ "start_token = torch.tensor([[128259]], dtype=torch.int64)\n",
224
+ "\n",
225
+ "# end_tokens mark possible token IDs that indicate end of speech generation\n",
226
+ "end_tokens = torch.tensor([[128009, 128260, 128261]], dtype=torch.int64)\n"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "metadata": {
233
+ "id": "ocyFxQbx605H"
234
+ },
235
+ "outputs": [],
236
+ "source": [
237
+ "# Add special start and end tokens to the input token sequence:\n",
238
+ "# Concatenate start_token at the beginning, input_ids in the middle, and end_tokens at the end along the token dimension\n",
239
+ "modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)\n",
240
+ "\n",
241
+ "# Move the modified input tokens to the same device as the voice model (e.g., GPU) for faster processing\n",
242
+ "input_ids = modified_input_ids.to(voice_model.device)\n",
243
+ "\n",
244
+ "# Create an attention mask of ones with the same shape as input_ids to indicate all tokens should be attended to during inference\n",
245
+ "attention_mask = torch.ones_like(input_ids)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "metadata": {
252
+ "id": "tAN566Ch626k"
253
+ },
254
+ "outputs": [],
255
+ "source": [
256
+ "# 4) Generate audio tokens from the voice generation model based on the input prompt\n",
257
+ "\n",
258
+ "print(\"🎤 Generating voice tokens...\")\n",
259
+ "\n",
260
+ "# Generate token IDs representing the synthesized voice audio using autoregressive generation:\n",
261
+ "# - input_ids: tokenized prompt with start/end tokens\n",
262
+ "# - attention_mask: indicates tokens to attend to\n",
263
+ "# - max_new_tokens: limit max tokens generated to control output length\n",
264
+ "# - do_sample=True: sample tokens probabilistically for natural variation\n",
265
+ "# - temperature=0.6: controls randomness (lower = more focused)\n",
266
+ "# - top_p=0.9: nucleus sampling threshold to limit token pool\n",
267
+ "# - repetition_penalty=1.1: discourage repetitive tokens for more natural speech\n",
268
+ "# - eos_token_id=128258: token indicating end of sequence\n",
269
+ "# - use_cache=True: speed up generation with caching past states\n",
270
+ "\n",
271
+ "generated_ids = voice_model.generate(\n",
272
+ " input_ids=input_ids,\n",
273
+ " attention_mask=attention_mask,\n",
274
+ " max_new_tokens=8196,\n",
275
+ " do_sample=True,\n",
276
+ " temperature=0.6,\n",
277
+ " top_p=0.9,\n",
278
+ " repetition_penalty=1.1,\n",
279
+ " eos_token_id=128258,\n",
280
+ " use_cache=True\n",
281
+ ")\n",
282
+ "\n",
283
+ "# Print the generated token IDs representing the synthesized voice audio\n",
284
+ "print(generated_ids)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {
291
+ "id": "E5-f026j66ok"
292
+ },
293
+ "outputs": [],
294
+ "source": [
295
+ "# Extract the audio codes from the generated token sequence\n",
296
+ "\n",
297
+ "# Define tokens to locate and exclude:\n",
298
+ "# token_to_find marks the boundary before audio codes start\n",
299
+ "token_to_find = 128257\n",
300
+ "# token_to_remove is an end-of-sequence token to exclude\n",
301
+ "token_to_remove = 128258\n",
302
+ "\n",
303
+ "# Find all positions where token_to_find appears in generated_ids\n",
304
+ "token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)\n",
305
+ "\n",
306
+ "# If token_to_find exists, crop the generated_ids tensor to keep only tokens after its last occurrence\n",
307
+ "if len(token_indices[1]) > 0:\n",
308
+ " last_occurrence_idx = token_indices[1][-1].item()\n",
309
+ " cropped_tensor = generated_ids[:, last_occurrence_idx+1:]\n",
310
+ "else:\n",
311
+ " # If token_to_find is not found, keep the entire generated token sequence\n",
312
+ " cropped_tensor = generated_ids\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": null,
318
+ "metadata": {
319
+ "id": "d2uTSNha68j8"
320
+ },
321
+ "outputs": [],
322
+ "source": [
323
+ "# Remove all occurrences of the token_to_remove (end token) from the cropped tensor\n",
324
+ "processed_row = cropped_tensor[0][cropped_tensor[0] != token_to_remove]\n",
325
+ "\n",
326
+ "# Convert the filtered tensor of tokens into a Python list for easier processing\n",
327
+ "code_list = processed_row.tolist()\n",
328
+ "\n",
329
+ "# Adjust the length of the code list to be a multiple of 7 (required by downstream processing)\n",
330
+ "new_length = (len(code_list) // 7) * 7\n",
331
+ "\n",
332
+ "# Trim the list to the new length and normalize token values by subtracting 128266\n",
333
+ "# This likely converts tokens into audio code indices starting from zero\n",
334
+ "code_list = [t - 128266 for t in code_list[:new_length]]"
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "execution_count": null,
340
+ "metadata": {
341
+ "id": "9soI9X0F7Ag7"
342
+ },
343
+ "outputs": [],
344
+ "source": [
345
+ "# 6) Redistribute the processed audio codes into three separate SNAC layers\n",
346
+ "\n",
347
+ "# Initialize empty lists for each SNAC layer\n",
348
+ "layer_1, layer_2, layer_3 = [], [], []\n",
349
+ "\n",
350
+ "# Iterate over the code_list in chunks of 7 tokens each\n",
351
+ "for i in range(len(code_list) // 7):\n",
352
+ " # Append tokens to layer_1 and layer_2/3 with specific offsets to decode multi-layered representation\n",
353
+ " layer_1.append(code_list[7*i]) # First token goes to layer_1 as is\n",
354
+ " layer_2.append(code_list[7*i + 1] - 4096) # Second token shifted by 4096 for layer_2\n",
355
+ " layer_3.append(code_list[7*i + 2] - 8192) # Third token shifted by 8192 for layer_3\n",
356
+ " layer_3.append(code_list[7*i + 3] - 12288) # Fourth token shifted by 12288 for layer_3\n",
357
+ " layer_2.append(code_list[7*i + 4] - 16384) # Fifth token shifted by 16384 for layer_2\n",
358
+ " layer_3.append(code_list[7*i + 5] - 20480) # Sixth token shifted by 20480 for layer_3\n",
359
+ " layer_3.append(code_list[7*i + 6] - 24576) # Seventh token shifted by 24576 for layer_3\n",
360
+ "\n",
361
+ "# Convert each layer list to a PyTorch tensor and add a batch dimension with unsqueeze(0)\n",
362
+ "codes = [\n",
363
+ " torch.tensor(layer_1).unsqueeze(0),\n",
364
+ " torch.tensor(layer_2).unsqueeze(0),\n",
365
+ " torch.tensor(layer_3).unsqueeze(0)\n",
366
+ "]"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": null,
372
+ "metadata": {
373
+ "id": "vlQ4CFq-7Ef6"
374
+ },
375
+ "outputs": [],
376
+ "source": [
377
+ "# 7) Decode the redistributed SNAC codes into a waveform audio tensor\n",
378
+ "print(\" Decoding audio...\")\n",
379
+ "audio = snac_model.decode(codes) # Convert SNAC codes back into raw audio waveform\n",
380
+ "\n",
381
+ "# Convert the PyTorch tensor audio to a NumPy array after removing batch dimension and moving to CPU\n",
382
+ "audio_np = audio.detach().squeeze().cpu().numpy()\n",
383
+ "\n",
384
+ "# 8) Save the decoded audio waveform as a WAV file at 24kHz sample rate\n",
385
+ "filename = \"first_voice.wav\"\n",
386
+ "wavfile.write(filename, 24000, audio_np)\n",
387
+ "print(f\"Audio saved as {filename}\")\n",
388
+ "\n",
389
+ "# Play the generated audio inline in the notebook with the correct sampling rate\n",
390
+ "display(Audio(audio_np, rate=24000))"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": null,
396
+ "metadata": {
397
+ "id": "KyAyJSSeClL9"
398
+ },
399
+ "outputs": [],
400
+ "source": []
401
+ }
402
+ ],
403
+ "metadata": {
404
+ "accelerator": "GPU",
405
+ "colab": {
406
+ "gpuType": "T4",
407
+ "provenance": []
408
+ },
409
+ "kernelspec": {
410
+ "display_name": "Python 3",
411
+ "name": "python3"
412
+ },
413
+ "language_info": {
414
+ "name": "python"
415
+ }
416
+ },
417
+ "nbformat": 4,
418
+ "nbformat_minor": 0
419
+ }