Xtiphyn committed on
Commit 6bf8a6e · verified · 1 Parent(s): 4b603ff

Upload xaven_audio.ipynb

Files changed (1)
  1. xaven_audio.ipynb +419 -0
xaven_audio.ipynb ADDED
@@ -0,0 +1,419 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "colab": {
8
+ "background_save": true
9
+ },
10
+ "id": "ASOVL50_1iP0"
11
+ },
12
+ "outputs": [],
13
+ "source": [
14
+ "%%capture\n",
15
+ "%%bash\n",
16
+ "# %%capture (first line above) hides any output or errors from this cell,\n",
17
+ "# and %%bash runs the rest of the cell as a bash script\n",
18
+ "# (cell magics must be the very first lines of the cell, before any comments)\n",
19
+ "# Purpose: set up a minimal environment by quietly installing 'snac' (-q suppresses pip output)\n",
20
+ "\n",
21
+ "pip install -q snac\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "colab": {
29
+ "background_save": true
30
+ },
31
+ "id": "ZcwcX6AW2TAf",
32
+ "outputId": "3a81e287-b29d-43b6-aa91-6c090557379d"
33
+ },
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "torch: 2.6.0+cu124\n",
40
+ "cuda available: True\n",
41
+ "cuda device count: 1\n",
42
+ "current device: 0\n",
43
+ "device name: Tesla T4\n",
44
+ "bfloat16 supported: True\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "# Import PyTorch library for deep learning tasks\n",
50
+ "import torch\n",
51
+ "\n",
52
+ "# Print the installed PyTorch version\n",
53
+ "print(\"torch:\", torch.__version__)\n",
54
+ "\n",
55
+ "# Check if CUDA (NVIDIA GPU acceleration) is available on this machine\n",
56
+ "print(\"cuda available:\", torch.cuda.is_available())\n",
57
+ "\n",
58
+ "# Print the number of CUDA-capable GPU devices detected\n",
59
+ "print(\"cuda device count:\", torch.cuda.device_count())\n",
60
+ "\n",
61
+ "# If a GPU is available, display details about the current GPU device\n",
62
+ "if torch.cuda.is_available():\n",
63
+ " print(\"current device:\", torch.cuda.current_device()) # GPU device index in use\n",
64
+ " print(\"device name:\", torch.cuda.get_device_name(torch.cuda.current_device())) # GPU model name\n",
65
+ " print(\"bfloat16 supported:\", torch.cuda.is_bf16_supported()) # Whether bfloat16 precision is supported (useful for efficient training)\n",
66
+ "else:\n",
67
+ " # If no GPU is found, notify that computation will be done on CPU, which is slower\n",
68
+ " print(\"No GPU detected — we'll run on CPU (slower).\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {
75
+ "colab": {
76
+ "background_save": true
77
+ },
78
+ "id": "eKMY8bdT2zoj"
79
+ },
80
+ "outputs": [],
81
+ "source": [
82
+ "# Import PyTorch for tensor computations and model handling\n",
83
+ "import torch\n",
84
+ "\n",
85
+ "# Import tokenizer and causal language model classes from Hugging Face transformers library\n",
86
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
87
+ "\n",
88
+ "from snac import SNAC\n",
89
+ "\n",
90
+ "# Define the pre-trained voice synthesis model name to load from Hugging Face Hub\n",
91
+ "voice_model_name = \"webbigdata/VoiceCore\"\n",
92
+ "\n",
93
+ "# Define the SNAC model name (possibly for audio feature extraction or conditioning) to load from Hugging Face Hub\n",
94
+ "snac_model_name = \"hubertsiuzdak/snac_24khz\"\n"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {
101
+ "colab": {
102
+ "background_save": true
103
+ },
104
+ "id": "ixiO7XRu21is",
105
+ "outputId": "b130d165-fae6-4b5d-a007-b3181131ef68"
106
+ },
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "Loading VoiceCore model...\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "# Choose data type for model tensors:\n",
118
+ "# Use bfloat16 precision if supported by the GPU for faster and more memory-efficient computation,\n",
119
+ "# otherwise fallback to float16 precision\n",
120
+ "dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16\n",
121
+ "\n",
122
+ "# Inform the user that the VoiceCore voice generation model is being loaded\n",
123
+ "print(\"Loading VoiceCore model...\")"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "id": "GiabMUtc3D3Z"
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "# Load the pre-trained VoiceCore causal language model for voice generation:\n",
135
+ "# - from the specified model repository (voice_model_name)\n",
136
+ "# - using the selected data type (bfloat16 or float16) for optimized GPU usage\n",
137
+ "# - device_map=\"auto\" to automatically distribute the model across available devices (GPU/CPU)\n",
138
+ "# - use_cache=True enables caching past key values to speed up autoregressive generation\n",
139
+ "\n",
140
+ "voice_model = AutoModelForCausalLM.from_pretrained(\n",
141
+ " voice_model_name,\n",
142
+ " torch_dtype=dtype,\n",
143
+ " device_map=\"auto\",\n",
144
+ " use_cache=True\n",
145
+ ")\n",
146
+ "\n",
147
+ "# Load the tokenizer associated with the VoiceCore model for converting text to tokens\n",
148
+ "voice_tokenizer = AutoTokenizer.from_pretrained(voice_model_name)"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {
155
+ "id": "gv7M0hlB3znv"
156
+ },
157
+ "outputs": [],
158
+ "source": [
159
+ "]print(\"Loading SNAC decoder...\")\n",
160
+ "\n",
161
+ "# Load the SNAC model from the specified repository for audio decoding or processing\n",
162
+ "snac_model = SNAC.from_pretrained(snac_model_name)\n",
163
+ "\n",
164
+ "# Move the SNAC model to CPU (assuming it may not require GPU or for compatibility)\n",
165
+ "snac_model.to(\"cpu\")\n",
166
+ "\n",
167
+ "# Confirm that all models have been loaded without issues\n",
168
+ "print(\"Models loaded successfully.\")"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {
175
+ "id": "aV72Fdh1-jYk"
176
+ },
177
+ "outputs": [],
178
+ "source": [
179
+ "import scipy.io.wavfile as wavfile\n",
180
+ "from IPython.display import Audio, display\n",
181
+ "import torchaudio # Added torchaudio for saving the waveform\n",
182
+ "\n",
183
+ "# Available voices\n",
184
+ "voices = [\n",
185
+ " \"matsukaze_male\", # Refreshing male\n",
186
+ " \"amitaro_female\", # Cheerful girl\n",
187
+ " \"naraku_female\", # Calm woman\n",
188
+ " \"shiguu_male\", # Mature boy\n",
189
+ " \"sayoko_female\", # Elderly woman\n",
190
+ " \"nekketsu_female\", # Hot-blooded heroine\n",
191
+ " \"dahara1_male\" # General male\n",
192
+ "]\n",
193
+ "\n",
194
+ "# The text to speak\n",
195
+ "text = \"what am i eating this night\"\n"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {
202
+ "id": "KkGFFMLw6xvu"
203
+ },
204
+ "outputs": [],
205
+ "source": [
206
+ "import random\n",
207
+ "\n",
208
+ "# Select a random voice from the predefined voices list to generate speech\n",
209
+ "voice_type = random.choice(voices)\n",
210
+ "\n",
211
+ "# Prepare the text prompt for the voice generation model:\n",
212
+ "# Append \"[neutral]\" emotion tag to the chosen voice for neutral tone synthesis\n",
213
+ "chosen_voice = voice_type + \"[neutral]\"\n",
214
+ "\n",
215
+ "# Format prompt by combining voice tag and input text to guide the model's output\n",
216
+ "prompt = f\"{chosen_voice}: {text}\"\n",
217
+ "\n",
218
+ "# Tokenize the prompt text to get input IDs for the model (PyTorch tensors)\n",
219
+ "input_ids = voice_tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
220
+ "\n",
221
+ "# Define special tokens used for voice generation control:\n",
222
+ "# start_token marks the beginning of human speech segment\n",
223
+ "start_token = torch.tensor([[128259]], dtype=torch.int64)\n",
224
+ "\n",
225
+ "# end_tokens mark possible token IDs that indicate end of speech generation\n",
226
+ "end_tokens = torch.tensor([[128009, 128260, 128261]], dtype=torch.int64)\n"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "metadata": {
233
+ "id": "ocyFxQbx605H"
234
+ },
235
+ "outputs": [],
236
+ "source": [
237
+ "# Add special start and end tokens to the input token sequence:\n",
238
+ "# Concatenate start_token at the beginning, input_ids in the middle, and end_tokens at the end along the token dimension\n",
239
+ "modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)\n",
240
+ "\n",
241
+ "# Move the modified input tokens to the same device as the voice model (e.g., GPU) for faster processing\n",
242
+ "input_ids = modified_input_ids.to(voice_model.device)\n",
243
+ "\n",
244
+ "# Create an attention mask of ones with the same shape as input_ids to indicate all tokens should be attended to during inference\n",
245
+ "attention_mask = torch.ones_like(input_ids)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "metadata": {
252
+ "id": "tAN566Ch626k"
253
+ },
254
+ "outputs": [],
255
+ "source": [
256
+ "# 4) Generate audio tokens from the voice generation model based on the input prompt\n",
257
+ "\n",
258
+ "print(\"🎤 Generating voice tokens...\")\n",
259
+ "\n",
260
+ "# Generate token IDs representing the synthesized voice audio using autoregressive generation:\n",
261
+ "# - input_ids: tokenized prompt with start/end tokens\n",
262
+ "# - attention_mask: indicates tokens to attend to\n",
263
+ "# - max_new_tokens: limit max tokens generated to control output length\n",
264
+ "# - do_sample=True: sample tokens probabilistically for natural variation\n",
265
+ "# - temperature=0.6: controls randomness (lower = more focused)\n",
266
+ "# - top_p=0.9: nucleus sampling threshold to limit token pool\n",
267
+ "# - repetition_penalty=1.1: discourage repetitive tokens for more natural speech\n",
268
+ "# - eos_token_id=128258: token indicating end of sequence\n",
269
+ "# - use_cache=True: speed up generation with caching past states\n",
270
+ "\n",
271
+ "generated_ids = voice_model.generate(\n",
272
+ " input_ids=input_ids,\n",
273
+ " attention_mask=attention_mask,\n",
274
+ " max_new_tokens=8196,\n",
275
+ " do_sample=True,\n",
276
+ " temperature=0.6,\n",
277
+ " top_p=0.9,\n",
278
+ " repetition_penalty=1.1,\n",
279
+ " eos_token_id=128258,\n",
280
+ " use_cache=True\n",
281
+ ")\n",
282
+ "\n",
283
+ "# Print the generated token IDs representing the synthesized voice audio\n",
284
+ "print(generated_ids)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {
291
+ "id": "E5-f026j66ok"
292
+ },
293
+ "outputs": [],
294
+ "source": [
295
+ "# Extract the audio codes from the generated token sequence\n",
296
+ "\n",
297
+ "# Define tokens to locate and exclude:\n",
298
+ "# token_to_find marks the boundary before audio codes start\n",
299
+ "token_to_find = 128257\n",
300
+ "# token_to_remove is an end-of-sequence token to exclude\n",
301
+ "token_to_remove = 128258\n",
302
+ "\n",
303
+ "# Find all positions where token_to_find appears in generated_ids\n",
304
+ "token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)\n",
305
+ "\n",
306
+ "# If token_to_find exists, crop the generated_ids tensor to keep only tokens after its last occurrence\n",
307
+ "if len(token_indices[1]) > 0:\n",
308
+ " last_occurrence_idx = token_indices[1][-1].item()\n",
309
+ " cropped_tensor = generated_ids[:, last_occurrence_idx+1:]\n",
310
+ "else:\n",
311
+ " # If token_to_find is not found, keep the entire generated token sequence\n",
312
+ " cropped_tensor = generated_ids\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": null,
318
+ "metadata": {
319
+ "id": "d2uTSNha68j8"
320
+ },
321
+ "outputs": [],
322
+ "source": [
323
+ "# Remove all occurrences of the token_to_remove (end token) from the cropped tensor\n",
324
+ "processed_row = cropped_tensor[0][cropped_tensor[0] != token_to_remove]\n",
325
+ "\n",
326
+ "# Convert the filtered tensor of tokens into a Python list for easier processing\n",
327
+ "code_list = processed_row.tolist()\n",
328
+ "\n",
329
+ "# Adjust the length of the code list to be a multiple of 7 (required by downstream processing)\n",
330
+ "new_length = (len(code_list) // 7) * 7\n",
331
+ "\n",
332
+ "# Trim the list to the new length and normalize token values by subtracting 128266\n",
333
+ "# This likely converts tokens into audio code indices starting from zero\n",
334
+ "code_list = [t - 128266 for t in code_list[:new_length]]"
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "execution_count": null,
340
+ "metadata": {
341
+ "id": "9soI9X0F7Ag7"
342
+ },
343
+ "outputs": [],
344
+ "source": [
345
+ "# 6) Redistribute the processed audio codes into three separate SNAC layers\n",
346
+ "\n",
347
+ "# Initialize empty lists for each SNAC layer\n",
348
+ "layer_1, layer_2, layer_3 = [], [], []\n",
349
+ "\n",
350
+ "# Iterate over the code_list in chunks of 7 tokens each\n",
351
+ "for i in range(len(code_list) // 7):\n",
352
+ " # Append tokens to layer_1 and layer_2/3 with specific offsets to decode multi-layered representation\n",
353
+ " layer_1.append(code_list[7*i]) # First token goes to layer_1 as is\n",
354
+ " layer_2.append(code_list[7*i + 1] - 4096) # Second token shifted by 4096 for layer_2\n",
355
+ " layer_3.append(code_list[7*i + 2] - 8192) # Third token shifted by 8192 for layer_3\n",
356
+ " layer_3.append(code_list[7*i + 3] - 12288) # Fourth token shifted by 12288 for layer_3\n",
357
+ " layer_2.append(code_list[7*i + 4] - 16384) # Fifth token shifted by 16384 for layer_2\n",
358
+ " layer_3.append(code_list[7*i + 5] - 20480) # Sixth token shifted by 20480 for layer_3\n",
359
+ " layer_3.append(code_list[7*i + 6] - 24576) # Seventh token shifted by 24576 for layer_3\n",
360
+ "\n",
361
+ "# Convert each layer list to a PyTorch tensor and add a batch dimension with unsqueeze(0)\n",
362
+ "codes = [\n",
363
+ " torch.tensor(layer_1).unsqueeze(0),\n",
364
+ " torch.tensor(layer_2).unsqueeze(0),\n",
365
+ " torch.tensor(layer_3).unsqueeze(0)\n",
366
+ "]"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": null,
372
+ "metadata": {
373
+ "id": "vlQ4CFq-7Ef6"
374
+ },
375
+ "outputs": [],
376
+ "source": [
377
+ "# 7) Decode the redistributed SNAC codes into a waveform audio tensor\n",
378
+ "print(\" Decoding audio...\")\n",
379
+ "audio = snac_model.decode(codes) # Convert SNAC codes back into raw audio waveform\n",
380
+ "\n",
381
+ "# Convert the PyTorch tensor audio to a NumPy array after removing batch dimension and moving to CPU\n",
382
+ "audio_np = audio.detach().squeeze().cpu().numpy()\n",
383
+ "\n",
384
+ "# 8) Save the decoded audio waveform as a WAV file at 24kHz sample rate\n",
385
+ "filename = \"first_voice.wav\"\n",
386
+ "wavfile.write(filename, 24000, audio_np)\n",
387
+ "print(f\"Audio saved as {filename}\")\n",
388
+ "\n",
389
+ "# Play the generated audio inline in the notebook with the correct sampling rate\n",
390
+ "display(Audio(audio_np, rate=24000))"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": null,
396
+ "metadata": {
397
+ "id": "KyAyJSSeClL9"
398
+ },
399
+ "outputs": [],
400
+ "source": []
401
+ }
402
+ ],
403
+ "metadata": {
404
+ "accelerator": "GPU",
405
+ "colab": {
406
+ "gpuType": "T4",
407
+ "provenance": []
408
+ },
409
+ "kernelspec": {
410
+ "display_name": "Python 3",
411
+ "name": "python3"
412
+ },
413
+ "language_info": {
414
+ "name": "python"
415
+ }
416
+ },
417
+ "nbformat": 4,
418
+ "nbformat_minor": 0
419
+ }