{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Bark text-to-speech voice cloning.\n",
    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).\n",
    "(This version of the notebook is made to work on Google Colab; make sure your runtime hardware accelerator is set to GPU.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Google Colab: Clone the repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/\n",
    "%cd bark-voice-cloning-HuBERT-quantizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Install packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%pip install -r requirements.txt\n",
    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading HuBERT...\n",
      "Loading Quantizer...\n",
      "Loading Encodec...\n",
      "Downloaded and loaded models!\n"
     ]
    }
   ],
   "source": [
    "# Settings a reader may want to change.\n",
    "large_quant_model = False  # Use the larger pretrained model\n",
    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchaudio\n",
    "from encodec import EncodecModel\n",
    "from encodec.utils import convert_audio\n",
    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
    "\n",
    "# (checkpoint name, local file name) of the quantizer selected by `large_quant_model`.\n",
    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
    "\n",
    "print('Loading HuBERT...')\n",
    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
    "print('Loading Quantizer...')\n",
    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
    "print('Loading Encodec...')\n",
    "encodec_model = EncodecModel.encodec_model_24khz()\n",
    "encodec_model.set_target_bandwidth(6.0)\n",
    "encodec_model.to(device)\n",
    "\n",
    "print('Downloaded and loaded models!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load wav and create speaker history prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting semantics...\n",
      "Tokenizing semantics...\n",
      "Creating coarse and fine prompts...\n",
      "Done!\n"
     ]
    }
   ],
   "source": [
    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
    "\n",
    "wav, sr = torchaudio.load(wav_file)\n",
    "\n",
    "# Downmix once, for any channel count (not only stereo), so that both HuBERT\n",
    "# and Encodec below receive a mono signal.\n",
    "if wav.shape[0] > 1:\n",
    "    wav = wav.mean(0, keepdim=True)\n",
    "\n",
    "wav_hubert = wav.to(device)\n",
    "\n",
    "# Everything below is inference only, so no autograd graph is needed.\n",
    "with torch.no_grad():\n",
    "    print('Extracting semantics...')\n",
    "    semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
    "    print('Tokenizing semantics...')\n",
    "    semantic_tokens = quant_model.get_token(semantic_vectors)\n",
    "    print('Creating coarse and fine prompts...')\n",
    "    # Resample to the Encodec sample rate and add a leading batch axis.\n",
    "    wav_encodec = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
    "    encoded_frames = encodec_model.encode(wav_encodec.to(device))\n",
    "\n",
    "# Drop only the batch axis: a bare squeeze() would also collapse a length-1\n",
    "# time axis and break the 2-D slicing used for the coarse prompt.\n",
    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)\n",
    "\n",
    "# Move to host memory and convert explicitly before handing over to numpy.\n",
    "codes = codes.cpu().numpy()\n",
    "semantic_tokens = semantic_tokens.cpu().numpy()\n",
    "\n",
    "# The coarse prompt is the first two rows of the fine prompt.\n",
    "np.savez(out_file,\n",
    "         semantic_prompt=semantic_tokens,\n",
    "         fine_prompt=codes,\n",
    "         coarse_prompt=codes[:2, :]\n",
    "         )\n",
    "\n",
    "print('Done!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}