Spaces:

RenderAI
/

bark-cloning

No application file

File size: 4,965 Bytes

b5dba8a

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Bark text-to-speech voice cloning.\n",
    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Install packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%pip install -r requirements.txt\n",
    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading HuBERT...\n",
      "Loading Quantizer...\n",
      "Loading Encodec...\n",
      "Downloaded and loaded models!\n"
     ]
    }
   ],
   "source": [
    "large_quant_model = False  # Use the larger pretrained model\n",
    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchaudio\n",
    "from encodec import EncodecModel\n",
    "from encodec.utils import convert_audio\n",
    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
    "\n",
    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
    "\n",
    "print('Loading HuBERT...')\n",
    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
    "print('Loading Quantizer...')\n",
    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
    "print('Loading Encodec...')\n",
    "encodec_model = EncodecModel.encodec_model_24khz()\n",
    "encodec_model.set_target_bandwidth(6.0)\n",
    "encodec_model.to(device)\n",
    "\n",
    "print('Downloaded and loaded models!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load wav and create speaker history prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting semantics...\n",
      "Tokenizing semantics...\n",
      "Creating coarse and fine prompts...\n",
      "Done!\n"
     ]
    }
   ],
   "source": [
    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
    "\n",
    "wav, sr = torchaudio.load(wav_file)\n",
    "\n",
    "wav_hubert = wav.to(device)\n",
    "\n",
    "if wav_hubert.shape[0] == 2:  # Stereo to mono if needed\n",
    "    wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
    "\n",
    "print('Extracting semantics...')\n",
    "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
    "print('Tokenizing semantics...')\n",
    "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
    "print('Creating coarse and fine prompts...')\n",
    "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
    "\n",
    "wav = wav.to(device)\n",
    "\n",
    "with torch.no_grad():\n",
    "    encoded_frames = encodec_model.encode(wav)\n",
    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
    "\n",
    "codes = codes.cpu()\n",
    "semantic_tokens = semantic_tokens.cpu()\n",
    "\n",
    "np.savez(out_file,\n",
    "         semantic_prompt=semantic_tokens,\n",
    "         fine_prompt=codes,\n",
    "         coarse_prompt=codes[:2, :]\n",
    "         )\n",
    "\n",
    "print('Done!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}