jameshuntercarter committed on
Commit
b5dba8a
1 Parent(s): ed39a03

Upload 24 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sample-speaker.wav filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Mylo
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
args.py ADDED
@@ -0,0 +1,10 @@
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser()
+
+ parser.add_argument('--path', required=True, help='The path containing your semantic tokens and wavs')
+ parser.add_argument('--mode', required=True, help='The mode to use', choices=['prepare', 'prepare2', 'train', 'test'])
+ parser.add_argument('--hubert-model', default='model/hubert/hubert_base_ls960.pt', help='The HuBERT model to use for preparing the data and later creation of semantic tokens.')
+ parser.add_argument('--train-save-epochs', default=1, type=int, help='The number of epochs to train before saving')
+
+ args = parser.parse_args()
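Note: these flags are consumed by `process.py`, added later in this commit. A minimal sketch of how a command line maps onto `args` (the folder name "Literature" is illustrative, not part of this commit):

```python
# Sketch: simulating `python process.py --path Literature --mode train`.
import sys

sys.argv[1:] = ['--path', 'Literature', '--mode', 'train']

from args import args  # parsing happens at import time

assert args.path == 'Literature'
assert args.mode == 'train'
assert args.train_save_epochs == 1  # default value
```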
bark_hubert_quantizer/__init__.py ADDED
File without changes
bark_hubert_quantizer/customtokenizer.py ADDED
@@ -0,0 +1,200 @@
+ """
+ Custom tokenizer model.
+ Author: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ import json
+ import os.path
+ from zipfile import ZipFile
+
+ import numpy
+ import torch
+ from torch import nn, optim
+ from torch.serialization import MAP_LOCATION
+
+
+ class CustomTokenizer(nn.Module):
+     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+         super(CustomTokenizer, self).__init__()
+         next_size = input_size
+         if version == 0:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             next_size = hidden_size
+         if version == 1:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             self.intermediate = nn.Linear(hidden_size, 4096)
+             next_size = 4096
+
+         self.fc = nn.Linear(next_size, output_size)
+         self.softmax = nn.LogSoftmax(dim=1)
+         self.optimizer: optim.Optimizer = None
+         self.lossfunc = nn.CrossEntropyLoss()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     def forward(self, x):
+         x, _ = self.lstm(x)
+         if self.version == 1:
+             x = self.intermediate(x)
+         x = self.fc(x)
+         x = self.softmax(x)
+         return x
+
+     @torch.no_grad()
+     def get_token(self, x):
+         """
+         Used to get the tokens for a batch of semantic feature vectors.
+         :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+         :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+         """
+         return torch.argmax(self(x), dim=1)
+
+     def prepare_training(self):
+         self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+     def train_step(self, x_train, y_train, log_loss=False):
+         optimizer = self.optimizer
+         lossfunc = self.lossfunc
+         # Zero the gradients
+         self.zero_grad()
+
+         # Forward pass
+         y_pred = self(x_train)
+
+         # The features and the labels can differ slightly in length; trim
+         # whichever side is longer so the loss is computed over aligned positions.
+         y_train_len = len(y_train)
+         y_pred_len = y_pred.shape[0]
+
+         if y_train_len > y_pred_len:
+             diff = y_train_len - y_pred_len
+             y_train = y_train[diff:]
+         elif y_train_len < y_pred_len:
+             diff = y_pred_len - y_train_len
+             y_pred = y_pred[:-diff, :]
+
+         # One-hot encode the target tokens, on the same device as the predictions
+         y_train_hot = torch.zeros(len(y_train), self.output_size)
+         y_train_hot[range(len(y_train)), y_train] = 1
+         y_train_hot = y_train_hot.to(y_pred.device)
+
+         # Calculate the loss
+         loss = lossfunc(y_pred, y_train_hot)
+
+         # Print loss
+         if log_loss:
+             print('Loss', loss.item())
+
+         # Backward pass
+         loss.backward()
+
+         # Update the weights
+         optimizer.step()
+
+     def save(self, path):
+         info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
+         torch.save(self.state_dict(), path)
+         data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+         # Append the hyperparameters to the checkpoint zip as a .info entry
+         with ZipFile(path, 'a') as model_zip:
+             model_zip.writestr(info_path, data_from_model.save())
+
+     @staticmethod
+     def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
+         old = True
+         with ZipFile(path) as model_zip:
+             files_match = [file for file in model_zip.namelist() if file.endswith('/.info')]
+             file = files_match[0] if files_match else None
+             if file:
+                 old = False
+                 data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
+         if old:
+             model = CustomTokenizer()
+         else:
+             model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
+         model.load_state_dict(torch.load(path, map_location=map_location))
+         if map_location:
+             model = model.to(map_location)
+         return model
+
+
+ class Data:
+     input_size: int
+     hidden_size: int
+     output_size: int
+     version: int
+
+     def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     @staticmethod
+     def load(string):
+         data = json.loads(string)
+         return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
+
+     def save(self):
+         data = {
+             'input_size': self.input_size,
+             'hidden_size': self.hidden_size,
+             'output_size': self.output_size,
+             'version': self.version,
+         }
+         return json.dumps(data)
+
+
+ def auto_train(data_path, save_path='model.pth', load_model: str | None = None, save_epochs=1):
+     data_x, data_y = {}, {}
+
+     if load_model and os.path.isfile(load_model):
+         print('Loading model from', load_model)
+         model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
+     else:
+         print('Creating new model.')
+         model_training = CustomTokenizer(version=1).to('cuda')
+     save_path = os.path.join(data_path, save_path)
+     base_save_path = '.'.join(save_path.split('.')[:-1])
+
+     sem_string = '_semantic.npy'
+     feat_string = '_semantic_features.npy'
+
+     ready = os.path.join(data_path, 'ready')
+     for input_file in os.listdir(ready):
+         full_path = os.path.join(ready, input_file)
+         # Files are named '{N}_semantic.npy' and '{N}_semantic_features.npy'
+         number = int(input_file.split('_')[0])
+         if input_file.endswith(sem_string):
+             data_y[number] = numpy.load(full_path)
+         elif input_file.endswith(feat_string):
+             data_x[number] = numpy.load(full_path)
+
+     model_training.prepare_training()
+     epoch = 1
+
+     while True:
+         for _ in range(save_epochs):
+             j = 0
+             for i in range(max(len(data_x), len(data_y))):
+                 x = data_x.get(i)
+                 y = data_y.get(i)
+                 if x is None or y is None:
+                     print(f'The training data does not match. key={i}')
+                     continue
+                 model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0)  # Print loss every 50 steps
+                 j += 1
+         save_p = save_path
+         save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
+         model_training.save(save_p)
+         model_training.save(save_p_2)
+         print(f'Epoch {epoch} completed')
+         epoch += 1
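A minimal usage sketch for the class above, assuming a checkpoint saved by `CustomTokenizer.save` at the default `HuBERTManager` location; the feature file name follows the `ready/` naming convention used by `auto_train` and is illustrative:

```python
# Sketch: tokenize precomputed HuBERT features with a trained checkpoint.
import numpy
import torch

from bark_hubert_quantizer.customtokenizer import CustomTokenizer

tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', 'cpu')

features = torch.from_numpy(numpy.load('0_semantic_features.npy'))  # (N, input_size); illustrative file
semantic_tokens = tokenizer.get_token(features)  # shape (N,), values in [0, output_size)
```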
bark_hubert_quantizer/hubert_manager.py ADDED
@@ -0,0 +1,33 @@
+ import os.path
+ import shutil
+ import urllib.request
+
+ import huggingface_hub
+
+
+ class HuBERTManager:
+     @staticmethod
+     def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
+         install_dir = os.path.join('data', 'models', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, file_name)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT base model')
+             urllib.request.urlretrieve(download_url, install_file)
+             print('Downloaded HuBERT')
+         return install_file
+
+     @staticmethod
+     def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+         install_dir = os.path.join('data', 'models', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, local_file)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT custom tokenizer')
+             huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
+             shutil.move(os.path.join(install_dir, model), install_file)
+             print('Downloaded tokenizer')
+         return install_file
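Both helpers skip the download when the file already exists, so they can safely run at startup; a sketch (the paths in the comments are the defaults created under `data/models/hubert/`):

```python
# Sketch: make sure both checkpoints exist before loading any models.
from bark_hubert_quantizer.hubert_manager import HuBERTManager

hubert_path = HuBERTManager.make_sure_hubert_installed()        # -> data/models/hubert/hubert.pt
tokenizer_path = HuBERTManager.make_sure_tokenizer_installed()  # -> data/models/hubert/tokenizer.pth
```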
bark_hubert_quantizer/pre_kmeans_hubert.py ADDED
@@ -0,0 +1,106 @@
+ """
+ Modified HuBERT model without kmeans.
+ Original author: https://github.com/lucidrains/
+ Modified by: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+ from pathlib import Path
+
+ import torch
+ from torch import nn
+ from einops import pack, unpack
+
+ import fairseq
+
+ from torchaudio.functional import resample
+
+ from audiolm_pytorch.utils import curtail_to_multiple
+
+ import logging
+ logging.root.setLevel(logging.ERROR)
+
+
+ def exists(val):
+     return val is not None
+
+
+ def default(val, d):
+     return val if exists(val) else d
+
+
+ class CustomHubert(nn.Module):
+     """
+     checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+     or you can train your own
+     """
+
+     def __init__(
+         self,
+         checkpoint_path,
+         target_sample_hz=16000,
+         seq_len_multiple_of=None,
+         output_layer=9,
+         device=None
+     ):
+         super().__init__()
+         self.target_sample_hz = target_sample_hz
+         self.seq_len_multiple_of = seq_len_multiple_of
+         self.output_layer = output_layer
+
+         if device is not None:
+             self.to(device)
+
+         model_path = Path(checkpoint_path)
+
+         assert model_path.exists(), f'path {checkpoint_path} does not exist'
+
+         checkpoint = torch.load(checkpoint_path, map_location=device)
+         load_model_input = {checkpoint_path: checkpoint}
+         model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+
+         if device is not None:
+             model[0].to(device)
+
+         self.model = model[0]
+         self.model.eval()
+
+     @property
+     def groups(self):
+         return 1
+
+     @torch.no_grad()
+     def forward(
+         self,
+         wav_input,
+         flatten=True,
+         input_sample_hz=None
+     ):
+         device = wav_input.device
+
+         if exists(input_sample_hz):
+             wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+         if exists(self.seq_len_multiple_of):
+             wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+         embed = self.model(
+             wav_input,
+             features_only=True,
+             mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
+             output_layer=self.output_layer
+         )
+
+         embed, packed_shape = pack([embed['x']], '* d')
+
+         # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
+
+         codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
+
+         if flatten:
+             return codebook_indices
+
+         codebook_indices, = unpack(codebook_indices, packed_shape, '*')
+         return codebook_indices
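A short sketch of extracting semantic vectors with the class above, assuming the checkpoint has already been downloaded (for example via `HuBERTManager`, whose default path is used here); `sample-speaker.wav` is the sample file added in this commit:

```python
# Sketch: extract semantic feature vectors from an audio file.
import torchaudio

from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert

hubert = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')

wav, sr = torchaudio.load('sample-speaker.wav')
if wav.shape[0] == 2:  # stereo to mono
    wav = wav.mean(0, keepdim=True)

semantic_vectors = hubert.forward(wav, input_sample_hz=sr)  # (N, 768) features
```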
cog.yaml ADDED
@@ -0,0 +1,20 @@
+ build:
+   gpu: true
+   cuda: "11.8"
+   python_version: "3.10"
+   python_packages:
+     - "audiolm-pytorch==1.1.4"
+     - "fairseq"
+     - "huggingface-hub"
+     - "sentencepiece"
+     - "transformers"
+     - "encodec"
+     - 'soundfile; platform_system == "Windows"'
+     - 'sox; platform_system != "Windows"'
+     - "tensorboardX"
+     - "torch"
+     - "torchvision"
+     - "torchaudio"
+     - "light-the-torch"
+
+ predict: "predict.py:Predictor"
colab_notebook.ipynb ADDED
@@ -0,0 +1,202 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Bark text-to-speech voice cloning.\n",
+ "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).\n",
+ "(This version of the notebook is made to work on Google Colab, make sure your runtime hardware accelerator is set to GPU)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Google Colab: Clone the repository"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/\n",
+ "%cd bark-voice-cloning-HuBERT-quantizer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Install packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%pip install -r requirements.txt\n",
+ "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading HuBERT...\n",
+ "Loading Quantizer...\n",
+ "Loading Encodec...\n",
+ "Downloaded and loaded models!\n"
+ ]
+ }
+ ],
+ "source": [
+ "large_quant_model = False # Use the larger pretrained model\n",
+ "device = 'cuda' # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torchaudio\n",
+ "from encodec import EncodecModel\n",
+ "from encodec.utils import convert_audio\n",
+ "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+ "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+ "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+ "\n",
+ "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+ "\n",
+ "print('Loading HuBERT...')\n",
+ "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+ "print('Loading Quantizer...')\n",
+ "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+ "print('Loading Encodec...')\n",
+ "encodec_model = EncodecModel.encodec_model_24khz()\n",
+ "encodec_model.set_target_bandwidth(6.0)\n",
+ "encodec_model.to(device)\n",
+ "\n",
+ "print('Downloaded and loaded models!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load wav and create speaker history prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Extracting semantics...\n",
+ "Tokenizing semantics...\n",
+ "Creating coarse and fine prompts...\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "wav_file = 'speaker.wav' # Put the path of the speaker you want to use here.\n",
+ "out_file = 'speaker.npz' # Put the path to save the cloned speaker to here.\n",
+ "\n",
+ "wav, sr = torchaudio.load(wav_file)\n",
+ "\n",
+ "wav_hubert = wav.to(device)\n",
+ "\n",
+ "if wav_hubert.shape[0] == 2: # Stereo to mono if needed\n",
+ " wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+ "\n",
+ "print('Extracting semantics...')\n",
+ "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+ "print('Tokenizing semantics...')\n",
+ "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+ "print('Creating coarse and fine prompts...')\n",
+ "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+ "\n",
+ "wav = wav.to(device)\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " encoded_frames = encodec_model.encode(wav)\n",
+ "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+ "\n",
+ "codes = codes.cpu()\n",
+ "semantic_tokens = semantic_tokens.cpu()\n",
+ "\n",
+ "np.savez(out_file,\n",
+ " semantic_prompt=semantic_tokens,\n",
+ " fine_prompt=codes,\n",
+ " coarse_prompt=codes[:2, :]\n",
+ " )\n",
+ "\n",
+ "print('Done!')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
data/.DS_Store ADDED
Binary file (6.15 kB)
 
data/models/.DS_Store ADDED
Binary file (6.15 kB)
 
data/models/hubert/hubert_base_ls960.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1703cf8d2cdc76f8c046f5f6a9bcd224e0e6caf4744cad1a1f4199c32cac8c8d
+ size 1136468879
data/models/hubert/quantifier_V1_hubert_base_ls960_23.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d94c5dd646bcfe1a8bb470372f0004c189acf65d913831f3a6ed6414c9ba86f
+ size 243656111
data/models/hubert/quantifier_hubert_base_ls960_14.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cf7eeab58835c5fc1cfbd3fd19c457fbd07859a5f036a6bfea4b6840716c1e7
+ size 103981977
examples/biden_example.mov ADDED
Binary file (73.7 kB)
 
install_hubert.py ADDED
@@ -0,0 +1,28 @@
+ # SETUP
+ large_quant_model = False  # Use the larger pretrained model
+ device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+ import numpy as np
+ import torch
+ import torchaudio
+ from encodec import EncodecModel
+ from encodec.utils import convert_audio
+ from bark_hubert_quantizer.hubert_manager import HuBERTManager
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+ model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+     'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+ print('Loading HuBERT...')
+ hubert_model = CustomHubert(
+     HuBERTManager.make_sure_hubert_installed(), device=device)
+ print('Loading Quantizer...')
+ quant_model = CustomTokenizer.load_from_checkpoint(
+     HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+ print('Loading Encodec...')
+ encodec_model = EncodecModel.encodec_model_24khz()
+ encodec_model.set_target_bandwidth(6.0)
+ encodec_model.to(device)
+
+ print('Downloaded and loaded models!')
notebook.ipynb ADDED
@@ -0,0 +1,180 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Bark text-to-speech voice cloning.\n",
+ "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Install packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%pip install -r requirements.txt\n",
+ "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading HuBERT...\n",
+ "Loading Quantizer...\n",
+ "Loading Encodec...\n",
+ "Downloaded and loaded models!\n"
+ ]
+ }
+ ],
+ "source": [
+ "large_quant_model = False # Use the larger pretrained model\n",
+ "device = 'cuda' # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torchaudio\n",
+ "from encodec import EncodecModel\n",
+ "from encodec.utils import convert_audio\n",
+ "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+ "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+ "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+ "\n",
+ "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+ "\n",
+ "print('Loading HuBERT...')\n",
+ "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+ "print('Loading Quantizer...')\n",
+ "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+ "print('Loading Encodec...')\n",
+ "encodec_model = EncodecModel.encodec_model_24khz()\n",
+ "encodec_model.set_target_bandwidth(6.0)\n",
+ "encodec_model.to(device)\n",
+ "\n",
+ "print('Downloaded and loaded models!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load wav and create speaker history prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Extracting semantics...\n",
+ "Tokenizing semantics...\n",
+ "Creating coarse and fine prompts...\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "wav_file = 'speaker.wav' # Put the path of the speaker you want to use here.\n",
+ "out_file = 'speaker.npz' # Put the path to save the cloned speaker to here.\n",
+ "\n",
+ "wav, sr = torchaudio.load(wav_file)\n",
+ "\n",
+ "wav_hubert = wav.to(device)\n",
+ "\n",
+ "if wav_hubert.shape[0] == 2: # Stereo to mono if needed\n",
+ " wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+ "\n",
+ "print('Extracting semantics...')\n",
+ "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+ "print('Tokenizing semantics...')\n",
+ "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+ "print('Creating coarse and fine prompts...')\n",
+ "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+ "\n",
+ "wav = wav.to(device)\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " encoded_frames = encodec_model.encode(wav)\n",
+ "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+ "\n",
+ "codes = codes.cpu()\n",
+ "semantic_tokens = semantic_tokens.cpu()\n",
+ "\n",
+ "np.savez(out_file,\n",
+ " semantic_prompt=semantic_tokens,\n",
+ " fine_prompt=codes,\n",
+ " coarse_prompt=codes[:2, :]\n",
+ " )\n",
+ "\n",
+ "print('Done!')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
predict.py ADDED
@@ -0,0 +1,87 @@
+ from typing import Optional
+ from cog import BasePredictor, Input, Path, BaseModel
+
+
+ class ModelOutput(BaseModel):
+     prompt_npz: Optional[Path]
+     audio_out: Path
+
+
+ class Predictor(BasePredictor):
+
+     def setup(self):
+         """Load the model into memory to make running multiple predictions efficient"""
+
+     def predict(
+         self,
+         speaker: Path = Input(
+             description="Reference audio.", default=None),
+     ) -> ModelOutput:
+         """Run a single prediction on the model"""
+         # SETUP
+         large_quant_model = False  # Use the larger pretrained model
+         device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+         import numpy as np
+         import torch
+         import torchaudio
+         from encodec import EncodecModel
+         from encodec.utils import convert_audio
+         from bark_hubert_quantizer.hubert_manager import HuBERTManager
+         from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+         from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+         model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+             'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+         print('Loading HuBERT...')
+         hubert_model = CustomHubert(
+             HuBERTManager.make_sure_hubert_installed(), device=device)
+         print('Loading Quantizer...')
+         quant_model = CustomTokenizer.load_from_checkpoint(
+             HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+         print('Loading Encodec...')
+         encodec_model = EncodecModel.encodec_model_24khz()
+         encodec_model.set_target_bandwidth(6.0)
+         encodec_model.to(device)
+
+         print('Downloaded and loaded models!')
+         # PREDICT
+         # Put the path of the speaker you want to use here.
+         wav_file = speaker
+         # Put the path to save the cloned speaker to here.
+         out_file = 'speaker.npz'
+
+         wav, sr = torchaudio.load(wav_file)
+
+         wav_hubert = wav.to(device)
+
+         if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
+             wav_hubert = wav_hubert.mean(0, keepdim=True)
+
+         print('Extracting semantics...')
+         semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
+         print('Tokenizing semantics...')
+         semantic_tokens = quant_model.get_token(semantic_vectors)
+         print('Creating coarse and fine prompts...')
+         wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
+
+         wav = wav.to(device)
+
+         with torch.no_grad():
+             encoded_frames = encodec_model.encode(wav)
+         codes = torch.cat([encoded[0]
+                            for encoded in encoded_frames], dim=-1).squeeze()
+
+         codes = codes.cpu()
+         semantic_tokens = semantic_tokens.cpu()
+
+         np.savez(out_file,
+                  semantic_prompt=semantic_tokens,
+                  fine_prompt=codes,
+                  coarse_prompt=codes[:2, :]
+                  )
+
+         print('Done!')
+
+         return ModelOutput(audio_out=Path('speaker.npz'))
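Outside of a Cog container, the predictor class can plausibly be exercised directly from Python; a sketch under that assumption (requires a CUDA device, since the device is hardcoded above, and uses the sample file from this commit):

```python
# Sketch: run the predictor locally (assumes CUDA; direct instantiation is an assumption,
# not the documented Cog entry point).
from cog import Path

from predict import Predictor

predictor = Predictor()
predictor.setup()
output = predictor.predict(speaker=Path('sample-speaker.wav'))
print(output.audio_out)  # speaker.npz
```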
prepare.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import shutil
+ import zipfile
+
+ import numpy
+ import torch
+ import torchaudio
+
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def prepare(path):
+     """
+     Put all the training data in one folder
+     :param path: The path to the training data, with 2 subdirectories with zips, "semantic" and "wav", with equal pairs in both directories
+     """
+     path = os.path.abspath(path)
+     raw_data_paths = {
+         'semantic': os.path.join(path, 'semantic'),
+         'wav': os.path.join(path, 'wav')
+     }
+     prepared_path = os.path.join(path, 'prepared')
+
+     if not os.path.isdir(prepared_path):
+         os.mkdir(prepared_path)
+
+     offset = 0
+
+     for zip_file in os.listdir(raw_data_paths['semantic']):
+         print(f'Extracting {os.path.basename(zip_file)}')
+         offset = extract_files({
+             'semantic': os.path.join(raw_data_paths['semantic'], zip_file),
+             'wav': os.path.join(raw_data_paths['wav'], zip_file)
+         }, prepared_path, offset)
+
+
+ def extract_files(zip_files: dict[str, str], out: str, start_offset: int = 0) -> int:
+     new_offset = start_offset
+     with zipfile.ZipFile(zip_files['semantic'], 'r') as semantic_zip:
+         with zipfile.ZipFile(zip_files['wav'], 'r') as wav_zip:
+             for file in semantic_zip.infolist():
+                 for file2 in wav_zip.infolist():
+                     # Match pairs by basename (case-insensitive), ignoring the extension
+                     if ''.join(file.filename.split('.')[:-1]).lower() == ''.join(file2.filename.split('.')[:-1]).lower():
+                         semantic_zip.extract(file, out)
+                         shutil.move(os.path.join(out, file.filename), os.path.join(out, f'{new_offset}_semantic.npy'))
+                         wav_zip.extract(file2, out)
+                         shutil.move(os.path.join(out, file2.filename), os.path.join(out, f'{new_offset}_wav.wav'))
+                         new_offset += 1
+
+     return new_offset
+
+
+ def prepare2(path, model):
+     prepared = os.path.join(path, 'prepared')
+     ready = os.path.join(path, 'ready')
+     hubert_model = CustomHubert(checkpoint_path=model, device=device)
+     if not os.path.isdir(ready):
+         os.mkdir(ready)
+
+     wav_string = '_wav.wav'
+     sem_string = '_semantic.npy'
+
+     for input_file in os.listdir(prepared):
+         input_path = os.path.join(prepared, input_file)
+         if input_file.endswith(wav_string):
+             file_num = int(input_file[:-len(wav_string)])
+             fname = f'{file_num}_semantic_features.npy'
+             print('Processing', input_file)
+             if os.path.isfile(os.path.join(ready, fname)):  # Skip files that were already processed
+                 continue
+             wav, sr = torchaudio.load(input_path)
+             wav = wav.to(device)
+
+             if wav.shape[0] == 2:  # Stereo to mono if needed
+                 wav = wav.mean(0, keepdim=True)
+
+             output = hubert_model.forward(wav, input_sample_hz=sr)
+             out_array = output.cpu().numpy()
+             numpy.save(os.path.join(ready, fname), out_array)
+         elif input_file.endswith(sem_string):
+             fname = os.path.join(ready, input_file)
+             if os.path.isfile(fname):
+                 continue
+             shutil.copy(input_path, fname)
+     print('All set! We\'re ready to train!')
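The two functions above are normally chained via `process.py`; a sketch of calling them directly (the folder name and checkpoint path are illustrative):

```python
# Sketch: run both preparation stages by hand instead of via process.py.
from prepare import prepare, prepare2

prepare('Literature')  # unzip matched semantic/wav pairs into Literature/prepared/
prepare2('Literature', 'data/models/hubert/hubert_base_ls960.pt')  # write features into Literature/ready/
```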
process.py ADDED
@@ -0,0 +1,22 @@
+ import os.path
+
+ from args import args
+ from prepare import prepare, prepare2
+ from test_hubert import test_hubert
+ from bark_hubert_quantizer.customtokenizer import auto_train
+
+ path = args.path
+ mode = args.mode
+ model = args.hubert_model
+
+ if mode == 'prepare':
+     prepare(path)
+
+ elif mode == 'prepare2':
+     prepare2(path, model)
+
+ elif mode == 'train':
+     auto_train(path, load_model=os.path.join(path, 'model.pth'), save_epochs=args.train_save_epochs)
+
+ elif mode == 'test':
+     test_hubert(path, model)
readme.md ADDED
@@ -0,0 +1,108 @@
+ # Bark voice cloning
+
+ ## Please read
+ This code works on Python 3.10; I have not tested it on other versions. Some older versions will have issues.
+
+ ## Voice cloning with bark in high quality?
+ It's possible now.
+
+ https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/assets/36931363/516375e2-d699-44fe-a928-cd0411982049
+
+ ## How do I clone a voice?
+ For developers:
+ * [code examples on the huggingface model page](https://huggingface.co/GitMylo/bark-voice-cloning)
+
+ For everyone:
+ * [audio-webui with bark and voice cloning](https://github.com/gitmylo/audio-webui)
+ * [online huggingface voice cloning space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning)
+ * [interactive python notebook](notebook.ipynb)
+
+ ## Cloned voices aren't very convincing; why are other people's cloned voices better than mine?
+ Make sure these things are **NOT** in your voice input: (in no particular order)
+ * Noise (you can run a noise remover first)
+ * Music (there are also music remover tools, unless you want music in the background)
+ * A cut-off at the end (this will cause the generation to try to continue the audio)
+ * Under 1 second of training data (I personally suggest around 10 seconds for good potential, but I've had great results with 5 seconds as well)
+
+ What makes for good prompt audio? (in no particular order)
+ * Clearly spoken
+ * No weird background noises
+ * Only one speaker
+ * Audio which ends after a sentence ends
+ * A regular/common voice (these usually have more success; the model can still clone complex voices, just not as well)
+ * Around 10 seconds of data
+
+ ## Pretrained models
+ ### Official
+
+ | Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+ |------|--------------|-------------------|-------|----------|---------|
+ | [quantifier_hubert_base_ls960.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 3 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+ | [quantifier_hubert_base_ls960_14.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960_14.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 14 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+ | [quantifier_V1_hubert_base_ls960_23.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_V1_hubert_base_ls960_23.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 23 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+
+ ### Community
+
+ | Author | Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+ |--------|------|--------------|-------------------|-------|----------|---------|
+ | [HobisPL](https://github.com/HobisPL) | [polish-HuBERT-quantizer_8_epoch.pth](https://huggingface.co/Hobis/bark-voice-cloning-polish-HuBERT-quantizer/blob/main/polish-HuBERT-quantizer_8_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 8 | POL | [Hobis/bark-polish-semantic-wav-training](https://huggingface.co/datasets/Hobis/bark-polish-semantic-wav-training) |
+ | [C0untFloyd](https://github.com/C0untFloyd) | [german-HuBERT-quantizer_14_epoch.pth](https://huggingface.co/CountFloyd/bark-voice-cloning-german-HuBERT-quantizer/blob/main/german-HuBERT-quantizer_14_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 14 | GER | [CountFloyd/bark-german-semantic-wav-training](https://huggingface.co/datasets/CountFloyd/bark-german-semantic-wav-training) |
+
+ ## For developers: Implementing voice cloning in your bark projects
+ * Simply copy the files from [this directory](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/tree/master/bark_hubert_quantizer) into your project.
+ * The [hubert manager](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/hubert_manager.py) contains methods to download HuBERT and the custom Quantizer model.
+ * Loading the [CustomHuBERT](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/pre_kmeans_hubert.py) should be pretty straightforward.
+ * The [notebook](notebook.ipynb) contains code that runs on CUDA or CPU, instead of just CPU.
+ ```python
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+ import torchaudio
+
+ # Load the HuBERT model;
+ # checkpoint_path should work fine with data/models/hubert/hubert.pt for the default config
+ hubert_model = CustomHubert(checkpoint_path='path/to/checkpoint')
+
+ # Run the model to extract semantic features from an audio file, where wav is your audio file
+ wav, sr = torchaudio.load('path/to/wav')  # This is where you load your wav, with soundfile or torchaudio for example
+
+ if wav.shape[0] == 2:  # Stereo to mono if needed
+     wav = wav.mean(0, keepdim=True)
+
+ semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
+ ```
+ * Loading and running the [custom tokenizer (the kmeans replacement)](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer)
+
+ ```python
+ import torch
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+ # Load the CustomTokenizer model from a checkpoint
+ # With the default config, you can use the pretrained model from huggingface
+ # With the default setup from HuBERTManager, this will be in data/models/hubert/tokenizer.pth
+ tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')  # Automatically uses the right layers
+
+ # Process the semantic vectors from the previous HuBERT run (this works in batches, so you can send the entire HuBERT output)
+ semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+ # Congratulations! You now have semantic tokens which can be used inside of a speaker prompt file.
+ ```
+
+ ## How do I train it myself?
+ Simply run the training commands below.
+
+ A simple way to create semantic data and wavs for training is with my script: [bark-data-gen](https://github.com/gitmylo/bark-data-gen). Keep in mind that generating the wavs takes around the same time as, if not longer than, generating the semantics, so this step can take a while.
+
+ For example, suppose you have a dataset of zips containing audio files, one zip for the semantics and one for the wav files, inside a folder called "Literature".
+
+ Run `python process.py --path Literature --mode prepare` to extract all the data into one directory.
+
+ Run `python process.py --path Literature --mode prepare2` to create the HuBERT semantic vectors, ready for training.
+
+ Run `python process.py --path Literature --mode train` to train.
+
+ When your model has trained enough, run `python process.py --path Literature --mode test` to test the latest model.
+
+ ## Disclaimer
+ I am not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.
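For readers following the training steps above, a small sanity-check sketch for the `ready/` folder before running `--mode train` (the "Literature" folder name is illustrative):

```python
# Sketch: check that every feature file in ready/ has a matching label file.
import os

ready = os.path.join('Literature', 'ready')
features = {f.split('_')[0] for f in os.listdir(ready) if f.endswith('_semantic_features.npy')}
labels = {f.split('_')[0] for f in os.listdir(ready) if f.endswith('_semantic.npy')}
print('unmatched keys:', features ^ labels)  # an empty set means the training data lines up
```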
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ audiolm-pytorch==1.1.4
+ fairseq
+ huggingface-hub
+ sentencepiece
+ transformers
+ encodec
+ soundfile; platform_system == "Windows"
+ sox; platform_system != "Windows"
sample-speaker.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7c59faa843a892cb35733b5bdad5a6bd3eebadf70494d48694a06c2fefbad6
+ size 1324090
setup.py ADDED
@@ -0,0 +1,17 @@
+ from setuptools import setup
+
+ setup(
+     name='bark_hubert_quantizer',
+     version='0.0.4',
+     packages=['bark_hubert_quantizer'],
+     install_requires=[
+         'audiolm-pytorch==1.1.4',
+         'fairseq',
+         'huggingface-hub',
+         'sentencepiece',
+         'transformers',
+         'encodec',
+         'soundfile; platform_system == "Windows"',
+         'sox; platform_system != "Windows"'
+     ],
+ )
test_hubert.py ADDED
@@ -0,0 +1,23 @@
+ import os
+
+ import numpy
+ import torch
+ import torchaudio
+
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+
+ def test_hubert(path: str, model: str = 'model/hubert/hubert_base_ls960.pt', tokenizer: str = 'model.pth'):
+     hubert_model = CustomHubert(checkpoint_path=model)
+     customtokenizer = CustomTokenizer.load_from_checkpoint(os.path.join(path, tokenizer))
+
+     wav, sr = torchaudio.load(os.path.join(path, 'test', 'wav.wav'))
+     original = numpy.load(os.path.join(path, 'test', 'semantic.npy'))
+
+     out = hubert_model.forward(wav, input_sample_hz=sr)
+     out_tokenized = customtokenizer.get_token(out)
+
+     # Compare the ground-truth tokens against the tokenizer's output
+     print(original[:-1], out_tokenized)
+     numpy.save(os.path.join(path, 'test', 'gen_semantic.npy'), out_tokenized)