jameshuntercarter committed on
Commit
b5dba8a
1 Parent(s): ed39a03

Upload 24 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sample-speaker.wav filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Mylo
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
args.py ADDED
@@ -0,0 +1,10 @@
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser()
+
+ parser.add_argument('--path', required=True, help='The path containing your semantic tokens and wavs')
+ parser.add_argument('--mode', required=True, help='The mode to use', choices=['prepare', 'prepare2', 'train', 'test'])
+ parser.add_argument('--hubert-model', default='model/hubert/hubert_base_ls960.pt', help='The HuBERT model to use for preparing the data and later creation of semantic tokens.')
+ parser.add_argument('--train-save-epochs', default=1, type=int, help='The number of epochs to train before saving')
+
+ args = parser.parse_args()
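Note: these flags are consumed by `process.py`, added later in this commit. A minimal sketch of how a command line maps onto `args` (the folder name "Literature" is illustrative, not part of this commit):

```python
# Sketch: simulating `python process.py --path Literature --mode train`.
import sys

sys.argv[1:] = ['--path', 'Literature', '--mode', 'train']

from args import args  # parsing happens at import time

assert args.path == 'Literature'
assert args.mode == 'train'
assert args.train_save_epochs == 1  # default value
```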
bark_hubert_quantizer/__init__.py ADDED
File without changes
bark_hubert_quantizer/customtokenizer.py ADDED
@@ -0,0 +1,200 @@
+ """
+ Custom tokenizer model.
+ Author: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ import json
+ import os.path
+ from zipfile import ZipFile
+
+ import numpy
+ import torch
+ from torch import nn, optim
+ from torch.serialization import MAP_LOCATION
+
+
+ class CustomTokenizer(nn.Module):
+     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+         super(CustomTokenizer, self).__init__()
+         next_size = input_size
+         if version == 0:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             next_size = hidden_size
+         if version == 1:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             self.intermediate = nn.Linear(hidden_size, 4096)
+             next_size = 4096
+
+         self.fc = nn.Linear(next_size, output_size)
+         self.softmax = nn.LogSoftmax(dim=1)
+         self.optimizer: optim.Optimizer = None
+         self.lossfunc = nn.CrossEntropyLoss()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     def forward(self, x):
+         x, _ = self.lstm(x)
+         if self.version == 1:
+             x = self.intermediate(x)
+         x = self.fc(x)
+         x = self.softmax(x)
+         return x
+
+     @torch.no_grad()
+     def get_token(self, x):
+         """
+         Used to get the tokens for a batch of semantic feature vectors.
+         :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+         :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+         """
+         return torch.argmax(self(x), dim=1)
+
+     def prepare_training(self):
+         self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+     def train_step(self, x_train, y_train, log_loss=False):
+         optimizer = self.optimizer
+         lossfunc = self.lossfunc
+         # Zero the gradients
+         self.zero_grad()
+
+         # Forward pass
+         y_pred = self(x_train)
+
+         # The features and the labels can differ slightly in length; trim
+         # whichever side is longer so the loss is computed over aligned positions.
+         y_train_len = len(y_train)
+         y_pred_len = y_pred.shape[0]
+
+         if y_train_len > y_pred_len:
+             diff = y_train_len - y_pred_len
+             y_train = y_train[diff:]
+         elif y_train_len < y_pred_len:
+             diff = y_pred_len - y_train_len
+             y_pred = y_pred[:-diff, :]
+
+         # One-hot encode the target tokens, on the same device as the predictions
+         y_train_hot = torch.zeros(len(y_train), self.output_size)
+         y_train_hot[range(len(y_train)), y_train] = 1
+         y_train_hot = y_train_hot.to(y_pred.device)
+
+         # Calculate the loss
+         loss = lossfunc(y_pred, y_train_hot)
+
+         # Print loss
+         if log_loss:
+             print('Loss', loss.item())
+
+         # Backward pass
+         loss.backward()
+
+         # Update the weights
+         optimizer.step()
+
+     def save(self, path):
+         info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
+         torch.save(self.state_dict(), path)
+         data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+         # Append the hyperparameters to the checkpoint zip as a .info entry
+         with ZipFile(path, 'a') as model_zip:
+             model_zip.writestr(info_path, data_from_model.save())
+
+     @staticmethod
+     def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
+         old = True
+         with ZipFile(path) as model_zip:
+             files_match = [file for file in model_zip.namelist() if file.endswith('/.info')]
+             file = files_match[0] if files_match else None
+             if file:
+                 old = False
+                 data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
+         if old:
+             model = CustomTokenizer()
+         else:
+             model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
+         model.load_state_dict(torch.load(path, map_location=map_location))
+         if map_location:
+             model = model.to(map_location)
+         return model
+
+
+ class Data:
+     input_size: int
+     hidden_size: int
+     output_size: int
+     version: int
+
+     def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     @staticmethod
+     def load(string):
+         data = json.loads(string)
+         return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
+
+     def save(self):
+         data = {
+             'input_size': self.input_size,
+             'hidden_size': self.hidden_size,
+             'output_size': self.output_size,
+             'version': self.version,
+         }
+         return json.dumps(data)
+
+
+ def auto_train(data_path, save_path='model.pth', load_model: str | None = None, save_epochs=1):
+     data_x, data_y = {}, {}
+
+     if load_model and os.path.isfile(load_model):
+         print('Loading model from', load_model)
+         model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
+     else:
+         print('Creating new model.')
+         model_training = CustomTokenizer(version=1).to('cuda')
+     save_path = os.path.join(data_path, save_path)
+     base_save_path = '.'.join(save_path.split('.')[:-1])
+
+     sem_string = '_semantic.npy'
+     feat_string = '_semantic_features.npy'
+
+     ready = os.path.join(data_path, 'ready')
+     for input_file in os.listdir(ready):
+         full_path = os.path.join(ready, input_file)
+         # Files are named '{N}_semantic.npy' and '{N}_semantic_features.npy'
+         number = int(input_file.split('_')[0])
+         if input_file.endswith(sem_string):
+             data_y[number] = numpy.load(full_path)
+         elif input_file.endswith(feat_string):
+             data_x[number] = numpy.load(full_path)
+
+     model_training.prepare_training()
+     epoch = 1
+
+     while True:
+         for _ in range(save_epochs):
+             j = 0
+             for i in range(max(len(data_x), len(data_y))):
+                 x = data_x.get(i)
+                 y = data_y.get(i)
+                 if x is None or y is None:
+                     print(f'The training data does not match. key={i}')
+                     continue
+                 model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0)  # Print loss every 50 steps
+                 j += 1
+         save_p = save_path
+         save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
+         model_training.save(save_p)
+         model_training.save(save_p_2)
+         print(f'Epoch {epoch} completed')
+         epoch += 1
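A minimal usage sketch for the class above, assuming a checkpoint saved by `CustomTokenizer.save` at the default `HuBERTManager` location; the feature file name follows the `ready/` naming convention used by `auto_train` and is illustrative:

```python
# Sketch: tokenize precomputed HuBERT features with a trained checkpoint.
import numpy
import torch

from bark_hubert_quantizer.customtokenizer import CustomTokenizer

tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', 'cpu')

features = torch.from_numpy(numpy.load('0_semantic_features.npy'))  # (N, input_size); illustrative file
semantic_tokens = tokenizer.get_token(features)  # shape (N,), values in [0, output_size)
```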
bark_hubert_quantizer/hubert_manager.py ADDED
@@ -0,0 +1,33 @@
+ import os.path
+ import shutil
+ import urllib.request
+
+ import huggingface_hub
+
+
+ class HuBERTManager:
+     @staticmethod
+     def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
+         install_dir = os.path.join('data', 'models', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, file_name)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT base model')
+             urllib.request.urlretrieve(download_url, install_file)
+             print('Downloaded HuBERT')
+         return install_file
+
+     @staticmethod
+     def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+         install_dir = os.path.join('data', 'models', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, local_file)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT custom tokenizer')
+             huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
+             shutil.move(os.path.join(install_dir, model), install_file)
+             print('Downloaded tokenizer')
+         return install_file
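Both helpers skip the download when the file already exists, so they can safely run at startup; a sketch (the paths in the comments are the defaults created under `data/models/hubert/`):

```python
# Sketch: make sure both checkpoints exist before loading any models.
from bark_hubert_quantizer.hubert_manager import HuBERTManager

hubert_path = HuBERTManager.make_sure_hubert_installed()        # -> data/models/hubert/hubert.pt
tokenizer_path = HuBERTManager.make_sure_tokenizer_installed()  # -> data/models/hubert/tokenizer.pth
```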
bark_hubert_quantizer/pre_kmeans_hubert.py ADDED
@@ -0,0 +1,106 @@
+ """
+ Modified HuBERT model without kmeans.
+ Original author: https://github.com/lucidrains/
+ Modified by: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+ from pathlib import Path
+
+ import torch
+ from torch import nn
+ from einops import pack, unpack
+
+ import fairseq
+
+ from torchaudio.functional import resample
+
+ from audiolm_pytorch.utils import curtail_to_multiple
+
+ import logging
+ logging.root.setLevel(logging.ERROR)
+
+
+ def exists(val):
+     return val is not None
+
+
+ def default(val, d):
+     return val if exists(val) else d
+
+
+ class CustomHubert(nn.Module):
+     """
+     checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+     or you can train your own
+     """
+
+     def __init__(
+         self,
+         checkpoint_path,
+         target_sample_hz=16000,
+         seq_len_multiple_of=None,
+         output_layer=9,
+         device=None
+     ):
+         super().__init__()
+         self.target_sample_hz = target_sample_hz
+         self.seq_len_multiple_of = seq_len_multiple_of
+         self.output_layer = output_layer
+
+         if device is not None:
+             self.to(device)
+
+         model_path = Path(checkpoint_path)
+
+         assert model_path.exists(), f'path {checkpoint_path} does not exist'
+
+         checkpoint = torch.load(checkpoint_path, map_location=device)
+         load_model_input = {checkpoint_path: checkpoint}
+         model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+
+         if device is not None:
+             model[0].to(device)
+
+         self.model = model[0]
+         self.model.eval()
+
+     @property
+     def groups(self):
+         return 1
+
+     @torch.no_grad()
+     def forward(
+         self,
+         wav_input,
+         flatten=True,
+         input_sample_hz=None
+     ):
+         device = wav_input.device
+
+         if exists(input_sample_hz):
+             wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+         if exists(self.seq_len_multiple_of):
+             wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+         embed = self.model(
+             wav_input,
+             features_only=True,
+             mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
+             output_layer=self.output_layer
+         )
+
+         embed, packed_shape = pack([embed['x']], '* d')
+
+         # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
+
+         codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
+
+         if flatten:
+             return codebook_indices
+
+         codebook_indices, = unpack(codebook_indices, packed_shape, '*')
+         return codebook_indices
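A short sketch of extracting semantic vectors with the class above, assuming the checkpoint has already been downloaded (for example via `HuBERTManager`, whose default path is used here); `sample-speaker.wav` is the sample file added in this commit:

```python
# Sketch: extract semantic feature vectors from an audio file.
import torchaudio

from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert

hubert = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')

wav, sr = torchaudio.load('sample-speaker.wav')
if wav.shape[0] == 2:  # stereo to mono
    wav = wav.mean(0, keepdim=True)

semantic_vectors = hubert.forward(wav, input_sample_hz=sr)  # (N, 768) features
```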
cog.yaml ADDED
@@ -0,0 +1,20 @@
+ build:
+   gpu: true
+   cuda: "11.8"
+   python_version: "3.10"
+   python_packages:
+     - "audiolm-pytorch==1.1.4"
+     - "fairseq"
+     - "huggingface-hub"
+     - "sentencepiece"
+     - "transformers"
+     - "encodec"
+     - 'soundfile; platform_system == "Windows"'
+     - 'sox; platform_system != "Windows"'
+     - "tensorboardX"
+     - "torch"
+     - "torchvision"
+     - "torchaudio"
+     - "light-the-torch"
+
+ predict: "predict.py:Predictor"
colab_notebook.ipynb ADDED
@@ -0,0 +1,202 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Bark text-to-speech voice cloning.\n",
+ "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).\n",
+ "(This version of the notebook is made to work on Google Colab, make sure your runtime hardware accelerator is set to GPU)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Google Colab: Clone the repository"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/\n",
+ "%cd bark-voice-cloning-HuBERT-quantizer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Install packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%pip install -r requirements.txt\n",
+ "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading HuBERT...\n",
+ "Loading Quantizer...\n",
+ "Loading Encodec...\n",
+ "Downloaded and loaded models!\n"
+ ]
+ }
+ ],
+ "source": [
+ "large_quant_model = False # Use the larger pretrained model\n",
+ "device = 'cuda' # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torchaudio\n",
+ "from encodec import EncodecModel\n",
+ "from encodec.utils import convert_audio\n",
+ "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+ "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+ "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+ "\n",
+ "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+ "\n",
+ "print('Loading HuBERT...')\n",
+ "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+ "print('Loading Quantizer...')\n",
+ "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+ "print('Loading Encodec...')\n",
+ "encodec_model = EncodecModel.encodec_model_24khz()\n",
+ "encodec_model.set_target_bandwidth(6.0)\n",
+ "encodec_model.to(device)\n",
+ "\n",
+ "print('Downloaded and loaded models!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load wav and create speaker history prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Extracting semantics...\n",
+ "Tokenizing semantics...\n",
+ "Creating coarse and fine prompts...\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "wav_file = 'speaker.wav' # Put the path of the speaker you want to use here.\n",
+ "out_file = 'speaker.npz' # Put the path to save the cloned speaker to here.\n",
+ "\n",
+ "wav, sr = torchaudio.load(wav_file)\n",
+ "\n",
+ "wav_hubert = wav.to(device)\n",
+ "\n",
+ "if wav_hubert.shape[0] == 2: # Stereo to mono if needed\n",
+ " wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+ "\n",
+ "print('Extracting semantics...')\n",
+ "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+ "print('Tokenizing semantics...')\n",
+ "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+ "print('Creating coarse and fine prompts...')\n",
+ "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+ "\n",
+ "wav = wav.to(device)\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " encoded_frames = encodec_model.encode(wav)\n",
+ "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+ "\n",
+ "codes = codes.cpu()\n",
+ "semantic_tokens = semantic_tokens.cpu()\n",
+ "\n",
+ "np.savez(out_file,\n",
+ " semantic_prompt=semantic_tokens,\n",
+ " fine_prompt=codes,\n",
+ " coarse_prompt=codes[:2, :]\n",
+ " )\n",
+ "\n",
+ "print('Done!')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
data/.DS_Store ADDED
Binary file (6.15 kB)
 
data/models/.DS_Store ADDED
Binary file (6.15 kB)
 
data/models/hubert/hubert_base_ls960.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1703cf8d2cdc76f8c046f5f6a9bcd224e0e6caf4744cad1a1f4199c32cac8c8d
+ size 1136468879
data/models/hubert/quantifier_V1_hubert_base_ls960_23.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d94c5dd646bcfe1a8bb470372f0004c189acf65d913831f3a6ed6414c9ba86f
+ size 243656111
data/models/hubert/quantifier_hubert_base_ls960_14.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cf7eeab58835c5fc1cfbd3fd19c457fbd07859a5f036a6bfea4b6840716c1e7
+ size 103981977
examples/biden_example.mov ADDED
Binary file (73.7 kB)
 
install_hubert.py ADDED
@@ -0,0 +1,28 @@
+ # SETUP
+ large_quant_model = False  # Use the larger pretrained model
+ device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+ import numpy as np
+ import torch
+ import torchaudio
+ from encodec import EncodecModel
+ from encodec.utils import convert_audio
+ from bark_hubert_quantizer.hubert_manager import HuBERTManager
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+ model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+     'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+ print('Loading HuBERT...')
+ hubert_model = CustomHubert(
+     HuBERTManager.make_sure_hubert_installed(), device=device)
+ print('Loading Quantizer...')
+ quant_model = CustomTokenizer.load_from_checkpoint(
+     HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+ print('Loading Encodec...')
+ encodec_model = EncodecModel.encodec_model_24khz()
+ encodec_model.set_target_bandwidth(6.0)
+ encodec_model.to(device)
+
+ print('Downloaded and loaded models!')
notebook.ipynb ADDED
@@ -0,0 +1,180 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Bark text-to-speech voice cloning.\n",
+ "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Install packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%pip install -r requirements.txt\n",
+ "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading HuBERT...\n",
+ "Loading Quantizer...\n",
+ "Loading Encodec...\n",
+ "Downloaded and loaded models!\n"
+ ]
+ }
+ ],
+ "source": [
+ "large_quant_model = False # Use the larger pretrained model\n",
+ "device = 'cuda' # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torchaudio\n",
+ "from encodec import EncodecModel\n",
+ "from encodec.utils import convert_audio\n",
+ "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+ "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+ "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+ "\n",
+ "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+ "\n",
+ "print('Loading HuBERT...')\n",
+ "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+ "print('Loading Quantizer...')\n",
+ "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+ "print('Loading Encodec...')\n",
+ "encodec_model = EncodecModel.encodec_model_24khz()\n",
+ "encodec_model.set_target_bandwidth(6.0)\n",
+ "encodec_model.to(device)\n",
+ "\n",
+ "print('Downloaded and loaded models!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## Load wav and create speaker history prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Extracting semantics...\n",
+ "Tokenizing semantics...\n",
+ "Creating coarse and fine prompts...\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "wav_file = 'speaker.wav' # Put the path of the speaker you want to use here.\n",
+ "out_file = 'speaker.npz' # Put the path to save the cloned speaker to here.\n",
+ "\n",
+ "wav, sr = torchaudio.load(wav_file)\n",
+ "\n",
+ "wav_hubert = wav.to(device)\n",
+ "\n",
+ "if wav_hubert.shape[0] == 2: # Stereo to mono if needed\n",
+ " wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+ "\n",
+ "print('Extracting semantics...')\n",
+ "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+ "print('Tokenizing semantics...')\n",
+ "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+ "print('Creating coarse and fine prompts...')\n",
+ "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+ "\n",
+ "wav = wav.to(device)\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " encoded_frames = encodec_model.encode(wav)\n",
+ "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+ "\n",
+ "codes = codes.cpu()\n",
+ "semantic_tokens = semantic_tokens.cpu()\n",
+ "\n",
+ "np.savez(out_file,\n",
+ " semantic_prompt=semantic_tokens,\n",
+ " fine_prompt=codes,\n",
+ " coarse_prompt=codes[:2, :]\n",
+ " )\n",
+ "\n",
+ "print('Done!')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
predict.py ADDED
@@ -0,0 +1,87 @@
+ from typing import Optional
+ from cog import BasePredictor, Input, Path, BaseModel
+
+
+ class ModelOutput(BaseModel):
+     prompt_npz: Optional[Path]
+     audio_out: Path
+
+
+ class Predictor(BasePredictor):
+
+     def setup(self):
+         """Load the model into memory to make running multiple predictions efficient"""
+
+     def predict(
+         self,
+         speaker: Path = Input(
+             description="Reference audio.", default=None),
+     ) -> ModelOutput:
+         """Run a single prediction on the model"""
+         # SETUP
+         large_quant_model = False  # Use the larger pretrained model
+         device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+         import numpy as np
+         import torch
+         import torchaudio
+         from encodec import EncodecModel
+         from encodec.utils import convert_audio
+         from bark_hubert_quantizer.hubert_manager import HuBERTManager
+         from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+         from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+         model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+             'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+         print('Loading HuBERT...')
+         hubert_model = CustomHubert(
+             HuBERTManager.make_sure_hubert_installed(), device=device)
+         print('Loading Quantizer...')
+         quant_model = CustomTokenizer.load_from_checkpoint(
+             HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+         print('Loading Encodec...')
+         encodec_model = EncodecModel.encodec_model_24khz()
+         encodec_model.set_target_bandwidth(6.0)
+         encodec_model.to(device)
+
+         print('Downloaded and loaded models!')
+         # PREDICT
+         # Put the path of the speaker you want to use here.
+         wav_file = speaker
+         # Put the path to save the cloned speaker to here.
+         out_file = 'speaker.npz'
+
+         wav, sr = torchaudio.load(wav_file)
+
+         wav_hubert = wav.to(device)
+
+         if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
+             wav_hubert = wav_hubert.mean(0, keepdim=True)
+
+         print('Extracting semantics...')
+         semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
+         print('Tokenizing semantics...')
+         semantic_tokens = quant_model.get_token(semantic_vectors)
+         print('Creating coarse and fine prompts...')
+         wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
+
+         wav = wav.to(device)
+
+         with torch.no_grad():
+             encoded_frames = encodec_model.encode(wav)
+         codes = torch.cat([encoded[0]
+                            for encoded in encoded_frames], dim=-1).squeeze()
+
+         codes = codes.cpu()
+         semantic_tokens = semantic_tokens.cpu()
+
+         np.savez(out_file,
+                  semantic_prompt=semantic_tokens,
+                  fine_prompt=codes,
+                  coarse_prompt=codes[:2, :]
+                  )
+
+         print('Done!')
+
+         return ModelOutput(audio_out=Path('speaker.npz'))
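Outside of a Cog container, the predictor class can plausibly be exercised directly from Python; a sketch under that assumption (requires a CUDA device, since the device is hardcoded above, and uses the sample file from this commit):

```python
# Sketch: run the predictor locally (assumes CUDA; direct instantiation is an assumption,
# not the documented Cog entry point).
from cog import Path

from predict import Predictor

predictor = Predictor()
predictor.setup()
output = predictor.predict(speaker=Path('sample-speaker.wav'))
print(output.audio_out)  # speaker.npz
```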
prepare.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import shutil
+ import zipfile
+
+ import numpy
+ import torch
+ import torchaudio
+
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def prepare(path):
+     """
+     Put all the training data in one folder
+     :param path: The path to the training data, with 2 subdirectories with zips, "semantic" and "wav", with equal pairs in both directories
+     """
+     path = os.path.abspath(path)
+     raw_data_paths = {
+         'semantic': os.path.join(path, 'semantic'),
+         'wav': os.path.join(path, 'wav')
+     }
+     prepared_path = os.path.join(path, 'prepared')
+
+     if not os.path.isdir(prepared_path):
+         os.mkdir(prepared_path)
+
+     offset = 0
+
+     for zip_file in os.listdir(raw_data_paths['semantic']):
+         print(f'Extracting {os.path.basename(zip_file)}')
+         offset = extract_files({
+             'semantic': os.path.join(raw_data_paths['semantic'], zip_file),
+             'wav': os.path.join(raw_data_paths['wav'], zip_file)
+         }, prepared_path, offset)
+
+
+ def extract_files(zip_files: dict[str, str], out: str, start_offset: int = 0) -> int:
+     new_offset = start_offset
+     with zipfile.ZipFile(zip_files['semantic'], 'r') as semantic_zip:
+         with zipfile.ZipFile(zip_files['wav'], 'r') as wav_zip:
+             for file in semantic_zip.infolist():
+                 for file2 in wav_zip.infolist():
+                     # Match pairs by basename (case-insensitive), ignoring the extension
+                     if ''.join(file.filename.split('.')[:-1]).lower() == ''.join(file2.filename.split('.')[:-1]).lower():
+                         semantic_zip.extract(file, out)
+                         shutil.move(os.path.join(out, file.filename), os.path.join(out, f'{new_offset}_semantic.npy'))
+                         wav_zip.extract(file2, out)
+                         shutil.move(os.path.join(out, file2.filename), os.path.join(out, f'{new_offset}_wav.wav'))
+                         new_offset += 1
+
+     return new_offset
+
+
+ def prepare2(path, model):
+     prepared = os.path.join(path, 'prepared')
+     ready = os.path.join(path, 'ready')
+     hubert_model = CustomHubert(checkpoint_path=model, device=device)
+     if not os.path.isdir(ready):
+         os.mkdir(ready)
+
+     wav_string = '_wav.wav'
+     sem_string = '_semantic.npy'
+
+     for input_file in os.listdir(prepared):
+         input_path = os.path.join(prepared, input_file)
+         if input_file.endswith(wav_string):
+             file_num = int(input_file[:-len(wav_string)])
+             fname = f'{file_num}_semantic_features.npy'
+             print('Processing', input_file)
+             if os.path.isfile(os.path.join(ready, fname)):  # Skip files that were already processed
+                 continue
+             wav, sr = torchaudio.load(input_path)
+             wav = wav.to(device)
+
+             if wav.shape[0] == 2:  # Stereo to mono if needed
+                 wav = wav.mean(0, keepdim=True)
+
+             output = hubert_model.forward(wav, input_sample_hz=sr)
+             out_array = output.cpu().numpy()
+             numpy.save(os.path.join(ready, fname), out_array)
+         elif input_file.endswith(sem_string):
+             fname = os.path.join(ready, input_file)
+             if os.path.isfile(fname):
+                 continue
+             shutil.copy(input_path, fname)
+     print('All set! We\'re ready to train!')
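The two functions above are normally chained via `process.py`; a sketch of calling them directly (the folder name and checkpoint path are illustrative):

```python
# Sketch: run both preparation stages by hand instead of via process.py.
from prepare import prepare, prepare2

prepare('Literature')  # unzip matched semantic/wav pairs into Literature/prepared/
prepare2('Literature', 'data/models/hubert/hubert_base_ls960.pt')  # write features into Literature/ready/
```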
process.py ADDED
@@ -0,0 +1,22 @@
+ import os.path
+
+ from args import args
+ from prepare import prepare, prepare2
+ from test_hubert import test_hubert
+ from bark_hubert_quantizer.customtokenizer import auto_train
+
+ path = args.path
+ mode = args.mode
+ model = args.hubert_model
+
+ if mode == 'prepare':
+     prepare(path)
+
+ elif mode == 'prepare2':
+     prepare2(path, model)
+
+ elif mode == 'train':
+     auto_train(path, load_model=os.path.join(path, 'model.pth'), save_epochs=args.train_save_epochs)
+
+ elif mode == 'test':
+     test_hubert(path, model)
readme.md ADDED
@@ -0,0 +1,108 @@
+ # Bark voice cloning
+
+ ## Please read
+ This code works on Python 3.10; I have not tested it on other versions. Some older versions will have issues.
+
+ ## Voice cloning with bark in high quality?
+ It's possible now.
+
+ https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/assets/36931363/516375e2-d699-44fe-a928-cd0411982049
+
+ ## How do I clone a voice?
+ For developers:
+ * [code examples on the huggingface model page](https://huggingface.co/GitMylo/bark-voice-cloning)
+
+ For everyone:
+ * [audio-webui with bark and voice cloning](https://github.com/gitmylo/audio-webui)
+ * [online huggingface voice cloning space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning)
+ * [interactive python notebook](notebook.ipynb)
+
+ ## Cloned voices aren't very convincing; why are other people's cloned voices better than mine?
+ Make sure these things are **NOT** in your voice input: (in no particular order)
+ * Noise (you can run a noise remover first)
+ * Music (there are also music remover tools, unless you want music in the background)
+ * A cut-off at the end (this will cause the generation to try to continue the audio)
+ * Under 1 second of training data (I personally suggest around 10 seconds for good potential, but I've had great results with 5 seconds as well)
+
+ What makes for good prompt audio? (in no particular order)
+ * Clearly spoken
+ * No weird background noises
+ * Only one speaker
+ * Audio which ends after a sentence ends
+ * A regular/common voice (these usually have more success; the model can still clone complex voices, just not as well)
+ * Around 10 seconds of data
+
+ ## Pretrained models
+ ### Official
+
+ | Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+ |------|--------------|-------------------|-------|----------|---------|
+ | [quantifier_hubert_base_ls960.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 3 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+ | [quantifier_hubert_base_ls960_14.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960_14.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 14 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+ | [quantifier_V1_hubert_base_ls960_23.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_V1_hubert_base_ls960_23.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 23 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+
+ ### Community
+
+ | Author | Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+ |--------|------|--------------|-------------------|-------|----------|---------|
+ | [HobisPL](https://github.com/HobisPL) | [polish-HuBERT-quantizer_8_epoch.pth](https://huggingface.co/Hobis/bark-voice-cloning-polish-HuBERT-quantizer/blob/main/polish-HuBERT-quantizer_8_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 8 | POL | [Hobis/bark-polish-semantic-wav-training](https://huggingface.co/datasets/Hobis/bark-polish-semantic-wav-training) |
+ | [C0untFloyd](https://github.com/C0untFloyd) | [german-HuBERT-quantizer_14_epoch.pth](https://huggingface.co/CountFloyd/bark-voice-cloning-german-HuBERT-quantizer/blob/main/german-HuBERT-quantizer_14_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 14 | GER | [CountFloyd/bark-german-semantic-wav-training](https://huggingface.co/datasets/CountFloyd/bark-german-semantic-wav-training) |
+
+ ## For developers: Implementing voice cloning in your bark projects
+ * Simply copy the files from [this directory](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/tree/master/bark_hubert_quantizer) into your project.
+ * The [hubert manager](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/hubert_manager.py) contains methods to download HuBERT and the custom Quantizer model.
+ * Loading the [CustomHuBERT](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/pre_kmeans_hubert.py) should be pretty straightforward.
+ * The [notebook](notebook.ipynb) contains code that runs on CUDA or CPU, instead of just CPU.
+ ```python
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+ import torchaudio
+
+ # Load the HuBERT model;
+ # checkpoint_path should work fine with data/models/hubert/hubert.pt for the default config
+ hubert_model = CustomHubert(checkpoint_path='path/to/checkpoint')
+
+ # Run the model to extract semantic features from an audio file, where wav is your audio file
+ wav, sr = torchaudio.load('path/to/wav')  # This is where you load your wav, with soundfile or torchaudio for example
+
+ if wav.shape[0] == 2:  # Stereo to mono if needed
+     wav = wav.mean(0, keepdim=True)
+
+ semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
+ ```
+ * Loading and running the [custom tokenizer (the kmeans replacement)](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer)
+
+ ```python
+ import torch
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+ # Load the CustomTokenizer model from a checkpoint
+ # With the default config, you can use the pretrained model from huggingface
+ # With the default setup from HuBERTManager, this will be in data/models/hubert/tokenizer.pth
+ tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')  # Automatically uses the right layers
+
+ # Process the semantic vectors from the previous HuBERT run (this works in batches, so you can send the entire HuBERT output)
+ semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+ # Congratulations! You now have semantic tokens which can be used inside of a speaker prompt file.
+ ```
+
+ ## How do I train it myself?
+ Simply run the training commands below.
+
+ A simple way to create semantic data and wavs for training is with my script: [bark-data-gen](https://github.com/gitmylo/bark-data-gen). Keep in mind that generating the wavs takes around the same time as, if not longer than, generating the semantics, so this step can take a while.
+
+ For example, suppose you have a dataset of zips containing audio files, one zip for the semantics and one for the wav files, inside a folder called "Literature".
+
+ Run `python process.py --path Literature --mode prepare` to extract all the data into one directory.
+
+ Run `python process.py --path Literature --mode prepare2` to create the HuBERT semantic vectors, ready for training.
+
+ Run `python process.py --path Literature --mode train` to train.
+
+ When your model has trained enough, run `python process.py --path Literature --mode test` to test the latest model.
+
+ ## Disclaimer
+ I am not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.
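For readers following the training steps above, a small sanity-check sketch for the `ready/` folder before running `--mode train` (the "Literature" folder name is illustrative):

```python
# Sketch: check that every feature file in ready/ has a matching label file.
import os

ready = os.path.join('Literature', 'ready')
features = {f.split('_')[0] for f in os.listdir(ready) if f.endswith('_semantic_features.npy')}
labels = {f.split('_')[0] for f in os.listdir(ready) if f.endswith('_semantic.npy')}
print('unmatched keys:', features ^ labels)  # an empty set means the training data lines up
```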
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ audiolm-pytorch==1.1.4
+ fairseq
+ huggingface-hub
+ sentencepiece
+ transformers
+ encodec
+ soundfile; platform_system == "Windows"
+ sox; platform_system != "Windows"
sample-speaker.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7c59faa843a892cb35733b5bdad5a6bd3eebadf70494d48694a06c2fefbad6
+ size 1324090
setup.py ADDED
@@ -0,0 +1,17 @@
+ from setuptools import setup
+
+ setup(
+     name='bark_hubert_quantizer',
+     version='0.0.4',
+     packages=['bark_hubert_quantizer'],
+     install_requires=[
+         'audiolm-pytorch==1.1.4',
+         'fairseq',
+         'huggingface-hub',
+         'sentencepiece',
+         'transformers',
+         'encodec',
+         'soundfile; platform_system == "Windows"',
+         'sox; platform_system != "Windows"'
+     ],
+ )
test_hubert.py ADDED
@@ -0,0 +1,23 @@
+ import os
+
+ import numpy
+ import torch
+ import torchaudio
+
+ from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+ from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+
+ def test_hubert(path: str, model: str = 'model/hubert/hubert_base_ls960.pt', tokenizer: str = 'model.pth'):
+     hubert_model = CustomHubert(checkpoint_path=model)
+     customtokenizer = CustomTokenizer.load_from_checkpoint(os.path.join(path, tokenizer))
+
+     wav, sr = torchaudio.load(os.path.join(path, 'test', 'wav.wav'))
+     original = numpy.load(os.path.join(path, 'test', 'semantic.npy'))
+
+     out = hubert_model.forward(wav, input_sample_hz=sr)
+     out_tokenized = customtokenizer.get_token(out)
+
+     # Compare the ground-truth tokens against the tokenizer's output
+     print(original[:-1], out_tokenized)
+     numpy.save(os.path.join(path, 'test', 'gen_semantic.npy'), out_tokenized)