"""Gradio app that converts a short voice recording into a Bark speaker prompt.

The uploaded clip is turned into HuBERT-derived semantic tokens and EnCodec
codebook indices, which are written to an ``.npz`` file offered for download.
"""
import math
import os.path
import uuid

import gradio
import numpy
import torch

from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
from encodec import EncodecModel
from encodec.utils import convert_audio

# Only the tail of the recording is used; keep in sync with the UI text below.
MAX_CLIP_SECONDS = 20
SPEAKER_DIR = 'data/speakers'

# Models are loaded once at import time so every request reuses them.
# NOTE(review): the make_sure_*_installed helpers presumably fetch the
# checkpoints on first run and return their paths -- confirm in HuBERTManager.
hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed())
tokenizer_model = CustomTokenizer.load_from_checkpoint(
    HuBERTManager.make_sure_tokenizer_installed(model='quantifier_V1_hubert_base_ls960_23.pth'),
    map_location=torch.device('cpu')
)
encodec_model = EncodecModel.encodec_model_24khz()


def _prepare_waveform(sr, wav):
    """Return the last MAX_CLIP_SECONDS of *wav* as a mono float32 tensor shaped (1, N)."""
    wav = numpy.asarray(wav)

    # Gradio hands over integer PCM (normally int16); the models work on
    # floats in roughly [-1, 1], so scale by the integer type's full range.
    if numpy.issubdtype(wav.dtype, numpy.integer):
        full_scale = float(numpy.iinfo(wav.dtype).max)
        wav = wav.astype(numpy.float32) / full_scale
    else:
        wav = wav.astype(numpy.float32)

    # Gradio's layout is (samples,) for mono and (samples, channels) otherwise,
    # so channels live on axis 1. Average them to get a mono signal.
    if wav.ndim == 2:
        wav = wav.mean(axis=1)

    wav = wav[-int(sr * MAX_CLIP_SECONDS):]  # Take only the last 20 seconds

    # (N,) -> (1, N): a single channel followed by the time axis.
    return torch.tensor(wav, dtype=torch.float32).reshape(1, -1)


def clone(audio, *args):
    """Create a Bark speaker prompt file from an uploaded recording.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by Gradio's audio
            input, where ``samples`` is a numpy array shaped ``(N,)`` or
            ``(N, channels)``.
        *args: Ignored. Gradio also passes the value of the Markdown
            component that is listed among the inputs.

    Returns:
        Path of the written ``.npz`` file containing ``semantic_prompt``,
        ``fine_prompt`` and ``coarse_prompt`` arrays.

    Raises:
        ValueError: If no audio was supplied or the recording is empty.
    """
    if audio is None:
        raise ValueError('No audio was provided.')

    sr, wav = audio
    wav = _prepare_waveform(sr, wav)
    if wav.shape[-1] == 0:
        raise ValueError('The provided audio is empty.')

    # Pure inference: no autograd graph is needed for any of the models.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer_model.get_token(semantic_vectors)

        encodec_model.set_target_bandwidth(6.0)
        # Resample to EnCodec's sample rate, mono, then add the batch axis.
        wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
        wav = wav.unsqueeze(0)
        encoded_frames = encodec_model.encode(wav)

    # [B, n_q, T] -> [n_q, T]; drop only the batch axis so a very short clip
    # (T == 1) keeps its time dimension.
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)

    os.makedirs(SPEAKER_DIR, exist_ok=True)
    file_path = f'{SPEAKER_DIR}/{uuid.uuid4().hex}.npz'

    numpy.savez(
        file_path,
        semantic_prompt=semantic_tokens,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :]  # The coarse prompt is the first two codebooks
    )

    return file_path


# Kept flush-left: indented markdown lines would render as a code block.
DESCRIPTION_MD = '''
# Bark text to speech voice cloning
[Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)

For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)

Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)

## Tips for better cloning
### Make sure these things are **NOT** in your voice input: (in no particular order)
* Noise (You can use a noise remover before)
* Music (There are also music remover tools) (Unless you want music in the background)
* A cut-off at the end (This will cause it to try and continue on the generation)
* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)

### What makes for good prompt audio? (in no particular order)
* Clearly spoken
* No weird background noises
* Only one speaker
* Audio which ends after a sentence ends
* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
* Around 10 seconds of data
'''

# The Markdown component is listed as an input purely to show the help text
# next to the upload widget; clone() ignores its value.
iface = gradio.interface.Interface(
    fn=clone,
    inputs=[
        'audio',
        gradio.Markdown(DESCRIPTION_MD),
    ],
    outputs='file',
)
iface.launch()