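# Hosting-page residue kept for provenance (not part of the program):
# "Nymbo's picture" / commit message "Update app.py" / commit 9ed7f7e (verified)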
import math
import os.path
import uuid
import gradio
import numpy
import torch
from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
from encodec import EncodecModel
from encodec.utils import convert_audio
# All three models are created once, at import time, and shared by every
# call to clone().

# HuBERT feature extractor; the manager returns the checkpoint location
# (presumably fetching it first if it is missing -- confirm in HuBERTManager).
_hubert_checkpoint = HuBERTManager.make_sure_hubert_installed()
hubert_model = CustomHubert(_hubert_checkpoint)

# Tokenizer that turns HuBERT features into semantic tokens; its weights are
# mapped onto the CPU when the checkpoint is loaded.
_tokenizer_checkpoint = HuBERTManager.make_sure_tokenizer_installed(
    model='quantifier_V1_hubert_base_ls960_23.pth',
)
tokenizer_model = CustomTokenizer.load_from_checkpoint(
    _tokenizer_checkpoint,
    map_location=torch.device('cpu'),
)

# 24 kHz EnCodec model used to produce the coarse/fine codebook prompts.
encodec_model = EncodecModel.encodec_model_24khz()
def _prepare_waveform(audio, max_seconds=20):
    """Turn a ``(sample_rate, samples)`` pair into a mono float32 clip.

    Args:
        audio: Tuple of the sample rate and the raw samples. The samples are
            expected in Gradio's numpy layout, i.e. ``(N,)`` or
            ``(N, channels)``; a channels-first ``(channels, N)`` array is
            accepted as well.
        max_seconds: Only this many seconds from the END of the clip are kept.

    Returns:
        ``(sample_rate, wav)`` where ``wav`` is a float32 tensor of shape
        ``(1, N)``.
    """
    sr, samples = audio
    wav = torch.tensor(samples)

    if not wav.is_floating_point():
        # Integer PCM: scale by the dtype's positive full-scale value (32767
        # for int16, as before) so any integer width ends up in [-1, 1].
        # NOTE(review): assumes signed PCM; unsigned 8-bit audio would also
        # need re-centering.
        wav = wav.float() / torch.iinfo(wav.dtype).max
    # The models run in float32; this is a no-op for float32 input.
    wav = wav.float()

    if wav.dim() == 2:
        # Down-mix to mono. The channel axis is taken to be the shorter one
        # (a clip has far more samples than channels); ties fall back to
        # Gradio's (samples, channels) layout.
        channel_axis = 0 if wav.shape[0] < wav.shape[1] else 1
        wav = wav.mean(channel_axis)
    wav = wav.reshape(-1)

    # Trim on the time axis AFTER flattening, so the limit applies whatever
    # layout the samples arrived in.
    wav = wav[-int(sr * max_seconds):]

    # (N,) -> (1, N), the shape the HuBERT model is fed below.
    return sr, wav.reshape(1, -1)


def clone(audio, *args):
    """Create a Bark speaker prompt (.npz) from an audio clip.

    The last 20 seconds of the clip are converted to mono, passed through the
    HuBERT model and tokenizer to obtain semantic tokens, and encoded with
    EnCodec at 6 kbps to obtain the codebook tokens.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by the Gradio
            ``'audio'`` input, or ``None`` when nothing was provided.
        *args: Ignored; absorbs the values of any extra input components.

    Returns:
        Path of the written ``.npz`` file containing ``semantic_prompt``,
        ``fine_prompt`` and ``coarse_prompt`` (the first two codebooks of
        ``fine_prompt``).

    Raises:
        gradio.Error: If no audio was provided.
    """
    if audio is None:
        # Surface a readable message instead of a tuple-unpacking TypeError.
        raise gradio.Error('Please upload or record an audio clip first.')

    sr, wav = _prepare_waveform(audio)

    semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
    semantic_tokens = tokenizer_model.get_token(semantic_vectors)

    encodec_model.set_target_bandwidth(6.0)
    # Resample to the codec's rate (mono), then add the batch axis.
    codec_input = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
    with torch.no_grad():
        encoded_frames = encodec_model.encode(codec_input)
    # Frames are [B, n_q, T]; join them along time and drop ONLY the batch
    # axis, so a single-frame result stays 2-D for the slice below.
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs('data/speakers', exist_ok=True)
    file_path = f'data/speakers/{uuid.uuid4().hex}.npz'
    numpy.savez(
        file_path,
        semantic_prompt=semantic_tokens,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :],
    )
    return file_path
# Help text displayed in the UI alongside the audio input.
_USAGE_NOTES = '''
# Bark text to speech voice cloning
[Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)
For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)
Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)
## Tips for better cloning
### Make sure these things are **NOT** in your voice input: (in no particular order)
* Noise (You can use a noise remover before)
* Music (There are also music remover tools) (Unless you want music in the background)
* A cut-off at the end (This will cause it to try and continue on the generation)
* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
### What makes for good prompt audio? (in no particular order)
* Clearly spoken
* No weird background noises
* Only one speaker
* Audio which ends after a sentence ends
* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
* Around 10 seconds of data
'''

# The Markdown component is registered as a second "input" purely so the notes
# show up in the form; clone() swallows whatever value it contributes through
# its *args parameter.
_notes_component = gradio.Markdown(_USAGE_NOTES)

# One audio input in, one downloadable file (the speaker .npz) out.
iface = gradio.Interface(
    fn=clone,
    inputs=['audio', _notes_component],
    outputs='file',
    theme="Nymbo/Nymbo_Theme",
)
iface.launch()