bark-voice-cloning-polish-HuBERT-quantizer

Running

App Files Files Community

Hobis committed on May 28, 2023

Commit

2eaaa08

•

1 Parent(s): 8f086d2

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -77

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
-import math
-import os.path
-import uuid
-import gradio
-import numpy
 import torch
 from hubert.hubert_manager import HuBERTManager
 from hubert.pre_kmeans_hubert import CustomHubert
@@ -12,82 +10,31 @@ from hubert.customtokenizer import CustomTokenizer
 from encodec import EncodecModel
 from encodec.utils import convert_audio
-hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed())
-tokenizer_model = CustomTokenizer.load_from_checkpoint(
-    HuBERTManager.make_sure_tokenizer_installed(model='polish-HuBERT-quantizer_8_epoch.pth'),
-    map_location=torch.device('cpu')
-)
-encodec_model = EncodecModel.encodec_model_24khz()
-def clone(audio, *args):
-    sr, wav = audio
-    if wav.shape[0] == 2:  # Stereo to mono if needed
         wav = wav.mean(0, keepdim=True)
-    wav = wav[-int(sr*20):]  # Take only the last 20 seconds
-    duration = wav.shape[0]
-    wav = wav.reshape(1, -1)  # Reshape from gradio style to HuBERT shape. (N, 1) to (1, N)
-    wav = torch.tensor(wav, dtype=torch.float32)
     semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
-    semantic_tokens = tokenizer_model.get_token(semantic_vectors)
-    encodec_model.set_target_bandwidth(6.0)
-    wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
     wav = wav.unsqueeze(0)
     with torch.no_grad():
-        encoded_frames = encodec_model.encode(wav)
-    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [B, n_q, T]
-    if not os.path.isdir('data/speakers'):
-        os.makedirs('data/speakers')
-    file_path = f'data/speakers/{uuid.uuid4().hex}.npz'
-    numpy.savez(
-        file_path,
-        semantic_prompt=semantic_tokens,
-        fine_prompt=codes,
-        coarse_prompt=codes[:2, :]
-    )
-    return file_path
-iface = gradio.interface.Interface(fn=clone, inputs=[
-    'audio',
-    gradio.Markdown(
-        '''
-        # Bark text to speech voice cloning
-        [Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)
-        For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)
-        Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)
-        ## Tips for better cloning
-        ### Make sure these things are **NOT** in your voice input: (in no particular order)
-        * Noise (You can use a noise remover before)
-        * Music (There are also music remover tools) (Unless you want music in the background)
-        * A cut-off at the end (This will cause it to try and continue on the generation)
-        * Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
-        ### What makes for good prompt audio? (in no particular order)
-        * Clearly spoken
-        * No weird background noises
-        * Only one speaker
-        * Audio which ends after a sentence ends
-        * Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
-        * Around 10 seconds of data
-        ''')
-], outputs='file')
-iface.launch()

+import gradio as gr
+import os
+import torchaudio
 import torch
+import numpy as np
 from hubert.hubert_manager import HuBERTManager
 from hubert.pre_kmeans_hubert import CustomHubert
 from encodec import EncodecModel
 from encodec.utils import convert_audio
# Models are expensive to build from their checkpoints, so they are created
# on first use and then reused by every later call.
_MODEL_CACHE = {}


def _load_models():
    """Return the (HuBERT, tokenizer, EnCodec) models, building them once."""
    if not _MODEL_CACHE:
        hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')
        # Map the tokenizer checkpoint onto the CPU so loading does not
        # depend on the device the checkpoint was saved from.
        tokenizer = CustomTokenizer.load_from_checkpoint(
            'data/models/hubert/tokenizer.pth',
            map_location=torch.device('cpu'),
        )
        codec = EncodecModel.encodec_model_24khz()
        codec.set_target_bandwidth(6.0)
        # Populate the cache only after everything loaded successfully, so a
        # failed load is retried on the next call instead of being cached.
        _MODEL_CACHE.update(hubert=hubert_model, tokenizer=tokenizer, codec=codec)
    return _MODEL_CACHE['hubert'], _MODEL_CACHE['tokenizer'], _MODEL_CACHE['codec']


def process_audio(audio_file, output_path='helloWorld.npz'):
    """Turn an audio file into a speaker prompt saved as an ``.npz`` archive.

    The audio is loaded with torchaudio, passed through HuBERT and the
    tokenizer to obtain semantic tokens, and encoded with EnCodec to obtain
    the codebook indices. The archive holds three arrays:
    ``semantic_prompt``, ``fine_prompt`` (all EnCodec codebooks) and
    ``coarse_prompt`` (the first two codebooks).

    Args:
        audio_file: Path (or file-like object) accepted by ``torchaudio.load``.
        output_path: Where the archive is written. Defaults to
            ``'helloWorld.npz'``, the location the function always used.

    Returns:
        The ``output_path`` that was written.
    """
    hubert_model, tokenizer, model = _load_models()

    wav, sr = torchaudio.load(audio_file)  # wav is (channels, samples)
    if wav.shape[0] > 1:
        # Downmix any multi-channel recording to mono, not only stereo.
        wav = wav.mean(0, keepdim=True)

    # Inference only: no autograd graph is needed, and gradient-free tensors
    # can be handed to numpy when saving.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer.get_token(semantic_vectors)

        wav = convert_audio(wav, sr, model.sample_rate, model.channels)
        encoded_frames = model.encode(wav.unsqueeze(0))

    # Frames are concatenated along time into [B, n_q, T]. Drop only the
    # batch axis; a bare squeeze() would also collapse a length-1 codebook
    # or time axis and break the 2-D slicing below.
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)

    fine_prompt = codes
    coarse_prompt = fine_prompt[:2, :]
    np.savez(
        output_path,
        semantic_prompt=semantic_tokens,
        fine_prompt=fine_prompt,
        coarse_prompt=coarse_prompt,
    )
    return output_path
def audio_file_processing(input_audio):
    """Gradio callback: process the submitted audio and report the status.

    Args:
        input_audio: Path of the audio file supplied through the interface
            (the Audio input below is configured with ``type="filepath"``).
            Empty or ``None`` when nothing was submitted.

    Returns:
        A status message for the "Status" textbox.
    """
    if not input_audio:
        return "Nie wybrano pliku audio"
    # Process the file the user actually submitted rather than a fixed
    # 'audio.wav' that may not exist next to the app.
    process_audio(input_audio)
    return "Plik audio został przetworzony i zapisany jako helloWorld.npz"


# type="filepath" makes Gradio pass the callback a path on disk, which is
# what process_audio expects to load.
audio_input = gr.inputs.Audio(label="Wybierz plik audio", type="filepath")
audio_output = gr.outputs.Textbox(label="Status")
gr.Interface(fn=audio_file_processing, inputs=audio_input, outputs=audio_output).launch()