Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,8 @@
|
|
1 |
-
import
|
2 |
-
import os
|
3 |
-
import
|
4 |
-
|
5 |
-
import gradio
|
6 |
-
import numpy
|
7 |
import torch
|
|
|
8 |
|
9 |
from hubert.hubert_manager import HuBERTManager
|
10 |
from hubert.pre_kmeans_hubert import CustomHubert
|
@@ -12,82 +10,31 @@ from hubert.customtokenizer import CustomTokenizer
|
|
12 |
from encodec import EncodecModel
|
13 |
from encodec.utils import convert_audio
|
14 |
|
15 |
-
|
16 |
-
hubert_model = CustomHubert(
|
17 |
-
|
18 |
-
|
19 |
-
map_location=torch.device('cpu')
|
20 |
-
)
|
21 |
-
encodec_model = EncodecModel.encodec_model_24khz()
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
def clone(audio, *args):
|
26 |
-
sr, wav = audio
|
27 |
-
if wav.shape[0] == 2: # Stereo to mono if needed
|
28 |
wav = wav.mean(0, keepdim=True)
|
29 |
-
|
30 |
-
wav = wav[-int(sr*20):] # Take only the last 20 seconds
|
31 |
-
|
32 |
-
duration = wav.shape[0]
|
33 |
-
|
34 |
-
wav = wav.reshape(1, -1) # Reshape from gradio style to HuBERT shape. (N, 1) to (1, N)
|
35 |
-
|
36 |
-
wav = torch.tensor(wav, dtype=torch.float32)
|
37 |
-
|
38 |
semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
43 |
wav = wav.unsqueeze(0)
|
44 |
-
|
45 |
with torch.no_grad():
|
46 |
-
encoded_frames =
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
os.makedirs('data/speakers')
|
52 |
-
|
53 |
-
file_path = f'data/speakers/{uuid.uuid4().hex}.npz'
|
54 |
-
|
55 |
-
numpy.savez(
|
56 |
-
file_path,
|
57 |
-
semantic_prompt=semantic_tokens,
|
58 |
-
fine_prompt=codes,
|
59 |
-
coarse_prompt=codes[:2, :]
|
60 |
-
)
|
61 |
-
|
62 |
-
return file_path
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
iface = gradio.interface.Interface(fn=clone, inputs=[
|
67 |
-
'audio',
|
68 |
-
gradio.Markdown(
|
69 |
-
'''
|
70 |
-
# Bark text to speech voice cloning
|
71 |
-
[Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)
|
72 |
|
73 |
-
|
|
|
|
|
74 |
|
75 |
-
|
|
|
76 |
|
77 |
-
|
78 |
-
### Make sure these things are **NOT** in your voice input: (in no particular order)
|
79 |
-
* Noise (You can use a noise remover before)
|
80 |
-
* Music (There are also music remover tools) (Unless you want music in the background)
|
81 |
-
* A cut-off at the end (This will cause it to try and continue on the generation)
|
82 |
-
* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
|
83 |
|
84 |
-
### What makes for good prompt audio? (in no particular order)
|
85 |
-
* Clearly spoken
|
86 |
-
* No weird background noises
|
87 |
-
* Only one speaker
|
88 |
-
* Audio which ends after a sentence ends
|
89 |
-
* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
|
90 |
-
* Around 10 seconds of data
|
91 |
-
''')
|
92 |
-
], outputs='file')
|
93 |
-
iface.launch()
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import torchaudio
|
|
|
|
|
|
|
4 |
import torch
|
5 |
+
import numpy as np
|
6 |
|
7 |
from hubert.hubert_manager import HuBERTManager
|
8 |
from hubert.pre_kmeans_hubert import CustomHubert
|
|
|
10 |
from encodec import EncodecModel
|
11 |
from encodec.utils import convert_audio
|
12 |
|
13 |
+
def process_audio(audio_file, output_path='helloWorld.npz'):
    """Build a speaker-prompt ``.npz`` archive from an audio file.

    The audio is loaded with ``torchaudio.load``, mixed down to mono,
    passed through the HuBERT model and the custom tokenizer to obtain
    semantic tokens, and encoded with the 24 kHz EnCodec model (target
    bandwidth 6.0) to obtain codebook indices. The results are stored
    under the keys ``semantic_prompt``, ``fine_prompt`` and
    ``coarse_prompt`` (the first two rows of ``fine_prompt``).

    Args:
        audio_file: Anything ``torchaudio.load`` accepts (typically a path).
        output_path: Where to write the archive. Defaults to the file name
            the original implementation always used. ``numpy.savez``
            appends ``.npz`` when the name does not already end with it.

    Returns:
        ``output_path`` as given, so callers can locate the written file.
    """
    hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')

    wav, sr = torchaudio.load(audio_file)
    # Mix down any multi-channel input, not only exactly-stereo files.
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')
    # Pure inference: no autograd graph is needed for the saved tokens.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
        semantic_tokens = tokenizer.get_token(semantic_vectors)

    model = EncodecModel.encodec_model_24khz()
    model.set_target_bandwidth(6.0)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0)  # add the batch dimension expected by encode()
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

    # Convert explicitly instead of relying on numpy's implicit tensor
    # coercion, which fails for non-CPU or grad-tracking tensors.
    fine_prompt = _to_numpy(codes)
    np.savez(
        output_path,
        semantic_prompt=_to_numpy(semantic_tokens),
        fine_prompt=fine_prompt,
        coarse_prompt=fine_prompt[:2, :],
    )
    return output_path


def _to_numpy(value):
    """Return *value* as a NumPy array, detaching torch tensors first."""
    if isinstance(value, torch.Tensor):
        return value.detach().cpu().numpy()
    return np.asarray(value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
def audio_file_processing(input_audio):
    """Gradio callback: turn the uploaded audio into a speaker-prompt file.

    The previous version ignored ``input_audio`` and always processed the
    literal file ``'audio.wav'``; this version processes what was uploaded.

    Args:
        input_audio: Value delivered by the Gradio audio component. Either a
            filesystem path, a ``(sample_rate, samples)`` tuple, or ``None``
            when nothing was uploaded.

    Returns:
        A status message (Polish, matching the UI labels) for the status
        textbox.
    """
    if input_audio is None:
        return "Nie wybrano pliku audio"

    if isinstance(input_audio, (str, os.PathLike)):
        process_audio(input_audio)
    else:
        import tempfile

        sample_rate, samples = input_audio
        waveform = torch.as_tensor(samples)
        # Assumes Gradio's layout of (frames,) or (frames, channels) - TODO
        # confirm; torchaudio.save expects (channels, frames).
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
        else:
            waveform = waveform.transpose(0, 1).contiguous()
        # process_audio loads from disk, so round-trip through a temp WAV.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, 'input.wav')
            torchaudio.save(wav_path, waveform, sample_rate)
            process_audio(wav_path)

    return "Plik audio został przetworzony i zapisany jako helloWorld.npz"
|
35 |
|
36 |
+
# Build and start the web UI.
# The gr.inputs / gr.outputs namespaces are deprecated (and removed in
# Gradio 4); the top-level component classes are the supported spelling.
# The audio is requested as a file path because the processing pipeline
# loads it from disk with torchaudio.load.
audio_input = gr.Audio(label="Wybierz plik audio", type="filepath")
audio_output = gr.Textbox(label="Status")

gr.Interface(fn=audio_file_processing, inputs=audio_input, outputs=audio_output).launch()
|
|
|
|
|
|
|
|
|
|
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|