Hobis committed on
Commit
2eaaa08
1 Parent(s): 8f086d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -77
app.py CHANGED
@@ -1,10 +1,8 @@
1
- import math
2
- import os.path
3
- import uuid
4
-
5
- import gradio
6
- import numpy
7
  import torch
 
8
 
9
  from hubert.hubert_manager import HuBERTManager
10
  from hubert.pre_kmeans_hubert import CustomHubert
@@ -12,82 +10,31 @@ from hubert.customtokenizer import CustomTokenizer
12
  from encodec import EncodecModel
13
  from encodec.utils import convert_audio
14
 
15
-
16
- hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed())
17
- tokenizer_model = CustomTokenizer.load_from_checkpoint(
18
- HuBERTManager.make_sure_tokenizer_installed(model='polish-HuBERT-quantizer_8_epoch.pth'),
19
- map_location=torch.device('cpu')
20
- )
21
- encodec_model = EncodecModel.encodec_model_24khz()
22
-
23
-
24
-
25
- def clone(audio, *args):
26
- sr, wav = audio
27
- if wav.shape[0] == 2: # Stereo to mono if needed
28
  wav = wav.mean(0, keepdim=True)
29
-
30
- wav = wav[-int(sr*20):] # Take only the last 20 seconds
31
-
32
- duration = wav.shape[0]
33
-
34
- wav = wav.reshape(1, -1) # Reshape from gradio style to HuBERT shape. (N, 1) to (1, N)
35
-
36
- wav = torch.tensor(wav, dtype=torch.float32)
37
-
38
  semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
39
- semantic_tokens = tokenizer_model.get_token(semantic_vectors)
40
-
41
- encodec_model.set_target_bandwidth(6.0)
42
- wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
 
43
  wav = wav.unsqueeze(0)
44
-
45
  with torch.no_grad():
46
- encoded_frames = encodec_model.encode(wav)
47
-
48
- codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [B, n_q, T]
49
-
50
- if not os.path.isdir('data/speakers'):
51
- os.makedirs('data/speakers')
52
-
53
- file_path = f'data/speakers/{uuid.uuid4().hex}.npz'
54
-
55
- numpy.savez(
56
- file_path,
57
- semantic_prompt=semantic_tokens,
58
- fine_prompt=codes,
59
- coarse_prompt=codes[:2, :]
60
- )
61
-
62
- return file_path
63
-
64
-
65
-
66
- iface = gradio.interface.Interface(fn=clone, inputs=[
67
- 'audio',
68
- gradio.Markdown(
69
- '''
70
- # Bark text to speech voice cloning
71
- [Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)
72
 
73
- For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)
 
 
74
 
75
- Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)
 
76
 
77
- ## Tips for better cloning
78
- ### Make sure these things are **NOT** in your voice input: (in no particular order)
79
- * Noise (You can use a noise remover before)
80
- * Music (There are also music remover tools) (Unless you want music in the background)
81
- * A cut-off at the end (This will cause it to try and continue on the generation)
82
- * Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
83
 
84
- ### What makes for good prompt audio? (in no particular order)
85
- * Clearly spoken
86
- * No weird background noises
87
- * Only one speaker
88
- * Audio which ends after a sentence ends
89
- * Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
90
- * Around 10 seconds of data
91
- ''')
92
- ], outputs='file')
93
- iface.launch()
 
1
+ import gradio as gr
2
+ import os
3
+ import torchaudio
 
 
 
4
  import torch
5
+ import numpy as np
6
 
7
  from hubert.hubert_manager import HuBERTManager
8
  from hubert.pre_kmeans_hubert import CustomHubert
 
10
  from encodec import EncodecModel
11
  from encodec.utils import convert_audio
12
 
13
+ def process_audio(audio_file):
14
+ hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')
15
+ wav, sr = torchaudio.load(audio_file)
16
+ if wav.shape[0] == 2:
 
 
 
 
 
 
 
 
 
17
  wav = wav.mean(0, keepdim=True)
 
 
 
 
 
 
 
 
 
18
  semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
19
+ tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')
20
+ semantic_tokens = tokenizer.get_token(semantic_vectors)
21
+ model = EncodecModel.encodec_model_24khz()
22
+ model.set_target_bandwidth(6.0)
23
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
24
  wav = wav.unsqueeze(0)
 
25
  with torch.no_grad():
26
+ encoded_frames = model.encode(wav)
27
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()
28
+ fine_prompt = codes
29
+ coarse_prompt = fine_prompt[:2, :]
30
+ np.savez('helloWorld.npz', semantic_prompt=semantic_tokens, fine_prompt=fine_prompt, coarse_prompt=coarse_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def audio_file_processing(input_audio):
33
+ process_audio('audio.wav')
34
+ return "Plik audio został przetworzony i zapisany jako helloWorld.npz"
35
 
36
+ audio_input = gr.inputs.Audio(label="Wybierz plik audio")
37
+ audio_output = gr.outputs.Textbox(label="Status")
38
 
39
+ gr.Interface(fn=audio_file_processing, inputs=audio_input, outputs=audio_output).launch()
 
 
 
 
 
40