fffiloni committed on
Commit
34bc786
1 Parent(s): dd24337

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from bark.generation import codec_encode, load_codec_model, generate_text_semantic
3
+ from encodec.utils import convert_audio
4
+ import torchaudio
5
+ import torch
6
+
7
# Load the codec model once at import time; clone_voice() reuses this single
# instance for every request instead of reloading it per call.
model = load_codec_model(use_gpu=True)


def clone_voice(audio_in, name, transcript_text):
    """Build a Bark voice-prompt archive (.npz) from a reference recording.

    Args:
        audio_in: Path of the audio file to clone. Short clips are enough;
            the original author notes existing samples are around 7 seconds.
        name: Name for the voice. The archive is written to "<name>.npz" in
            the current working directory.
        transcript_text: Transcription of what is said in ``audio_in``.

    Returns:
        Path of the written ``.npz`` file, which holds the arrays
        ``fine_prompt``, ``coarse_prompt`` and ``semantic_prompt``.
    """
    # Imported locally: the module's import block does not provide these
    # names, and np.savez below previously raised NameError.
    import os

    import numpy as np

    # Load and pre-process the audio waveform to the codec's rate/channels.
    wav, sr = torchaudio.load(audio_in)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)

    # Follow whatever device the codec weights actually live on instead of
    # hard-coding 'cuda', so the call also works when no GPU is in use.
    # NOTE(review): assumes `model` is a torch.nn.Module with parameters.
    device = next(model.parameters()).device
    wav = wav.unsqueeze(0).to(device)

    # Extract discrete codes from EnCodec.
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    # Drop only the batch axis added by unsqueeze(0) above; a bare squeeze()
    # would also collapse any other size-1 axis and break the 2-D slicing
    # used for coarse_prompt below.
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]

    # Cap semantic generation at the duration of the reference audio.
    seconds = wav.shape[-1] / model.sample_rate
    semantic_tokens = generate_text_semantic(transcript_text, max_gen_duration_s=seconds)

    # np.savez needs host-side arrays.
    codes = codes.cpu().numpy()

    # The name comes from a user-facing text box: keep only its final path
    # component so it cannot point outside the working directory, and fall
    # back to a fixed name when it is empty.
    voice_name = os.path.basename((name or '').strip()) or 'voice'
    output_path = voice_name + '.npz'
    np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

    return output_path
38
+
39
# Page-level CSS: cap the width of the main column and center it.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML header shown at the top of the demo.
title = """
<div style="text-align: center;">
<h1>Voice Cloning for Bark Text-to-Audio</h1>
<p>This demo is an adaptation of the Serp-AI attempts to enable voice cloning using Bark</p>
</div>
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        audio_in = gr.Audio(label="Voice in to clone", source="upload", type="filepath")
        transcript = gr.Textbox(label="Manual transcription of your audio")
        name = gr.Textbox(label="Name your voice")

        generate_btn = gr.Button("Clone voice !")

        npz_file = gr.File()

    # Inputs are passed positionally, so they must follow the signature
    # clone_voice(audio_in, name, transcript_text). The previous order swapped
    # the voice name and the transcript, and the output referenced an
    # undefined `npe_file` instead of `npz_file`.
    generate_btn.click(clone_voice, inputs=[audio_in, name, transcript], outputs=[npz_file])

demo.launch()