kevinwang676 committed on
Commit 6af2279 (root commit)

Duplicate from kevinwang676/Bark-UI-with-Voice-Cloning

.gitattributes ADDED
@@ -0,0 +1,40 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ SE_checkpoint.pth.tar filter=lfs diff=lfs merge=lfs -text
+ best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
+ nana_longest_vocal.wav filter=lfs diff=lfs merge=lfs -text
+ test.wav filter=lfs diff=lfs merge=lfs -text
+ reference.wav filter=lfs diff=lfs merge=lfs -text
+ ref.wav filter=lfs diff=lfs merge=lfs -text
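
These patterns route large binary artifacts (model checkpoints, archives, audio samples) through Git LFS, so the repository stores lightweight pointer files instead of the blobs themselves. Note the explicit entries for SE_checkpoint.pth.tar and best_model.pth.tar: neither *.pth nor *.tar.* matches a .pth.tar suffix, so those files must be listed by name. As a rough illustration (not one of the committed files), the sketch below approximates this matching with Python's fnmatch; real gitattributes glob semantics differ in some corner cases.

import fnmatch

# Approximate LFS routing check (illustrative only; gitattributes matching
# is not identical to fnmatch, e.g. for directory-relative patterns).
lfs_patterns = ["*.npz", "*.pth", "*.tar.*", "*.wav", "SE_checkpoint.pth.tar"]

def routed_through_lfs(path):
    return any(fnmatch.fnmatch(path, pattern) for pattern in lfs_patterns)

print(routed_through_lfs("bark/assets/prompts/en_speaker_0.npz"))  # True
print(routed_through_lfs("SE_checkpoint.pth.tar"))                 # True (explicit entry)
print(routed_through_lfs("app.py"))                                # False
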
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Bark with Voice Cloning
+ emoji: 📊
+ colorFrom: purple
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 3.27.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: kevinwang676/Bark-UI-with-Voice-Cloning
+ ---
+ 
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SE_checkpoint.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
+ size 44610930
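
These three lines are the entire committed file: a Git LFS pointer recording the spec version, the SHA-256 of the real ~44.6 MB checkpoint, and its size in bytes. A minimal sketch (illustrative, not part of the commit; paths are hypothetical) of checking a downloaded blob against such a pointer:

import hashlib
import os

def parse_lfs_pointer(pointer_path):
    # Each pointer line is "key value"; e.g. "oid sha256:<hex>".
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify_blob(pointer_path, blob_path):
    fields = parse_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].removeprefix("sha256:")
    if os.path.getsize(blob_path) != int(fields["size"]):
        return False
    h = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid

# e.g. verify_blob("SE_checkpoint.pth.tar.pointer", "SE_checkpoint.pth.tar")
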
app.py ADDED
@@ -0,0 +1,549 @@
+ import os
+ import sys
+ 
+ os.system("git clone https://github.com/C0untFloyd/bark-gui.git")
+ sys.path.append("./bark-gui/")
+ 
+ import gradio as gr
+ import numpy as np
+ import logging
+ import torch
+ import pytorch_seed
+ import time
+ 
+ from xml.sax import saxutils
+ from bark.api import generate_with_settings
+ from bark.api import save_as_prompt
+ from settings import Settings
+ #import nltk
+ 
+ from bark import SAMPLE_RATE
+ from bark.clonevoice import clone_voice
+ from bark.generation import SAMPLE_RATE, preload_models
+ from scipy.io.wavfile import write as write_wav
+ from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
+ from datetime import datetime
+ from tqdm.auto import tqdm
+ from id3tagging import add_id3_tag
+ 
+ import shutil
+ 
+ import string
+ import argparse
+ import json
+ 
+ from TTS.tts.utils.synthesis import synthesis
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
+ from TTS.utils.audio import AudioProcessor
+ 
+ 
+ from TTS.tts.models import setup_model
+ from TTS.config import load_config
+ from TTS.tts.models.vits import *
+ 
+ from TTS.tts.utils.speakers import SpeakerManager
+ from pydub import AudioSegment
+ 
+ # from google.colab import files
+ import librosa
+ 
+ from scipy.io.wavfile import write, read
+ 
+ import subprocess
+ 
+ 
+ OUTPUTFOLDER = "Outputs"
+ 
+ 
+ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, progress=gr.Progress(track_tqdm=True)):
+     if text is None or len(text) < 1:
+         raise gr.Error('No text entered!')
+ 
+     # Chunk the text into smaller pieces, then combine the generated audio
+ 
+     # generation settings
+     if selected_speaker == 'None':
+         selected_speaker = None
+     if seed is not None and seed > 2**32 - 1:
+         logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
+         seed = None
+     if seed is None or seed <= 0:
+         seed = np.random.default_rng().integers(1, 2**32 - 1)
+     assert(0 < seed and seed < 2**32)
+ 
+     voice_name = selected_speaker
+     use_last_generation_as_history = "Use last generation as history" in complete_settings
+     save_last_generation = "Save generation as Voice" in complete_settings
+     progress(0, desc="Generating")
+ 
+     silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.float32)  # configurable silence between sentences
+     silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32)  # configurable silence between speakers
+     full_generation = None
+ 
+     all_parts = []
+     complete_text = ""
+     text = text.lstrip()
+     if is_ssml(text):
+         list_speak = create_clips_from_ssml(text)
+         prev_speaker = None
+         for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
+             selected_speaker = clip[0]
+             # Add pause break between speakers
+             if i > 0 and selected_speaker != prev_speaker:
+                 all_parts += [silencelong.copy()]
+             prev_speaker = selected_speaker
+             text = clip[1]
+             text = saxutils.unescape(text)
+             if selected_speaker == "None":
+                 selected_speaker = None
+ 
+             print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {seed}):`{text}`")
+             complete_text += text
+             with pytorch_seed.SavedRNG(seed):
+                 audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                 seed = torch.random.initial_seed()
+             if len(list_speak) > 1:
+                 filename = create_filename(OUTPUTFOLDER, seed, "audioclip", ".wav")
+                 save_wav(audio_array, filename)
+                 add_id3_tag(filename, text, selected_speaker, seed)
+ 
+             all_parts += [audio_array]
+     else:
+         texts = split_and_recombine_text(text, settings.input_text_desired_length, settings.input_text_max_length)
+         for i, text in tqdm(enumerate(texts), total=len(texts)):
+             print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {seed}):`{text}`")
+             complete_text += text
+             if quick_generation:
+                 with pytorch_seed.SavedRNG(seed):
+                     audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                     seed = torch.random.initial_seed()
+             else:
+                 full_output = use_last_generation_as_history or save_last_generation
+                 if full_output:
+                     full_generation, audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob, output_full=True)
+                 else:
+                     audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+ 
+             # Noticed this in the HF Demo - convert to 16-bit int -32767/32767, the most common audio format
+             # audio_array = (audio_array * 32767).astype(np.int16)
+ 
+             if len(texts) > 1:
+                 filename = create_filename(OUTPUTFOLDER, seed, "audioclip", ".wav")
+                 save_wav(audio_array, filename)
+                 add_id3_tag(filename, text, selected_speaker, seed)
+ 
+             if not quick_generation and (save_last_generation or use_last_generation_as_history):
+                 # save to npz
+                 voice_name = create_filename(OUTPUTFOLDER, seed, "audioclip", ".npz")
+                 save_as_prompt(voice_name, full_generation)
+                 if use_last_generation_as_history:
+                     selected_speaker = voice_name
+ 
+             all_parts += [audio_array]
+             # Add short pause between sentences
+             if text[-1] in "!?.\n" and i > 1:
+                 all_parts += [silenceshort.copy()]
+ 
+     # save & play audio
+     result = create_filename(OUTPUTFOLDER, seed, "final", ".wav")
+     save_wav(np.concatenate(all_parts), result)
+     # write id3 tag with text truncated to 60 chars, as a precaution...
+     add_id3_tag(result, complete_text, selected_speaker, seed)
+     return result
+ 
+ def create_filename(path, seed, name, extension):
+     now = datetime.now()
+     date_str = now.strftime("%m-%d-%Y")
+     outputs_folder = os.path.join(os.getcwd(), path)
+     if not os.path.exists(outputs_folder):
+         os.makedirs(outputs_folder)
+ 
+     sub_folder = os.path.join(outputs_folder, date_str)
+     if not os.path.exists(sub_folder):
+         os.makedirs(sub_folder)
+ 
+     time_str = now.strftime("%H-%M-%S")
+     file_name = f"{name}_{time_str}_s{seed}{extension}"
+     return os.path.join(sub_folder, file_name)
+ 
+ 
+ def save_wav(audio_array, filename):
+     write_wav(filename, SAMPLE_RATE, audio_array)
+ 
+ def save_voice(filename, semantic_prompt, coarse_prompt, fine_prompt):
+     np.savez_compressed(
+         filename,
+         semantic_prompt=semantic_prompt,
+         coarse_prompt=coarse_prompt,
+         fine_prompt=fine_prompt
+     )
+ 
+ 
+ def on_quick_gen_changed(checkbox):
+     if not checkbox:
+         return gr.CheckboxGroup.update(visible=True)
+     return gr.CheckboxGroup.update(visible=False)
+ 
+ def delete_output_files(checkbox_state):
+     if checkbox_state:
+         outputs_folder = os.path.join(os.getcwd(), OUTPUTFOLDER)
+         if os.path.exists(outputs_folder):
+             purgedir(outputs_folder)
+     return False  # uncheck the hidden checkbox again
+ 
+ 
+ # https://stackoverflow.com/a/54494779
+ def purgedir(parent):
+     for root, dirs, files in os.walk(parent):
+         for item in files:
+             # Delete subordinate files
+             filespec = os.path.join(root, item)
+             os.unlink(filespec)
+         for item in dirs:
+             # Recursively perform this operation for subordinate directories
+             purgedir(os.path.join(root, item))
+ 
+ def convert_text_to_ssml(text, selected_speaker):
+     return build_ssml(text, selected_speaker)
+ 
+ 
+ def apply_settings(themes, input_server_name, input_server_port, input_server_public, input_desired_len, input_max_len, input_silence_break, input_silence_speaker):
+     settings.selected_theme = themes
+     settings.server_name = input_server_name
+     settings.server_port = input_server_port
+     settings.server_share = input_server_public
+     settings.input_text_desired_length = input_desired_len
+     settings.input_text_max_length = input_max_len
+     settings.silence_sentence = input_silence_break
+     settings.silence_speakers = input_silence_speaker
+     settings.save()
+ 
+ def restart():
+     global restart_server
+     restart_server = True
+ 
+ 
+ def create_version_html():
+     python_version = ".".join([str(x) for x in sys.version_info[0:3]])
+     versions_html = f"""
+ python: <span title="{sys.version}">{python_version}</span>
+  • 
+ torch: {getattr(torch, '__long_version__', torch.__version__)}
+  • 
+ gradio: {gr.__version__}
+ """
+     return versions_html
+ 
+ 
+ 
+ logger = logging.getLogger(__name__)
+ APPTITLE = "Bark UI Enhanced v0.4.6"
+ 
+ 
+ autolaunch = False
+ 
+ if len(sys.argv) > 1:
+     autolaunch = "-autolaunch" in sys.argv
+ 
+ 
+ if not torch.cuda.is_available():
+     os.environ['BARK_FORCE_CPU'] = 'True'
+     logger.warning("No CUDA detected, falling back to CPU!")
+ 
+ print(f'smallmodels={os.environ.get("SUNO_USE_SMALL_MODELS", False)}')
+ print(f'enablemps={os.environ.get("SUNO_ENABLE_MPS", False)}')
+ print(f'offloadcpu={os.environ.get("SUNO_OFFLOAD_CPU", False)}')
+ print(f'forcecpu={os.environ.get("BARK_FORCE_CPU", False)}')
+ print(f'autolaunch={autolaunch}\n\n')
+ 
+ #print("Updating nltk\n")
+ #nltk.download('punkt')
+ 
+ print("Preloading Models\n")
+ preload_models()
+ 
+ settings = Settings('config.yaml')
+ 
+ # Collect all existing speakers/voices in dir
+ speakers_list = []
+ 
+ for root, dirs, files in os.walk("./bark/assets/prompts"):
+     for file in files:
+         if file.endswith(".npz"):
+             pathpart = root.replace("./bark/assets/prompts", "")
+             name = os.path.join(pathpart, file[:-4])
+             if name.startswith("/") or name.startswith("\\"):
+                 name = name[1:]
+             speakers_list.append(name)
+ 
+ speakers_list = sorted(speakers_list, key=lambda x: x.lower())
+ speakers_list.insert(0, 'None')
+ 
+ available_themes = ["Default", "gradio/glass", "gradio/monochrome", "gradio/seafoam", "gradio/soft", "gstaff/xkcd", "freddyaboulton/dracula_revamped", "ysharma/steampunk"]
+ 
+ seed = -1
+ server_name = settings.server_name
+ if len(server_name) < 1:
+     server_name = None
+ server_port = settings.server_port
+ if server_port <= 0:
+     server_port = None
+ global run_server
+ global restart_server
+ 
+ run_server = True
+ 
+ 
+ 
+ 
+ '''
+ from google.colab import drive
+ drive.mount('/content/drive')
+ src_path = os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), 'drive'), 'MyDrive'), 'Colab Notebooks'), 'best_model_latest.pth.tar')
+ dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')
+ shutil.copy(src_path, dst_path)
+ '''
+ 
+ TTS_PATH = "TTS/"
+ 
+ # add libraries into environment
+ sys.path.append(TTS_PATH)  # set this if TTS is not installed globally
+ 
+ # Paths definition
+ 
+ OUT_PATH = 'out/'
+ 
+ # create output path
+ os.makedirs(OUT_PATH, exist_ok=True)
+ 
+ # model vars
+ MODEL_PATH = 'best_model.pth.tar'
+ CONFIG_PATH = 'config.json'
+ TTS_LANGUAGES = "language_ids.json"
+ TTS_SPEAKERS = "speakers.json"
+ USE_CUDA = torch.cuda.is_available()
+ 
+ # load the config
+ C = load_config(CONFIG_PATH)
+ 
+ # load the audio processor
+ ap = AudioProcessor(**C.audio)
+ 
+ speaker_embedding = None
+ 
+ C.model_args['d_vector_file'] = TTS_SPEAKERS
+ C.model_args['use_speaker_encoder_as_loss'] = False
+ 
+ model = setup_model(C)
+ model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
+ # print(model.language_manager.num_languages, model.embedded_language_dim)
+ # print(model.emb_l)
+ cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
+ # remove speaker encoder
+ model_weights = cp['model'].copy()
+ for key in list(model_weights.keys()):
+     if "speaker_encoder" in key:
+         del model_weights[key]
+ 
+ model.load_state_dict(model_weights)
+ 
+ model.eval()
+ 
+ if USE_CUDA:
+     model = model.cuda()
+ 
+ # synthesize voice
+ use_griffin_lim = False
+ 
+ # Paths definition
+ 
+ CONFIG_SE_PATH = "config_se.json"
+ CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
+ 
+ # Load the Speaker encoder
+ 
+ SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
+ 
+ # Define helper function
+ 
+ def compute_spec(ref_file):
+     y, sr = librosa.load(ref_file, sr=ap.sample_rate)
+     spec = ap.spectrogram(y)
+     spec = torch.FloatTensor(spec).unsqueeze(0)
+     return spec
+ 
+ 
+ def voice_conversion(ta, ra, da):
+ 
+     target_audio = 'target.wav'
+     reference_audio = 'reference.wav'
+     driving_audio = 'driving.wav'
+ 
+     write(target_audio, ta[0], ta[1])
+     write(reference_audio, ra[0], ra[1])
+     write(driving_audio, da[0], da[1])
+ 
+     # !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
+     # !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
+     # !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f
+ 
+     files = [target_audio, reference_audio, driving_audio]
+ 
+     for file in files:
+         subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+ 
+     # ta_ = read(target_audio)
+ 
+     target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+     target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
+ 
+     driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+     driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
+ 
+     # Convert the voice
+ 
+     driving_spec = compute_spec(driving_audio)
+     y_lengths = torch.tensor([driving_spec.size(-1)])
+     if USE_CUDA:
+         ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+         ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+     else:
+         ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+         ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
+ 
+     # print("Reference Audio after decoder:")
+     # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))
+ 
+     return (ap.sample_rate, ref_wav_voc)
+ 
+ 
+ while run_server:
+     print(f'Launching {APPTITLE} Server')
+ 
+     # Create Gradio Blocks
+ 
+     with gr.Blocks(title=f"{APPTITLE}", mode=f"{APPTITLE}", theme=settings.selected_theme) as barkgui:
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown(f"### [{APPTITLE}](https://github.com/C0untFloyd/bark-gui)")
+             with gr.Column():
+                 gr.HTML(create_version_html(), elem_id="versions")
+ 
+         with gr.Tab("TTS"):
+             with gr.Row():
+                 with gr.Column():
+                     placeholder = "Enter text here."
+                     input_text = gr.Textbox(label="Input Text", lines=4, placeholder=placeholder)
+                 with gr.Column():
+                     seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
+                     convert_to_ssml_button = gr.Button("Convert Text to SSML")
+             with gr.Row():
+                 with gr.Column():
+                     examples = [
+                         "Special meanings: [laughter] [laughs] [sighs] [music] [gasps] [clears throat] MAN: WOMAN:",
+                         "♪ Never gonna make you cry, never gonna say goodbye, never gonna tell a lie and hurt you ♪",
+                         "And now — a picture of a larch [laughter]",
+                         """
+ WOMAN: I would like an oatmilk latte please.
+ MAN: Wow, that's expensive!
+ """,
+                         """<?xml version="1.0"?>
+ <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
+ http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
+ xml:lang="en-US">
+ <voice name="en_speaker_9">Look at that drunk guy!</voice>
+ <voice name="en_speaker_3">Who is he?</voice>
+ <voice name="en_speaker_9">WOMAN: [clears throat] 10 years ago, he proposed to me and I rejected him.</voice>
+ <voice name="en_speaker_3">Oh my God [laughs] he is still celebrating</voice>
+ </speak>"""
+                     ]
+                     examples = gr.Examples(examples=examples, inputs=input_text)
+ 
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
+                     speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
+                 with gr.Column():
+                     text_temp = gr.Slider(0.1, 1.0, value=0.6, label="Generation Temperature", info="1.0 more diverse, 0.1 more conservative")
+                     waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")
+ 
+             with gr.Row():
+                 with gr.Column():
+                     quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True)
+                     settings_checkboxes = ["Use last generation as history", "Save generation as Voice"]
+                     complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False)
+                 with gr.Column():
+                     eos_prob = gr.Slider(0.0, 0.5, value=0.05, label="End of sentence probability")
+ 
+             with gr.Row():
+                 with gr.Column():
+                     tts_create_button = gr.Button("Generate")
+                 with gr.Column():
+                     hidden_checkbox = gr.Checkbox(visible=False)
+                     button_stop_generation = gr.Button("Stop generation")
+             with gr.Row():
+                 output_audio = gr.Audio(label="Generated Audio")
+ 
+             with gr.Row():
+                 inp1 = gr.Audio(label='Target Speaker - Reference Clip')
+                 inp2 = output_audio
+                 inp3 = output_audio
+                 btn = gr.Button("Generate")
+                 out1 = gr.Audio(label='Target Speaker - Converted Clip')
+             btn.click(voice_conversion, [inp1, inp2, inp3], [out1])
+ 
+ 
+ 
+         with gr.Tab("Clone Voice"):
+             input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath")
+             transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...")
+             initialname = "./bark/assets/prompts/custom/MeMyselfAndI"
+             output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname)
+             clone_voice_button = gr.Button("Create Voice")
+             dummy = gr.Text(label="Progress")
+ 
+         with gr.Tab("Settings"):
+             with gr.Row():
+                 themes = gr.Dropdown(available_themes, label="Theme", info="Change needs complete restart", value=settings.selected_theme)
+             with gr.Row():
+                 input_server_name = gr.Textbox(label="Server Name", lines=1, info="Leave blank to run locally", value=settings.server_name)
+                 input_server_port = gr.Number(label="Server Port", precision=0, info="Leave at 0 to use default", value=settings.server_port)
+                 share_checkbox = gr.Checkbox(label="Public Server", value=settings.server_share)
+             with gr.Row():
+                 input_desired_len = gr.Slider(100, 150, value=settings.input_text_desired_length, label="Desired Input Text Length", info="Ideal length to split input sentences")
+                 input_max_len = gr.Slider(150, 256, value=settings.input_text_max_length, label="Max Input Text Length", info="Maximum Input Text Length")
+             with gr.Row():
+                 input_silence_break = gr.Slider(1, 1000, value=settings.silence_sentence, label="Sentence Pause Time (ms)", info="Silence between sentences in milliseconds")
+                 input_silence_speakers = gr.Slider(1, 5000, value=settings.silence_speakers, label="Speaker Pause Time (ms)", info="Silence between different speakers in milliseconds")
+ 
+             with gr.Row():
+                 button_apply_settings = gr.Button("Apply Settings")
+                 button_apply_restart = gr.Button("Restart Server")
+                 button_delete_files = gr.Button("Clear output folder")
+ 
+         quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
+         convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker], outputs=input_text)
+         gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent], outputs=output_audio)
+         button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
+         # Javascript hack to display a modal confirmation dialog
+         js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
+         button_delete_files.click(None, None, hidden_checkbox, _js=js)
+         hidden_checkbox.change(delete_output_files, [hidden_checkbox], [hidden_checkbox])
+         clone_voice_button.click(clone_voice, inputs=[input_audio_filename, transcription_text, output_voice], outputs=dummy)
+         button_apply_settings.click(apply_settings, inputs=[themes, input_server_name, input_server_port, share_checkbox, input_desired_len, input_max_len, input_silence_break, input_silence_speakers])
+         button_apply_restart.click(restart)
+     restart_server = False
+     try:
+         barkgui.queue().launch(show_error=True)
+     except Exception:
+         restart_server = True
+     run_server = False
+     try:
+         while not restart_server:
+             time.sleep(1.0)
+     except (KeyboardInterrupt, OSError):
+         print("Keyboard interruption in main thread... closing server.")
+         run_server = False
+     barkgui.close()
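
Two details of the wiring above are easy to miss: the TTS tab feeds output_audio into both the reference (inp2) and driving (inp3) slots of voice_conversion, so the Bark-generated clip supplies both the content and, via the reference slot, the speaker d-vector used for conversion, while the uploaded clip (inp1) supplies the target voice. A minimal sketch (illustrative, not part of the commit; file names are hypothetical) of driving the handler directly with the same (sample_rate, samples) tuples the gr.Audio components produce:

from scipy.io.wavfile import read

target = read("target_speaker_clip.wav")     # hypothetical: voice to convert into
generated = read("bark_generated_clip.wav")  # hypothetical: Bark TTS output

# Mirrors btn.click(voice_conversion, [inp1, inp2, inp3], [out1]):
# the generated clip fills both the reference and driving slots.
rate, converted = voice_conversion(target, generated, generated)
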
bark/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt
+ from .generation import SAMPLE_RATE, preload_models
bark/api.py ADDED
@@ -0,0 +1,158 @@
+ from typing import Dict, Optional, Union
+ 
+ import numpy as np
+ 
+ from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic
+ 
+ 
+ def generate_with_settings(text_prompt, semantic_temp=0.6, eos_p=0.2, coarse_temp=0.7, fine_temp=0.5, voice_name=None, output_full=False):
+ 
+     # generation with more control
+     x_semantic = generate_text_semantic(
+         text_prompt,
+         history_prompt=voice_name,
+         temp=semantic_temp,
+         min_eos_p=eos_p,
+         use_kv_caching=True
+     )
+ 
+     x_coarse_gen = generate_coarse(
+         x_semantic,
+         history_prompt=voice_name,
+         temp=coarse_temp,
+         use_kv_caching=True
+     )
+     x_fine_gen = generate_fine(
+         x_coarse_gen,
+         history_prompt=voice_name,
+         temp=fine_temp,
+     )
+ 
+     if output_full:
+         full_generation = {
+             'semantic_prompt': x_semantic,
+             'coarse_prompt': x_coarse_gen,
+             'fine_prompt': x_fine_gen
+         }
+         return full_generation, codec_decode(x_fine_gen)
+     return codec_decode(x_fine_gen)
+ 
+ 
+ def text_to_semantic(
+     text: str,
+     history_prompt: Optional[Union[Dict, str]] = None,
+     temp: float = 0.7,
+     silent: bool = False,
+ ):
+     """Generate semantic array from text.
+ 
+     Args:
+         text: text to be turned into audio
+         history_prompt: history choice for audio cloning
+         temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+         silent: disable progress bar
+ 
+     Returns:
+         numpy semantic array to be fed into `semantic_to_waveform`
+     """
+     x_semantic = generate_text_semantic(
+         text,
+         history_prompt=history_prompt,
+         temp=temp,
+         silent=silent,
+         use_kv_caching=True
+     )
+     return x_semantic
+ 
+ 
+ def semantic_to_waveform(
+     semantic_tokens: np.ndarray,
+     history_prompt: Optional[Union[Dict, str]] = None,
+     temp: float = 0.7,
+     silent: bool = False,
+     output_full: bool = False,
+ ):
+     """Generate audio array from semantic input.
+ 
+     Args:
+         semantic_tokens: semantic token output from `text_to_semantic`
+         history_prompt: history choice for audio cloning
+         temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+         silent: disable progress bar
+         output_full: return full generation to be used as a history prompt
+ 
+     Returns:
+         numpy audio array at a sample frequency of 24 kHz
+     """
+     coarse_tokens = generate_coarse(
+         semantic_tokens,
+         history_prompt=history_prompt,
+         temp=temp,
+         silent=silent,
+         use_kv_caching=True
+     )
+     fine_tokens = generate_fine(
+         coarse_tokens,
+         history_prompt=history_prompt,
+         temp=0.5,
+     )
+     audio_arr = codec_decode(fine_tokens)
+     if output_full:
+         full_generation = {
+             "semantic_prompt": semantic_tokens,
+             "coarse_prompt": coarse_tokens,
+             "fine_prompt": fine_tokens,
+         }
+         return full_generation, audio_arr
+     return audio_arr
+ 
+ 
+ def save_as_prompt(filepath, full_generation):
+     assert(filepath.endswith(".npz"))
+     assert(isinstance(full_generation, dict))
+     assert("semantic_prompt" in full_generation)
+     assert("coarse_prompt" in full_generation)
+     assert("fine_prompt" in full_generation)
+     np.savez(filepath, **full_generation)
+ 
+ 
+ def generate_audio(
+     text: str,
+     history_prompt: Optional[Union[Dict, str]] = None,
+     text_temp: float = 0.7,
+     waveform_temp: float = 0.7,
+     silent: bool = False,
+     output_full: bool = False,
+ ):
+     """Generate audio array from input text.
+ 
+     Args:
+         text: text to be turned into audio
+         history_prompt: history choice for audio cloning
+         text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+         waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+         silent: disable progress bar
+         output_full: return full generation to be used as a history prompt
+ 
+     Returns:
+         numpy audio array at a sample frequency of 24 kHz
+     """
+     semantic_tokens = text_to_semantic(
+         text,
+         history_prompt=history_prompt,
+         temp=text_temp,
+         silent=silent,
+     )
+     out = semantic_to_waveform(
+         semantic_tokens,
+         history_prompt=history_prompt,
+         temp=waveform_temp,
+         silent=silent,
+         output_full=output_full,
+     )
+     if output_full:
+         full_generation, audio_arr = out
+         return full_generation, audio_arr
+     else:
+         audio_arr = out
+         return audio_arr
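
Calling generate_audio() with output_full=True returns the semantic, coarse, and fine token arrays alongside the waveform, and save_as_prompt() persists them as an .npz file that later calls can pass as history_prompt to keep the same voice (this is exactly what app.py does for its "Save generation as Voice" option). A minimal sketch (illustrative, not part of the commit; output paths are hypothetical):

from bark import generate_audio, save_as_prompt, SAMPLE_RATE, preload_models
from scipy.io.wavfile import write as write_wav

preload_models()

# Keep the full token prompts alongside the decoded audio.
full_generation, audio = generate_audio("Hello there!", output_full=True)
write_wav("hello.wav", SAMPLE_RATE, audio)

save_as_prompt("hello_voice.npz", full_generation)

# Reuse the saved prompt so the next clip continues in the same voice.
audio2 = generate_audio("Nice to meet you.", history_prompt="hello_voice.npz")
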
bark/assets/prompts/announcer.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26f2d1a9e3b6fe453cf5fc8191de26cbfae6276c5b0f7c376c6a0f3c35867f83
+ size 16794
bark/assets/prompts/en_speaker_0.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:932f40d879ba8659f1ca26319ba64ea3b0647b2050fe24313bf42b0dff1fe241
+ size 28100
bark/assets/prompts/en_speaker_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e7f18015e1ab9b6302ded1e28a971af5306a72f193bb6c411f1948a083c8578
+ size 25220
bark/assets/prompts/en_speaker_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d218990680ece5f2d4fc18ea4783b016b3ae353ec413eaee2058f2d57263c9b3
+ size 26236
bark/assets/prompts/en_speaker_3.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92c2e2a29145c83738e9b63f082fd1c873d9422468a155463cb27f814aeaea66
+ size 34980
bark/assets/prompts/en_speaker_4.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992f91991a9a5359d72f00b09a11a550e71bb8ebfc0cfd877e39d7d41f98b714
+ size 23780
bark/assets/prompts/en_speaker_5.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18831c3f6014e4a2ff60ad5169b1fae06e28ed07f43f8a3616aafb84515091bf
+ size 24740
bark/assets/prompts/en_speaker_6.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fab38dc6b6bc9226bcc414f4c5a9524bc1b2441865a586153fb620127a8faa4e
+ size 25540
bark/assets/prompts/en_speaker_7.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f4c4eb33f5994be8de5cfd1744ebce13da1618a6da3a7d244514178c61ef7db
+ size 22716
bark/assets/prompts/en_speaker_8.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fc9f11b539588f51bbf78150a73e0365c49b2306bd72e5a22b28ef09c4fb15d
+ size 23300
bark/assets/prompts/en_speaker_9.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78b3ba32eb9aeb9ed34556856c40633ecc8332d1c3ae3c81e6f5015ac3eefbd5
+ size 30180
bark/assets/prompts/zh_speaker_0.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd7ac118a3e944b3f20c89f2446056a00850a630ee16318922acc6572ce80929
+ size 20636
bark/assets/prompts/zh_speaker_1.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0eacf5c862dfd3c5ac825f2ebb26f323e64309cb712e7e264cbd31c5bca3f038
+ size 19836
bark/assets/prompts/zh_speaker_2.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e324b47f8250e5798c314f395d4e049575e7ca369d0b6074e91c7bba70e9f26d
+ size 21060
bark/assets/prompts/zh_speaker_3.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98c476abc7bf634ffb2d71d363284e7bd8c8abd5e33ec5ca21d4aa5b15730d18
+ size 31300
bark/assets/prompts/zh_speaker_4.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fa8673a9895ad3302d13ac94193b5ad5da481f1cc276e6181fa895acaae133b
+ size 29964
bark/assets/prompts/zh_speaker_5.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:226edfe5fabc72eeb83a13e350599bc8babe5adc2264b3cdb661fd1258dc4044
+ size 17436
bark/assets/prompts/zh_speaker_6.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:285d51fbe81cc263636b5b487fbb6633e6f3cf92c53ca9ab8e6b7f55d4b4a31d
+ size 16900
bark/assets/prompts/zh_speaker_7.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0967cdb14ffa79895747b0d52df9f15bdad80d6c55b7630894345c9a7ec87c91
+ size 21060
bark/assets/prompts/zh_speaker_8.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c028f78530013f29ab8c0c1cf4fe2138106fbe5252951f5f36e0168056779549
+ size 19300
bark/assets/prompts/zh_speaker_9.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6265bb827008d7af8a45a8e057fe3e91efb347d56208180a9ed990ad54e4d75e
+ size 16156
bark/clonevoice.py ADDED
@@ -0,0 +1,41 @@
+ from bark.generation import load_codec_model, generate_text_semantic, grab_best_device
+ from encodec.utils import convert_audio
+ import torchaudio
+ import torch
+ import numpy as np
+ import os
+ import gradio
+ 
+ 
+ def clone_voice(audio_filepath, text, dest_filename, progress=gradio.Progress(track_tqdm=True)):
+     if len(text) < 1:
+         raise gradio.Error('No transcription text entered!')
+ 
+     use_gpu = not os.environ.get("BARK_FORCE_CPU", False)
+     progress(0, desc="Loading Codec")
+     model = load_codec_model(use_gpu=use_gpu)
+     progress(0.25, desc="Converting WAV")
+ 
+     # Load and pre-process the audio waveform
+     device = grab_best_device(use_gpu)
+     wav, sr = torchaudio.load(audio_filepath)
+     wav = convert_audio(wav, sr, model.sample_rate, model.channels)
+     wav = wav.unsqueeze(0).to(device)
+     progress(0.5, desc="Extracting codes")
+ 
+     # Extract discrete codes from EnCodec
+     with torch.no_grad():
+         encoded_frames = model.encode(wav)
+         codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]
+ 
+     # get seconds of audio
+     seconds = wav.shape[-1] / model.sample_rate
+     # generate semantic tokens
+     semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)
+ 
+     # move codes to cpu
+     codes = codes.cpu().numpy()
+ 
+     output_path = dest_filename + '.npz'
+     np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
+     return "Finished"
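
clone_voice() writes dest_filename + '.npz' containing the EnCodec fine and coarse codes of the sample plus semantic tokens generated from its transcription, so the result has the same structure as the speaker prompts above and can serve as a history_prompt. A minimal sketch (illustrative, not part of the commit; the sample file is hypothetical, and it assumes the Gradio progress callback tolerates being invoked outside a live UI request):

from bark.clonevoice import clone_voice
from bark import generate_audio, SAMPLE_RATE, preload_models
from scipy.io.wavfile import write as write_wav

preload_models()

# Clone from a short sample plus its exact transcription; writes my_voice.npz.
clone_voice("my_sample.wav", "This is what I said in the sample.", "my_voice")

# Synthesize new speech with the cloned voice as the history prompt.
audio = generate_audio("Something new in the cloned voice.", history_prompt="my_voice.npz")
write_wav("cloned.wav", SAMPLE_RATE, audio)
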
bark/generation.py ADDED
@@ -0,0 +1,865 @@