kevinwang676 committed
Commit 132064f
1 Parent(s): 869a2d9

Upload 7 files

Files changed (7)
  1. Dockerfile.txt +38 -0
  2. config.yaml +8 -0
  3. gitignore (1).txt +14 -0
  4. pyproject.toml +60 -0
  5. setup.py +3 -0
  6. swap_voice.py +62 -0
  7. webui.py +468 -0
Dockerfile.txt ADDED
@@ -0,0 +1,38 @@
+ FROM debian:stable
+
+ # Install system packages
+ RUN apt update && apt install -y git pip
+
+ # Create non-root user
+ RUN useradd -m -d /bark bark
+
+ # Run as new user
+ USER bark
+ WORKDIR /bark
+
+ # Clone git repo
+ RUN git clone https://github.com/C0untFloyd/bark-gui
+
+ # Switch to git directory
+ WORKDIR /bark/bark-gui
+
+ # Append pip bin path to PATH
+ ENV PATH=$PATH:/bark/.local/bin
+
+ # Install dependencies
+ RUN pip install .
+ RUN pip install -r requirements.txt
+
+ # Listen on all addresses, since we are in a container.
+ RUN sed -i "s/server_name: ''/server_name: 0.0.0.0/g" ./config.yaml
+
+ # Suggested volumes
+ VOLUME /bark/bark-gui/assets/prompts/custom
+ VOLUME /bark/bark-gui/models
+ VOLUME /bark/.cache/huggingface/hub
+
+ # Default port for web-ui
+ EXPOSE 7860/tcp
+
+ # Start script
+ CMD python3 webui.py
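
For reference, the same build-and-run flow can be scripted with the Docker SDK for Python (`pip install docker`). This is a minimal sketch, assuming Dockerfile.txt has been saved as `Dockerfile` in the current directory; the `bark-gui` tag and the `/srv/bark/...` host paths are illustrative placeholders:

import docker

# Build the image defined above, then run it with the suggested
# volumes mounted and the default web UI port published.
client = docker.from_env()
image, build_logs = client.images.build(path=".", tag="bark-gui")
container = client.containers.run(
    "bark-gui",
    detach=True,
    ports={"7860/tcp": 7860},  # matches the EXPOSE line
    volumes={  # host paths are placeholders, adjust to your machine
        "/srv/bark/custom-prompts": {"bind": "/bark/bark-gui/assets/prompts/custom", "mode": "rw"},
        "/srv/bark/models": {"bind": "/bark/bark-gui/models", "mode": "rw"},
        "/srv/bark/hf-cache": {"bind": "/bark/.cache/huggingface/hub", "mode": "rw"},
    },
)
print(container.short_id)
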
config.yaml ADDED
@@ -0,0 +1,8 @@
+ input_text_desired_length: 110
+ input_text_max_length: 170
+ selected_theme: freddyaboulton/dracula_revamped
+ server_name: ''
+ server_port: 0
+ server_share: false
+ silence_between_sentences: 250
+ silence_between_speakers: 500
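
These keys are read by swap_voice.py and webui.py through a `Settings` helper from util/settings.py, which is not among the uploaded files. A minimal sketch of the shape that helper plausibly has, assuming PyYAML and attribute names inferred from their usage in webui.py (`output_folder_path` is used by both scripts but absent from this config, so a default is assumed here):

import yaml

# Hypothetical reconstruction of util/settings.py (not part of this upload).
class Settings:
    def __init__(self, config_file):
        self.config_file = config_file
        with open(config_file, "r") as f:
            data = yaml.safe_load(f) or {}
        self.input_text_desired_length = data.get("input_text_desired_length", 110)
        self.input_text_max_length = data.get("input_text_max_length", 170)
        self.selected_theme = data.get("selected_theme", "Default")
        self.server_name = data.get("server_name", "")
        self.server_port = data.get("server_port", 0)
        self.server_share = data.get("server_share", False)
        self.silence_sentence = data.get("silence_between_sentences", 250)
        self.silence_speakers = data.get("silence_between_speakers", 500)
        self.output_folder_path = data.get("output_folder_path", "outputs")  # assumed default

    def save(self):
        # Persist the mutable settings back to the yaml file
        data = {
            "input_text_desired_length": self.input_text_desired_length,
            "input_text_max_length": self.input_text_max_length,
            "selected_theme": self.selected_theme,
            "server_name": self.server_name,
            "server_port": self.server_port,
            "server_share": self.server_share,
            "silence_between_sentences": self.silence_sentence,
            "silence_between_speakers": self.silence_speakers,
        }
        with open(self.config_file, "w") as f:
            yaml.dump(data, f)
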
gitignore (1).txt ADDED
@@ -0,0 +1,14 @@
+ __pycache__/
+ /outputs
+ /speakers
+ .vs
+ *.npz
+ *.wav
+ *.npy
+ .vs/
+ /models
+ /bark_ui_enhanced.egg-info
+ /build/lib/bark
+ *.pth
+ *.pt
+ *.zip
pyproject.toml ADDED
@@ -0,0 +1,60 @@
+ [build-system]
+ requires = ["setuptools"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "bark-ui-enhanced"
+ version = "0.7.0"
+ description = "Bark text to audio model with additional features and a Web UI"
+ readme = "README.md"
+ requires-python = ">=3.8"
+ authors = [
+     {name = "Suno Inc (original Bark)", email = "hello@suno.ai"},
+     {name = "Count Floyd"},
+ ]
+ # MIT License
+ license = {file = "LICENSE"}
+
+ dependencies = [
+     "boto3",
+     "encodec",
+     "funcy",
+     "huggingface-hub>=0.14.1",
+     "numpy",
+     "scipy",
+     "tokenizers",
+     "torch",
+     "tqdm",
+     "transformers",
+ ]
+
+ [project.urls]
+ source = "https://github.com/C0untFloyd/bark-gui"
+
+ [project.optional-dependencies]
+ dev = [
+     "bandit",
+     "black",
+     "codecov",
+     "flake8",
+     "hypothesis>=6.14,<7",
+     "isort>=5.0.0,<6",
+     "jupyter",
+     "mypy",
+     "nbconvert",
+     "nbformat",
+     "pydocstyle",
+     "pylint",
+     "pytest",
+     "pytest-cov",
+ ]
+
+ [tool.setuptools]
+ packages = ["bark"]
+
+ [tool.setuptools.package-data]
+ bark = ["assets/prompts/*.npz", "assets/prompts/v2/*.npz"]
+
+
+ [tool.black]
+ line-length = 100
setup.py ADDED
@@ -0,0 +1,3 @@
+ from setuptools import setup
+
+ setup()
swap_voice.py ADDED
@@ -0,0 +1,62 @@
+ from bark.generation import load_codec_model, generate_text_semantic, grab_best_device
+ from bark import SAMPLE_RATE
+ from encodec.utils import convert_audio
+ from bark.hubert.hubert_manager import HuBERTManager
+ from bark.hubert.pre_kmeans_hubert import CustomHubert
+ from bark.hubert.customtokenizer import CustomTokenizer
+ from bark.api import semantic_to_waveform
+ from scipy.io.wavfile import write as write_wav
+ from util.helper import create_filename
+ from util.settings import Settings
+
+
+ import torchaudio
+ import torch
+ import os
+ import gradio
+
+ def swap_voice_from_audio(swap_audio_filename, selected_speaker, tokenizer_lang, seed, batchcount, progress=gradio.Progress(track_tqdm=True)):
+     use_gpu = not os.environ.get("BARK_FORCE_CPU", False)
+     progress(0, desc="Loading Codec")
+
+     # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
+     hubert_manager = HuBERTManager()
+     hubert_manager.make_sure_hubert_installed()
+     hubert_manager.make_sure_tokenizer_installed(tokenizer_lang=tokenizer_lang)
+
+     # Load the HuBERT model for semantic tokens
+     device = grab_best_device(use_gpu)
+     hubert_model = CustomHubert(checkpoint_path='./models/hubert/hubert.pt').to(device)
+     model = load_codec_model(use_gpu=use_gpu)
+
+     # Load the CustomTokenizer model
+     tokenizer = CustomTokenizer.load_from_checkpoint(f'./models/hubert/{tokenizer_lang}_tokenizer.pth').to(device)  # Automatically uses the right layers
+
+     progress(0.25, desc="Converting WAV")
+
+     # Load and pre-process the audio waveform
+     wav, sr = torchaudio.load(swap_audio_filename)
+     if wav.shape[0] == 2:  # Stereo to mono if needed
+         wav = wav.mean(0, keepdim=True)
+
+     wav = convert_audio(wav, sr, model.sample_rate, model.channels)
+     wav = wav.to(device)
+     semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
+     semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+     audio = semantic_to_waveform(
+         semantic_tokens,
+         history_prompt=selected_speaker,
+         temp=0.7,
+         silent=False,
+         output_full=False)
+
+     settings = Settings('config.yaml')
+
+     result = create_filename(settings.output_folder_path, None, "swapvoice", ".wav")
+     write_wav(result, SAMPLE_RATE, audio)
+     return result
+
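
The function can in principle be invoked outside the web UI as well. A sketch with placeholder inputs; the wav path and speaker name are illustrative, the HuBERT checkpoints are fetched on first use by HuBERTManager, and the default `gradio.Progress` callback may behave differently outside a running Gradio event:

from swap_voice import swap_voice_from_audio

# Placeholder input file and speaker prompt; note that seed and batchcount
# are accepted by the signature but not used inside the function body.
out_path = swap_voice_from_audio(
    swap_audio_filename="input.wav",
    selected_speaker="v2/en_speaker_9",  # any .npz prompt under bark/assets/prompts
    tokenizer_lang="en",
    seed=-1,
    batchcount=1,
)
print(f"Swapped audio written to {out_path}")
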
webui.py ADDED
@@ -0,0 +1,468 @@
+ import gradio as gr
+ import os
+ import sys
+ import numpy as np
+ import logging
+ import torch
+ import pytorch_seed
+ import time
+
+ from xml.sax import saxutils
+ from bark.api import generate_with_settings
+ from bark.api import save_as_prompt
+ from util.settings import Settings
+ #import nltk
+
+ from bark import SAMPLE_RATE
+ from cloning.clonevoice import clone_voice
+ from bark.generation import SAMPLE_RATE, preload_models, _load_history_prompt, codec_decode
+ from scipy.io.wavfile import write as write_wav
+ from util.parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
+ from datetime import datetime
+ from tqdm.auto import tqdm
+ from util.helper import create_filename, add_id3_tag
+ from swap_voice import swap_voice_from_audio
+ from training.training_prepare import prepare_semantics_from_text, prepare_wavs_from_semantics
+ from training.train import training_prepare_files, train
+
+ settings = Settings('config.yaml')
+
+
+ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, batchcount, progress=gr.Progress(track_tqdm=True)):
+     # Chunk the text into smaller pieces, then combine the generated audio
+
+     # generation settings
+     if selected_speaker == 'None':
+         selected_speaker = None
+
+     voice_name = selected_speaker
+
+     if text is None or len(text) < 1:
+         if selected_speaker is None:
+             raise gr.Error('No text entered!')
+
+         # Extract audio data from speaker if no text entered and a speaker is selected
+         voicedata = _load_history_prompt(voice_name)
+         audio_arr = codec_decode(voicedata["fine_prompt"])
+         result = create_filename(settings.output_folder_path, "None", "extract", ".wav")
+         save_wav(audio_arr, result)
+         return result
+
+     if batchcount < 1:
+         batchcount = 1
+
+     silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.int16)  # configurable pause between sentences (default 250 ms)
+     silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32)  # configurable pause between speakers (default 500 ms)
+     use_last_generation_as_history = "Use last generation as history" in complete_settings
+     save_last_generation = "Save generation as Voice" in complete_settings
+     for _ in range(batchcount):
+         currentseed = seed
+         if seed is not None and seed > 2**32 - 1:
+             logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
+             currentseed = None
+         if currentseed is None or currentseed <= 0:
+             currentseed = np.random.default_rng().integers(1, 2**32 - 1)
+         assert 0 < currentseed < 2**32
+
+         progress(0, desc="Generating")
+
+         full_generation = None
+
+         all_parts = []
+         complete_text = ""
+         text = text.lstrip()
+         if is_ssml(text):
+             list_speak = create_clips_from_ssml(text)
+             prev_speaker = None
+             for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
+                 selected_speaker = clip[0]
+                 # Add pause break between speakers
+                 if i > 0 and selected_speaker != prev_speaker:
+                     all_parts += [silencelong.copy()]
+                 prev_speaker = selected_speaker
+                 text = clip[1]
+                 text = saxutils.unescape(text)
+                 if selected_speaker == "None":
+                     selected_speaker = None
+
+                 print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {currentseed}):`{text}`")
+                 complete_text += text
+                 with pytorch_seed.SavedRNG(currentseed):
+                     audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                     currentseed = torch.random.initial_seed()
+                 if len(list_speak) > 1:
+                     filename = create_filename(settings.output_folder_path, currentseed, "audioclip", ".wav")
+                     save_wav(audio_array, filename)
+                     add_id3_tag(filename, text, selected_speaker, currentseed)
+
+                 all_parts += [audio_array]
+         else:
+             texts = split_and_recombine_text(text, settings.input_text_desired_length, settings.input_text_max_length)
+             for i, text in tqdm(enumerate(texts), total=len(texts)):
+                 print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {currentseed}):`{text}`")
+                 complete_text += text
+                 if quick_generation:
+                     with pytorch_seed.SavedRNG(currentseed):
+                         audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+                         currentseed = torch.random.initial_seed()
+                 else:
+                     full_output = use_last_generation_as_history or save_last_generation
+                     if full_output:
+                         full_generation, audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob, output_full=True)
+                     else:
+                         audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
+
+                 # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
+                 # audio_array = (audio_array * 32767).astype(np.int16)
+
+                 if len(texts) > 1:
+                     filename = create_filename(settings.output_folder_path, currentseed, "audioclip", ".wav")
+                     save_wav(audio_array, filename)
+                     add_id3_tag(filename, text, selected_speaker, currentseed)
+
+                 if not quick_generation and (save_last_generation or use_last_generation_as_history):
+                     # save to npz
+                     voice_name = create_filename(settings.output_folder_path, seed, "audioclip", ".npz")
+                     save_as_prompt(voice_name, full_generation)
+                     if use_last_generation_as_history:
+                         selected_speaker = voice_name
+
+                 all_parts += [audio_array]
+                 # Add short pause between sentences
+                 if text[-1] in "!?.\n" and i > 1:
+                     all_parts += [silenceshort.copy()]
+
+         # save & play audio
+         result = create_filename(settings.output_folder_path, currentseed, "final", ".wav")
+         save_wav(np.concatenate(all_parts), result)
+         # write id3 tag with text truncated to 60 chars, as a precaution...
+         add_id3_tag(result, complete_text, selected_speaker, currentseed)
+
+     return result
+
+
+ def save_wav(audio_array, filename):
+     write_wav(filename, SAMPLE_RATE, audio_array)
+
+ def save_voice(filename, semantic_prompt, coarse_prompt, fine_prompt):
+     np.savez_compressed(
+         filename,
+         semantic_prompt=semantic_prompt,
+         coarse_prompt=coarse_prompt,
+         fine_prompt=fine_prompt
+     )
+
+
+ def on_quick_gen_changed(checkbox):
+     if not checkbox:
+         return gr.CheckboxGroup.update(visible=True)
+     return gr.CheckboxGroup.update(visible=False)
+
+ def delete_output_files(checkbox_state):
+     if checkbox_state:
+         outputs_folder = os.path.join(os.getcwd(), settings.output_folder_path)
+         if os.path.exists(outputs_folder):
+             purgedir(outputs_folder)
+     return False
+
+
+ # https://stackoverflow.com/a/54494779
+ def purgedir(parent):
+     for root, dirs, files in os.walk(parent):
+         for item in files:
+             # Delete subordinate files
+             filespec = os.path.join(root, item)
+             os.unlink(filespec)
+         for item in dirs:
+             # Recursively perform this operation for subordinate directories
+             purgedir(os.path.join(root, item))
+
+ def convert_text_to_ssml(text, selected_speaker):
+     return build_ssml(text, selected_speaker)
+
+
+ def training_prepare(selected_step, num_text_generations, progress=gr.Progress(track_tqdm=True)):
+     if selected_step == prepare_training_list[0]:
+         prepare_semantics_from_text()
+     else:
+         prepare_wavs_from_semantics()
+     return None
+
+
+ def start_training(save_model_epoch, max_epochs, progress=gr.Progress(track_tqdm=True)):
+     training_prepare_files("./training/data/", "./training/data/checkpoint/hubert_base_ls960.pt")
+     train("./training/data/", save_model_epoch, max_epochs)
+     return None
+
+
+ def apply_settings(themes, input_server_name, input_server_port, input_server_public, input_desired_len, input_max_len, input_silence_break, input_silence_speaker):
+     settings.selected_theme = themes
+     settings.server_name = input_server_name
+     settings.server_port = input_server_port
+     settings.server_share = input_server_public
+     settings.input_text_desired_length = input_desired_len
+     settings.input_text_max_length = input_max_len
+     settings.silence_sentence = input_silence_break
+     settings.silence_speakers = input_silence_speaker
+     settings.save()
+
+ def restart():
+     global restart_server
+     restart_server = True
+
+
+ def create_version_html():
+     python_version = ".".join([str(x) for x in sys.version_info[0:3]])
+     versions_html = f"""
+ python: <span title="{sys.version}">{python_version}</span>
+  • 
+ torch: {getattr(torch, '__long_version__', torch.__version__)}
+  • 
+ gradio: {gr.__version__}
+ """
+     return versions_html
+
+
+ logger = logging.getLogger(__name__)
+ APPTITLE = "Bark UI Enhanced v0.7"
+
+
+ autolaunch = False
+
+ if len(sys.argv) > 1:
+     autolaunch = "-autolaunch" in sys.argv
+
+
+ if not torch.cuda.is_available():
+     os.environ['BARK_FORCE_CPU'] = 'True'
+     logger.warning("No CUDA detected, fallback to CPU!")
+
+ print(f'smallmodels={os.environ.get("SUNO_USE_SMALL_MODELS", False)}')
+ print(f'enablemps={os.environ.get("SUNO_ENABLE_MPS", False)}')
+ print(f'offloadcpu={os.environ.get("SUNO_OFFLOAD_CPU", False)}')
+ print(f'forcecpu={os.environ.get("BARK_FORCE_CPU", False)}')
+ print(f'autolaunch={autolaunch}\n\n')
+
+ #print("Updating nltk\n")
+ #nltk.download('punkt')
+
+ print("Preloading Models\n")
+ preload_models()
+
+ available_themes = ["Default", "gradio/glass", "gradio/monochrome", "gradio/seafoam", "gradio/soft", "gstaff/xkcd", "freddyaboulton/dracula_revamped", "ysharma/steampunk"]
+ tokenizer_language_list = ["de", "en", "pl"]
+ prepare_training_list = ["Step 1: Semantics from Text", "Step 2: WAV from Semantics"]
+
+ seed = -1
+ server_name = settings.server_name
+ if len(server_name) < 1:
+     server_name = None
+ server_port = settings.server_port
+ if server_port <= 0:
+     server_port = None
+ global run_server
+ global restart_server
+
+ run_server = True
+
+ while run_server:
+     # Collect all existing speakers/voices in dir
+     speakers_list = []
+
+     for root, dirs, files in os.walk("./bark/assets/prompts"):
+         for file in files:
+             if file.endswith(".npz"):
+                 pathpart = root.replace("./bark/assets/prompts", "")
+                 name = os.path.join(pathpart, file[:-4])
+                 if name.startswith("/") or name.startswith("\\"):
+                     name = name[1:]
+                 speakers_list.append(name)
+
+     speakers_list = sorted(speakers_list, key=lambda x: x.lower())
+     speakers_list.insert(0, 'None')
+
+     print(f'Launching {APPTITLE} Server')
+
+     # Create Gradio Blocks
+
+     with gr.Blocks(title=f"{APPTITLE}", mode=f"{APPTITLE}", theme=settings.selected_theme) as barkgui:
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown(f"### [{APPTITLE}](https://github.com/C0untFloyd/bark-gui)")
+             with gr.Column():
+                 gr.HTML(create_version_html(), elem_id="versions")
+
+         with gr.Tab("TTS"):
+             with gr.Row():
+                 with gr.Column():
+                     placeholder = "Enter text here."
+                     input_text = gr.Textbox(label="Input Text", lines=4, placeholder=placeholder)
+                 with gr.Column():
+                     seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
+                     batchcount = gr.Number(label="Batch count", precision=0, value=1)
+             with gr.Row():
+                 with gr.Column():
+                     examples = [
+                         "Special meanings: [laughter] [laughs] [sighs] [music] [gasps] [clears throat] MAN: WOMAN:",
+                         "♪ Never gonna make you cry, never gonna say goodbye, never gonna tell a lie and hurt you ♪",
+                         "And now — a picture of a larch [laughter]",
+                         """
+ WOMAN: I would like an oatmilk latte please.
+ MAN: Wow, that's expensive!
+ """,
+                         """<?xml version="1.0"?>
+ <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
+                            http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
+        xml:lang="en-US">
+ <voice name="/v2/en_speaker_9">Look at that drunk guy!</voice>
+ <voice name="/v2/en_speaker_3">Who is he?</voice>
+ <voice name="/v2/en_speaker_9">WOMAN: [clears throat] 10 years ago, he proposed me and I rejected him.</voice>
+ <voice name="/v2/en_speaker_3">Oh my God [laughs] he is still celebrating</voice>
+ </speak>"""
+                     ]
+                     examples = gr.Examples(examples=examples, inputs=input_text)
+                 with gr.Column():
+                     convert_to_ssml_button = gr.Button("Convert Input Text to SSML")
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
+                     speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
+                 with gr.Column():
+                     text_temp = gr.Slider(0.1, 1.0, value=0.6, label="Generation Temperature", info="1.0 more diverse, 0.1 more conservative")
+                     waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")
+
+             with gr.Row():
+                 with gr.Column():
+                     quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True)
+                     settings_checkboxes = ["Use last generation as history", "Save generation as Voice"]
+                     complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False)
+                 with gr.Column():
+                     eos_prob = gr.Slider(0.0, 0.5, value=0.05, label="End of sentence probability")
+
+             with gr.Row():
+                 with gr.Column():
+                     tts_create_button = gr.Button("Generate")
+                 with gr.Column():
+                     hidden_checkbox = gr.Checkbox(visible=False)
+                     button_stop_generation = gr.Button("Stop generation")
+             with gr.Row():
+                 output_audio = gr.Audio(label="Generated Audio", type="filepath")
+
+         with gr.Tab("Swap Voice"):
+             with gr.Row():
+                 swap_audio_filename = gr.Audio(label="Input audio.wav to swap voice", source="upload", type="filepath")
+             with gr.Row():
+                 with gr.Column():
+                     swap_tokenizer_lang = gr.Dropdown(tokenizer_language_list, label="Base Language Tokenizer", value=tokenizer_language_list[1])
+                     swap_seed = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
+                 with gr.Column():
+                     speaker_swap = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
+                     swap_batchcount = gr.Number(label="Batch count", precision=0, value=1)
+             with gr.Row():
+                 swap_voice_button = gr.Button("Swap Voice")
+             with gr.Row():
+                 output_swap = gr.Audio(label="Generated Audio", type="filepath")
+
+         with gr.Tab("Clone Voice"):
+             with gr.Row():
+                 input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath")
+                 #transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...")
+             with gr.Row():
+                 with gr.Column():
+                     initialname = "./bark/assets/prompts/custom/MeMyselfAndI"
+                     output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname)
+                 with gr.Column():
+                     tokenizerlang = gr.Dropdown(tokenizer_language_list, label="Base Language Tokenizer", value=tokenizer_language_list[1])
+             with gr.Row():
+                 clone_voice_button = gr.Button("Create Voice")
+             with gr.Row():
+                 dummy = gr.Text(label="Progress")
+
+         with gr.Tab("Training Data Prepare"):
+             gr.Markdown("This tab should be used to generate the training dataset. For Step 1, put some books into the inputtext folder in UTF-8 text format.")
+             prepare_semantics_number = gr.Number(label="Number of semantics to create", precision=0, value=3079)
+             prepare_dropdown = gr.Dropdown(prepare_training_list, value=prepare_training_list[0], label="Prepare")
+             training_prepare_button = gr.Button("Generate")
+             dummytrd = gr.Text(label="Progress")
+
+         with gr.Tab("Training"):
+             with gr.Row():
+                 gr.Markdown("This tab is used to train the actual model (language).")
+             with gr.Row():
+                 with gr.Column():
+                     save_model_epoch = gr.Number(label="Auto-save model after number of epochs", precision=0, value=1)
+                 with gr.Column():
+                     max_epochs = gr.Number(label="Train for number of epochs", precision=0, value=6)
+             with gr.Row():
+                 with gr.Column():
+                     allowed_chars = ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~'
+                     allowedcharsfilter = gr.Textbox(label="Allowed chars for text input", lines=1, value=allowed_chars)
+                 with gr.Column():
+                     train_button = gr.Button("Start Training")
+             with gr.Row():
+                 dummytrain = gr.Text(label="Progress")
+
+
+         with gr.Tab("Settings"):
+             with gr.Row():
+                 themes = gr.Dropdown(available_themes, label="Theme", info="Change needs complete restart", value=settings.selected_theme)
+             with gr.Row():
+                 input_server_name = gr.Textbox(label="Server Name", lines=1, info="Leave blank to run locally", value=settings.server_name)
+                 input_server_port = gr.Number(label="Server Port", precision=0, info="Leave at 0 to use default", value=settings.server_port)
+                 share_checkbox = gr.Checkbox(label="Public Server", value=settings.server_share)
+             with gr.Row():
+                 input_desired_len = gr.Slider(100, 150, value=settings.input_text_desired_length, label="Desired Input Text Length", info="Ideal length to split input sentences")
+                 input_max_len = gr.Slider(150, 256, value=settings.input_text_max_length, label="Max Input Text Length", info="Maximum Input Text Length")
+             with gr.Row():
+                 input_silence_break = gr.Slider(1, 1000, value=settings.silence_sentence, label="Sentence Pause Time (ms)", info="Silence between sentences in milliseconds")
+                 input_silence_speakers = gr.Slider(1, 5000, value=settings.silence_speakers, label="Speaker Pause Time (ms)", info="Silence between different speakers in milliseconds")
+
+             with gr.Row():
+                 button_apply_settings = gr.Button("Apply Settings")
+                 button_apply_restart = gr.Button("Restart Server")
+                 button_delete_files = gr.Button("Clear output folder")
+
+         quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
+         convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker], outputs=input_text)
+         gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent, batchcount], outputs=output_audio)
+         button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
+         # Javascript hack to display modal confirmation dialog
+         js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
+         button_delete_files.click(None, None, hidden_checkbox, _js=js)
+         hidden_checkbox.change(delete_output_files, [hidden_checkbox], [hidden_checkbox])
+
+         swap_voice_button.click(swap_voice_from_audio, inputs=[swap_audio_filename, speaker_swap, swap_tokenizer_lang, swap_seed, swap_batchcount], outputs=output_swap)
+         clone_voice_button.click(clone_voice, inputs=[input_audio_filename, output_voice], outputs=dummy)
+         training_prepare_button.click(training_prepare, inputs=[prepare_dropdown, prepare_semantics_number], outputs=dummytrd)
+         train_button.click(start_training, inputs=[save_model_epoch, max_epochs], outputs=dummytrain)
+         button_apply_settings.click(apply_settings, inputs=[themes, input_server_name, input_server_port, share_checkbox, input_desired_len, input_max_len, input_silence_break, input_silence_speakers])
+         button_apply_restart.click(restart)
+
+     restart_server = False
+     try:
+         barkgui.queue().launch(inbrowser=autolaunch, server_name=server_name, server_port=server_port, share=settings.server_share, prevent_thread_lock=True)
+     except Exception:
+         restart_server = True
+         run_server = False
+     try:
+         while not restart_server:
+             time.sleep(1.0)
+     except (KeyboardInterrupt, OSError):
+         print("Keyboard interruption in main thread... closing server.")
+         run_server = False
+     barkgui.close()
+
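
Both swap_voice.py and webui.py also depend on `create_filename` and `add_id3_tag` from util/helper.py, which is likewise not part of this upload. Judging from the call sites (output folder, seed, name prefix, extension), `create_filename` plausibly looks something like the sketch below; the exact naming scheme is an assumption:

import os
from datetime import datetime

# Hypothetical reconstruction of util/helper.py's create_filename,
# inferred from calls like create_filename(path, currentseed, "final", ".wav").
def create_filename(path, seed, name, extension):
    os.makedirs(path, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    seedpart = "random" if seed is None else str(seed)
    return os.path.join(path, f"{name}_{timestamp}_{seedpart}{extension}")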