kevinwang676
committed on
Commit
·
8a88b9f
1
Parent(s):
d8676f6
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ sys.path.append("./bark-gui/")
|
|
7 |
from cProfile import label
|
8 |
from distutils.command.check import check
|
9 |
from doctest import Example
|
|
|
10 |
import gradio as gr
|
11 |
import numpy as np
|
12 |
import logging
|
@@ -31,7 +32,7 @@ from settings import Settings
|
|
31 |
|
32 |
from bark import SAMPLE_RATE
|
33 |
from bark.clonevoice import clone_voice
|
34 |
-
from bark.generation import SAMPLE_RATE, preload_models
|
35 |
from scipy.io.wavfile import write as write_wav
|
36 |
from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
|
37 |
from datetime import datetime
|
@@ -78,100 +79,117 @@ def speechbrain(aud):
|
|
78 |
torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)
|
79 |
return 'enhanced.wav'
|
80 |
|
81 |
-
def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, progress=gr.Progress(track_tqdm=True)):
|
82 |
-
if text == None or len(text) < 1:
|
83 |
-
raise gr.Error('No text entered!')
|
84 |
-
|
85 |
# Chunk the text into smaller pieces then combine the generated audio
|
86 |
|
87 |
# generation settings
|
88 |
if selected_speaker == 'None':
|
89 |
selected_speaker = None
|
90 |
-
if seed != None and seed > 2**32 - 1:
|
91 |
-
logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
|
92 |
-
seed = None
|
93 |
-
if seed == None or seed <= 0:
|
94 |
-
seed = np.random.default_rng().integers(1, 2**32 - 1)
|
95 |
-
assert(0 < seed and seed < 2**32)
|
96 |
|
97 |
voice_name = selected_speaker
|
98 |
-
use_last_generation_as_history = "Use last generation as history" in complete_settings
|
99 |
-
save_last_generation = "Save generation as Voice" in complete_settings
|
100 |
-
progress(0, desc="Generating")
|
101 |
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32) # half a second of silence
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
if quick_generation == True:
|
140 |
-
with pytorch_seed.SavedRNG(seed):
|
141 |
audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
else:
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
if use_last_generation_as_history:
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
|
|
|
|
|
|
|
|
|
|
175 |
return result
|
176 |
|
177 |
def create_filename(path, seed, name, extension):
|
@@ -260,7 +278,7 @@ gradio: {gr.__version__}
|
|
260 |
|
261 |
|
262 |
logger = logging.getLogger(__name__)
|
263 |
-
APPTITLE = "Bark UI Enhanced v0.4.
|
264 |
|
265 |
|
266 |
autolaunch = False
|
@@ -319,6 +337,7 @@ run_server = True
|
|
319 |
|
320 |
|
321 |
|
|
|
322 |
'''
|
323 |
from google.colab import drive
|
324 |
drive.mount('/content/drive')
|
@@ -461,8 +480,9 @@ while run_server:
|
|
461 |
placeholder = "想让Bark说些什么呢?"
|
462 |
input_text = gr.Textbox(label="用作声音合成的文本", lines=4, placeholder=placeholder)
|
463 |
with gr.Column():
|
|
|
464 |
seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
|
465 |
-
|
466 |
|
467 |
with gr.Row():
|
468 |
with gr.Column():
|
@@ -557,7 +577,7 @@ while run_server:
|
|
557 |
|
558 |
quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
|
559 |
convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
|
560 |
-
gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent],outputs=output_audio)
|
561 |
button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
|
562 |
# Javascript hack to display modal confirmation dialog
|
563 |
js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
|
|
|
7 |
from cProfile import label
|
8 |
from distutils.command.check import check
|
9 |
from doctest import Example
|
10 |
+
import dataclasses
|
11 |
import gradio as gr
|
12 |
import numpy as np
|
13 |
import logging
|
|
|
32 |
|
33 |
from bark import SAMPLE_RATE
|
34 |
from bark.clonevoice import clone_voice
|
35 |
+
from bark.generation import SAMPLE_RATE, preload_models, _load_history_prompt, codec_decode
|
36 |
from scipy.io.wavfile import write as write_wav
|
37 |
from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
|
38 |
from datetime import datetime
|
|
|
79 |
torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)
|
80 |
return 'enhanced.wav'
|
81 |
|
82 |
+
def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, batchcount, progress=gr.Progress(track_tqdm=True)):
    """Generate speech for ``text`` and return the path of the resulting WAV file.

    The input is processed ``batchcount`` times. On each pass it is split into
    clips (SSML input) or chunks (plain text), each piece is generated
    separately, and the pieces are concatenated into one "final" WAV file.

    If ``text`` is empty but a speaker is selected, no generation happens:
    the speaker's stored ``fine_prompt`` is decoded and written to an
    "extract" WAV file instead.

    Args:
        text: Plain text or SSML to speak.
        selected_speaker: Voice to use; the string ``'None'`` means no voice.
        text_temp: Forwarded to ``generate_with_settings`` as ``semantic_temp``.
        waveform_temp: Forwarded to ``generate_with_settings`` as ``coarse_temp``.
        eos_prob: Forwarded to ``generate_with_settings`` as ``eos_p``.
        quick_generation: When true, plain-text chunks are generated inside
            ``pytorch_seed.SavedRNG(currentseed)`` and the history/voice
            saving options are not applied.
        complete_settings: Collection of option labels; checked for
            "Use last generation as history" and "Save generation as Voice".
        seed: Requested seed. ``None``, values <= 0 and values above
            ``2**32 - 1`` are replaced by a random seed on every batch pass.
        batchcount: Number of passes; values below 1 are treated as 1.
        progress: Gradio progress tracker.

    Returns:
        Path of the WAV file written by the last batch pass, or of the
        extracted speaker audio when no text was given.

    Raises:
        gr.Error: If no text was entered and no speaker is selected.
    """
    # generation settings
    if selected_speaker == 'None':
        selected_speaker = None

    voice_name = selected_speaker

    if not text:
        if selected_speaker is None:
            raise gr.Error('No text entered!')

        # Extract audio data from speaker if no text and speaker selected
        voicedata = _load_history_prompt(voice_name)
        audio_arr = codec_decode(voicedata["fine_prompt"])
        result = create_filename(OUTPUTFOLDER, "None", "extract",".wav")
        save_wav(audio_arr, result)
        return result

    if batchcount < 1:
        batchcount = 1

    # Silence buffers: the settings values are divided by 1000, i.e. treated
    # as milliseconds. Both use float32 so they agree with each other.
    silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.float32)
    silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32)
    use_last_generation_as_history = "Use last generation as history" in complete_settings
    save_last_generation = "Save generation as Voice" in complete_settings

    # Keep the full input in its own name. The per-clip/per-chunk loops below
    # must not overwrite it, otherwise every batch pass after the first would
    # only see the last clip/chunk of the previous pass.
    source_text = text.lstrip()
    source_is_ssml = is_ssml(source_text)
    gen_kwargs = dict(semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)

    for _ in range(batchcount):
        currentseed = seed
        if seed is not None and seed > 2**32 - 1:
            logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
            currentseed = None
        if currentseed is None or currentseed <= 0:
            currentseed = np.random.default_rng().integers(1, 2**32 - 1)
        assert 0 < currentseed < 2**32

        progress(0, desc="Generating")

        full_generation = None

        all_parts = []
        complete_text = ""
        if source_is_ssml:
            list_speak = create_clips_from_ssml(source_text)
            prev_speaker = None
            for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
                selected_speaker = clip[0]
                # Add pause break between speakers
                if i > 0 and selected_speaker != prev_speaker:
                    all_parts += [silencelong.copy()]
                prev_speaker = selected_speaker
                clip_text = saxutils.unescape(clip[1])
                if selected_speaker == "None":
                    selected_speaker = None

                print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {currentseed}):`{clip_text}`")
                complete_text += clip_text
                with pytorch_seed.SavedRNG(currentseed):
                    audio_array = generate_with_settings(text_prompt=clip_text, voice_name=selected_speaker, **gen_kwargs)
                    currentseed = torch.random.initial_seed()
                # Individual clips are only written out when there are several
                if len(list_speak) > 1:
                    filename = create_filename(OUTPUTFOLDER, currentseed, "audioclip",".wav")
                    save_wav(audio_array, filename)
                    add_id3_tag(filename, clip_text, selected_speaker, currentseed)

                all_parts += [audio_array]
        else:
            texts = split_and_recombine_text(source_text, settings.input_text_desired_length, settings.input_text_max_length)
            keep_full_output = use_last_generation_as_history or save_last_generation
            for i, chunk in tqdm(enumerate(texts), total=len(texts)):
                print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {currentseed}):`{chunk}`")
                complete_text += chunk
                if quick_generation:
                    with pytorch_seed.SavedRNG(currentseed):
                        audio_array = generate_with_settings(text_prompt=chunk, voice_name=selected_speaker, **gen_kwargs)
                        currentseed = torch.random.initial_seed()
                elif keep_full_output:
                    full_generation, audio_array = generate_with_settings(text_prompt=chunk, voice_name=voice_name, output_full=True, **gen_kwargs)
                else:
                    audio_array = generate_with_settings(text_prompt=chunk, voice_name=voice_name, **gen_kwargs)

                # Individual chunks are only written out when there are several
                if len(texts) > 1:
                    filename = create_filename(OUTPUTFOLDER, currentseed, "audioclip",".wav")
                    save_wav(audio_array, filename)
                    add_id3_tag(filename, chunk, selected_speaker, currentseed)

                if not quick_generation and keep_full_output:
                    # save to npz
                    # NOTE(review): this passes the raw `seed` argument, not
                    # `currentseed` - confirm which one the filename should carry.
                    voice_name = create_filename(OUTPUTFOLDER, seed, "audioclip", ".npz")
                    save_as_prompt(voice_name, full_generation)
                    if use_last_generation_as_history:
                        selected_speaker = voice_name

                all_parts += [audio_array]
                # Add short pause between sentences. endswith() is used so an
                # empty chunk cannot raise IndexError.
                # NOTE(review): `i > 1` skips the pause after the first two
                # chunks - confirm whether `i > 0` was intended.
                if chunk.endswith(("!", "?", ".", "\n")) and i > 1:
                    all_parts += [silenceshort.copy()]

        # save & play audio
        result = create_filename(OUTPUTFOLDER, currentseed, "final",".wav")
        save_wav(np.concatenate(all_parts), result)
        # Tag the final file with the complete text (original note: truncated
        # to 60 chars as a precaution - presumably inside add_id3_tag).
        add_id3_tag(result, complete_text, selected_speaker, currentseed)

    return result
|
194 |
|
195 |
def create_filename(path, seed, name, extension):
|
|
|
278 |
|
279 |
|
280 |
logger = logging.getLogger(__name__)
|
281 |
+
APPTITLE = "Bark UI Enhanced v0.4.8"
|
282 |
|
283 |
|
284 |
autolaunch = False
|
|
|
337 |
|
338 |
|
339 |
|
340 |
+
|
341 |
'''
|
342 |
from google.colab import drive
|
343 |
drive.mount('/content/drive')
|
|
|
480 |
placeholder = "想让Bark说些什么呢?"
|
481 |
input_text = gr.Textbox(label="用作声音合成的文本", lines=4, placeholder=placeholder)
|
482 |
with gr.Column():
|
483 |
+
convert_to_ssml_button = gr.Button("Convert Input Text to SSML")
|
484 |
seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
|
485 |
+
batchcount = gr.Number(label="Batch count", precision=0, value=1)
|
486 |
|
487 |
with gr.Row():
|
488 |
with gr.Column():
|
|
|
577 |
|
578 |
quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
|
579 |
convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
|
580 |
+
gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent, batchcount],outputs=output_audio)
|
581 |
button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
|
582 |
# Javascript hack to display modal confirmation dialog
|
583 |
js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
|