kevinwang676 committed on
Commit
8a88b9f
·
1 Parent(s): d8676f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -87
app.py CHANGED
@@ -7,6 +7,7 @@ sys.path.append("./bark-gui/")
7
  from cProfile import label
8
  from distutils.command.check import check
9
  from doctest import Example
 
10
  import gradio as gr
11
  import numpy as np
12
  import logging
@@ -31,7 +32,7 @@ from settings import Settings
31
 
32
  from bark import SAMPLE_RATE
33
  from bark.clonevoice import clone_voice
34
- from bark.generation import SAMPLE_RATE, preload_models
35
  from scipy.io.wavfile import write as write_wav
36
  from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
37
  from datetime import datetime
@@ -78,100 +79,117 @@ def speechbrain(aud):
78
  torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)
79
  return 'enhanced.wav'
80
 
81
- def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, progress=gr.Progress(track_tqdm=True)):
82
- if text == None or len(text) < 1:
83
- raise gr.Error('No text entered!')
84
-
85
  # Chunk the text into smaller pieces then combine the generated audio
86
 
87
  # generation settings
88
  if selected_speaker == 'None':
89
  selected_speaker = None
90
- if seed != None and seed > 2**32 - 1:
91
- logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
92
- seed = None
93
- if seed == None or seed <= 0:
94
- seed = np.random.default_rng().integers(1, 2**32 - 1)
95
- assert(0 < seed and seed < 2**32)
96
 
97
  voice_name = selected_speaker
98
- use_last_generation_as_history = "Use last generation as history" in complete_settings
99
- save_last_generation = "Save generation as Voice" in complete_settings
100
- progress(0, desc="Generating")
101
 
102
- silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.float32) # quarter second of silence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32) # half a second of silence
104
- full_generation = None
105
-
106
- all_parts = []
107
- complete_text = ""
108
- text = text.lstrip()
109
- if is_ssml(text):
110
- list_speak = create_clips_from_ssml(text)
111
- prev_speaker = None
112
- for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
113
- selected_speaker = clip[0]
114
- # Add pause break between speakers
115
- if i > 0 and selected_speaker != prev_speaker:
116
- all_parts += [silencelong.copy()]
117
- prev_speaker = selected_speaker
118
- text = clip[1]
119
- text = saxutils.unescape(text)
120
- if selected_speaker == "None":
121
- selected_speaker = None
122
-
123
- print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {seed}):`{text}`")
124
- complete_text += text
125
- with pytorch_seed.SavedRNG(seed):
126
- audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
127
- seed = torch.random.initial_seed()
128
- if len(list_speak) > 1:
129
- filename = create_filename(OUTPUTFOLDER, seed, "audioclip",".wav")
130
- save_wav(audio_array, filename)
131
- add_id3_tag(filename, text, selected_speaker, seed)
132
-
133
- all_parts += [audio_array]
134
- else:
135
- texts = split_and_recombine_text(text, settings.input_text_desired_length, settings.input_text_max_length)
136
- for i, text in tqdm(enumerate(texts), total=len(texts)):
137
- print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {seed}):`{text}`")
138
- complete_text += text
139
- if quick_generation == True:
140
- with pytorch_seed.SavedRNG(seed):
141
  audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
142
- seed = torch.random.initial_seed()
143
- else:
144
- full_output = use_last_generation_as_history or save_last_generation
145
- if full_output:
146
- full_generation, audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob, output_full=True)
 
 
 
 
 
 
 
 
 
 
 
147
  else:
148
- audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
149
-
150
- # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
151
- # audio_array = (audio_array * 32767).astype(np.int16)
152
-
153
- if len(texts) > 1:
154
- filename = create_filename(OUTPUTFOLDER, seed, "audioclip",".wav")
155
- save_wav(audio_array, filename)
156
- add_id3_tag(filename, text, selected_speaker, seed)
157
-
158
- if quick_generation == False and (save_last_generation == True or use_last_generation_as_history == True):
159
- # save to npz
160
- voice_name = create_filename(OUTPUTFOLDER, seed, "audioclip", ".npz")
161
- save_as_prompt(voice_name, full_generation)
162
- if use_last_generation_as_history:
163
- selected_speaker = voice_name
164
-
165
- all_parts += [audio_array]
166
- # Add short pause between sentences
167
- if text[-1] in "!?.\n" and i > 1:
168
- all_parts += [silenceshort.copy()]
169
-
170
- # save & play audio
171
- result = create_filename(OUTPUTFOLDER, seed, "final",".wav")
172
- save_wav(np.concatenate(all_parts), result)
173
- # write id3 tag with text truncated to 60 chars, as a precaution...
174
- add_id3_tag(result, complete_text, selected_speaker, seed)
 
 
 
 
 
175
  return result
176
 
177
  def create_filename(path, seed, name, extension):
@@ -260,7 +278,7 @@ gradio: {gr.__version__}
260
 
261
 
262
  logger = logging.getLogger(__name__)
263
- APPTITLE = "Bark UI Enhanced v0.4.6"
264
 
265
 
266
  autolaunch = False
@@ -319,6 +337,7 @@ run_server = True
319
 
320
 
321
 
 
322
  '''
323
  from google.colab import drive
324
  drive.mount('/content/drive')
@@ -461,8 +480,9 @@ while run_server:
461
  placeholder = "想让Bark说些什么呢?"
462
  input_text = gr.Textbox(label="用作声音合成的文本", lines=4, placeholder=placeholder)
463
  with gr.Column():
 
464
  seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
465
- convert_to_ssml_button = gr.Button("Convert Text to SSML")
466
 
467
  with gr.Row():
468
  with gr.Column():
@@ -557,7 +577,7 @@ while run_server:
557
 
558
  quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
559
  convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
560
- gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent],outputs=output_audio)
561
  button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
562
  # Javascript hack to display modal confirmation dialog
563
  js = "(x) => confirm('Are you sure? This will remove all files from output folder')"
 
7
  from cProfile import label
8
  from distutils.command.check import check
9
  from doctest import Example
10
+ import dataclasses
11
  import gradio as gr
12
  import numpy as np
13
  import logging
 
32
 
33
  from bark import SAMPLE_RATE
34
  from bark.clonevoice import clone_voice
35
+ from bark.generation import SAMPLE_RATE, preload_models, _load_history_prompt, codec_decode
36
  from scipy.io.wavfile import write as write_wav
37
  from parseinput import split_and_recombine_text, build_ssml, is_ssml, create_clips_from_ssml
38
  from datetime import datetime
 
79
  torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)
80
  return 'enhanced.wav'
81
 
82
+ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, batchcount, progress=gr.Progress(track_tqdm=True)):
 
 
 
83
  # Chunk the text into smaller pieces then combine the generated audio
84
 
85
  # generation settings
86
  if selected_speaker == 'None':
87
  selected_speaker = None
 
 
 
 
 
 
88
 
89
  voice_name = selected_speaker
 
 
 
90
 
91
+ if text == None or len(text) < 1:
92
+ if selected_speaker == None:
93
+ raise gr.Error('No text entered!')
94
+
95
+ # Extract audio data from speaker if no text and speaker selected
96
+ voicedata = _load_history_prompt(voice_name)
97
+ audio_arr = codec_decode(voicedata["fine_prompt"])
98
+ result = create_filename(OUTPUTFOLDER, "None", "extract",".wav")
99
+ save_wav(audio_arr, result)
100
+ return result
101
+
102
+ if batchcount < 1:
103
+ batchcount = 1
104
+
105
+
106
+ silenceshort = np.zeros(int((float(settings.silence_sentence) / 1000.0) * SAMPLE_RATE), dtype=np.int16) # quarter second of silence
107
  silencelong = np.zeros(int((float(settings.silence_speakers) / 1000.0) * SAMPLE_RATE), dtype=np.float32) # half a second of silence
108
+ use_last_generation_as_history = "Use last generation as history" in complete_settings
109
+ save_last_generation = "Save generation as Voice" in complete_settings
110
+ for l in range(batchcount):
111
+ currentseed = seed
112
+ if seed != None and seed > 2**32 - 1:
113
+ logger.warning(f"Seed {seed} > 2**32 - 1 (max), setting to random")
114
+ currentseed = None
115
+ if currentseed == None or currentseed <= 0:
116
+ currentseed = np.random.default_rng().integers(1, 2**32 - 1)
117
+ assert(0 < currentseed and currentseed < 2**32)
118
+
119
+ progress(0, desc="Generating")
120
+
121
+ full_generation = None
122
+
123
+ all_parts = []
124
+ complete_text = ""
125
+ text = text.lstrip()
126
+ if is_ssml(text):
127
+ list_speak = create_clips_from_ssml(text)
128
+ prev_speaker = None
129
+ for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
130
+ selected_speaker = clip[0]
131
+ # Add pause break between speakers
132
+ if i > 0 and selected_speaker != prev_speaker:
133
+ all_parts += [silencelong.copy()]
134
+ prev_speaker = selected_speaker
135
+ text = clip[1]
136
+ text = saxutils.unescape(text)
137
+ if selected_speaker == "None":
138
+ selected_speaker = None
139
+
140
+ print(f"\nGenerating Text ({i+1}/{len(list_speak)}) -> {selected_speaker} (Seed {currentseed}):`{text}`")
141
+ complete_text += text
142
+ with pytorch_seed.SavedRNG(currentseed):
 
 
143
  audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
144
+ currentseed = torch.random.initial_seed()
145
+ if len(list_speak) > 1:
146
+ filename = create_filename(OUTPUTFOLDER, currentseed, "audioclip",".wav")
147
+ save_wav(audio_array, filename)
148
+ add_id3_tag(filename, text, selected_speaker, currentseed)
149
+
150
+ all_parts += [audio_array]
151
+ else:
152
+ texts = split_and_recombine_text(text, settings.input_text_desired_length, settings.input_text_max_length)
153
+ for i, text in tqdm(enumerate(texts), total=len(texts)):
154
+ print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker} (Seed {currentseed}):`{text}`")
155
+ complete_text += text
156
+ if quick_generation == True:
157
+ with pytorch_seed.SavedRNG(currentseed):
158
+ audio_array = generate_with_settings(text_prompt=text, voice_name=selected_speaker, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
159
+ currentseed = torch.random.initial_seed()
160
  else:
161
+ full_output = use_last_generation_as_history or save_last_generation
162
+ if full_output:
163
+ full_generation, audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob, output_full=True)
164
+ else:
165
+ audio_array = generate_with_settings(text_prompt=text, voice_name=voice_name, semantic_temp=text_temp, coarse_temp=waveform_temp, eos_p=eos_prob)
166
+
167
+ # Noticed this in the HF Demo - convert to 16bit int -32767/32767 - most used audio format
168
+ # audio_array = (audio_array * 32767).astype(np.int16)
169
+
170
+ if len(texts) > 1:
171
+ filename = create_filename(OUTPUTFOLDER, currentseed, "audioclip",".wav")
172
+ save_wav(audio_array, filename)
173
+ add_id3_tag(filename, text, selected_speaker, currentseed)
174
+
175
+ if quick_generation == False and (save_last_generation == True or use_last_generation_as_history == True):
176
+ # save to npz
177
+ voice_name = create_filename(OUTPUTFOLDER, seed, "audioclip", ".npz")
178
+ save_as_prompt(voice_name, full_generation)
179
+ if use_last_generation_as_history:
180
+ selected_speaker = voice_name
181
+
182
+ all_parts += [audio_array]
183
+ # Add short pause between sentences
184
+ if text[-1] in "!?.\n" and i > 1:
185
+ all_parts += [silenceshort.copy()]
186
+
187
+ # save & play audio
188
+ result = create_filename(OUTPUTFOLDER, currentseed, "final",".wav")
189
+ save_wav(np.concatenate(all_parts), result)
190
+ # write id3 tag with text truncated to 60 chars, as a precaution...
191
+ add_id3_tag(result, complete_text, selected_speaker, currentseed)
192
+
193
  return result
194
 
195
  def create_filename(path, seed, name, extension):
 
278
 
279
 
280
  logger = logging.getLogger(__name__)
281
+ APPTITLE = "Bark UI Enhanced v0.4.8"
282
 
283
 
284
  autolaunch = False
 
337
 
338
 
339
 
340
+
341
  '''
342
  from google.colab import drive
343
  drive.mount('/content/drive')
 
480
  placeholder = "想让Bark说些什么呢?"
481
  input_text = gr.Textbox(label="用作声音合成的文本", lines=4, placeholder=placeholder)
482
  with gr.Column():
483
+ convert_to_ssml_button = gr.Button("Convert Input Text to SSML")
484
  seedcomponent = gr.Number(label="Seed (default -1 = Random)", precision=0, value=-1)
485
+ batchcount = gr.Number(label="Batch count", precision=0, value=1)
486
 
487
  with gr.Row():
488
  with gr.Column():
 
577
 
578
  quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings)
579
  convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text)
580
+ gen_click = tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, eos_prob, quick_gen_checkbox, complete_settings, seedcomponent, batchcount],outputs=output_audio)
581
  button_stop_generation.click(fn=None, inputs=None, outputs=None, cancels=[gen_click])
582
  # Javascript hack to display modal confirmation dialog
583
  js = "(x) => confirm('Are you sure? This will remove all files from output folder')"