Update app.py
Browse files
app.py
CHANGED
@@ -159,12 +159,12 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
|
|
159 |
# convert duration information to string
|
160 |
duration_info_str = ""
|
161 |
for i in range(len(char_spacings)):
|
162 |
-
if
|
163 |
-
duration_info_str += str(char_spacing_dur_list[i])
|
|
|
|
|
164 |
else:
|
165 |
-
duration_info_str +=
|
166 |
-
if i != len(char_spacings)-1:
|
167 |
-
duration_info_str += ", "
|
168 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
169 |
currentDateAndTime = datetime.now()
|
170 |
print(f"\nCharacter {character} inference successful: {text}")
|
@@ -178,12 +178,14 @@ def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale,
|
|
178 |
phonemes = duration_info_str.split(", ")
|
179 |
recons_durs = []
|
180 |
recons_phonemes = ""
|
181 |
-
for item in phonemes:
|
182 |
-
if
|
183 |
-
recons_durs.append(int(item))
|
184 |
else:
|
185 |
-
|
186 |
-
|
|
|
|
|
187 |
except ValueError:
|
188 |
return ("Error: Format must not be changed!", None)
|
189 |
except AssertionError:
|
@@ -232,8 +234,8 @@ if __name__ == "__main__":
|
|
232 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
233 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
234 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
235 |
-
"If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
|
236 |
-
"若有bug反馈或建议,请在Community下开启一个新的Discussion。 \n\n"
|
237 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
238 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
239 |
)
|
@@ -296,12 +298,12 @@ if __name__ == "__main__":
|
|
296 |
duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
|
297 |
interactive = True)
|
298 |
gr.Markdown(
|
299 |
-
"
|
300 |
-
"您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
|
301 |
-
"注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
|
302 |
-
"The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
|
303 |
"You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
|
304 |
"Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
|
|
|
|
|
|
|
305 |
)
|
306 |
btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
|
307 |
outputs=[text_output, audio_output, phoneme_output, duration_output])
|
@@ -324,23 +326,26 @@ if __name__ == "__main__":
|
|
324 |
)
|
325 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
326 |
"2023/1/24:\n\n"
|
327 |
-
"
|
|
|
|
|
328 |
"Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
|
|
|
329 |
"2023/1/13:\n\n"
|
330 |
-
"增加了音素输入的example(米浴喘气)\n\n"
|
331 |
"Added one example of phoneme input.\n\n"
|
|
|
332 |
"2023/1/12:\n\n"
|
333 |
-
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
334 |
"Added phoneme input, which enables more precise control on output audio.\n\n"
|
335 |
-
"
|
336 |
"Adjusted UI arrangements.\n\n"
|
|
|
337 |
"2023/1/10:\n\n"
|
338 |
-
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
339 |
"Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
|
|
|
340 |
"2023/1/9:\n\n"
|
341 |
-
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
342 |
"Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
|
343 |
-
"
|
344 |
"Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
|
|
|
345 |
)
|
346 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|
|
|
159 |
# convert duration information to string
|
160 |
duration_info_str = ""
|
161 |
for i in range(len(char_spacings)):
|
162 |
+
if i == len(char_spacings) - 1:
|
163 |
+
duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")"
|
164 |
+
elif char_spacings[i] == "spacing":
|
165 |
+
duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")" + ", "
|
166 |
else:
|
167 |
+
duration_info_str += char_spacings[i] + ":" + str(char_spacing_dur_list[i])
|
|
|
|
|
168 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
169 |
currentDateAndTime = datetime.now()
|
170 |
print(f"\nCharacter {character} inference successful: {text}")
|
|
|
178 |
phonemes = duration_info_str.split(", ")
|
179 |
recons_durs = []
|
180 |
recons_phonemes = ""
|
181 |
+
for i, item in enumerate(phonemes):
|
182 |
+
if i == 0:
|
183 |
+
recons_durs.append(int(item.strip("()")))
|
184 |
else:
|
185 |
+
phoneme_n_dur, spacing_dur = item.split("(")
|
186 |
+
recons_phonemes += phoneme_n_dur.split(":")[0]
|
187 |
+
recons_durs.append(int(phoneme_n_dur.split(":")[1]))
|
188 |
+
recons_durs.append(int(spacing_dur.strip(")")))
|
189 |
except ValueError:
|
190 |
return ("Error: Format must not be changed!", None)
|
191 |
except AssertionError:
|
|
|
234 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
235 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
236 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
237 |
+
"If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
|
238 |
+
"若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
|
239 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
240 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
241 |
)
|
|
|
298 |
duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
|
299 |
interactive = True)
|
300 |
gr.Markdown(
|
301 |
+
"The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the length of spacing between each phoneme and its next phoneme."
|
|
|
|
|
|
|
302 |
"You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
|
303 |
"Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
|
304 |
+
"音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
|
305 |
+
"您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
|
306 |
+
"注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
|
307 |
)
|
308 |
btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
|
309 |
outputs=[text_output, audio_output, phoneme_output, duration_output])
|
|
|
326 |
)
|
327 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
328 |
"2023/1/24:\n\n"
|
329 |
+
"Improved the format of phoneme length control.\n\n"
|
330 |
+
"改善了音素控制的格式。\n\n"
|
331 |
+
"2023/1/24:\n\n"
|
332 |
"Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
|
333 |
+
"增加了对说话节奏的音素级控制。\n\n"
|
334 |
"2023/1/13:\n\n"
|
|
|
335 |
"Added one example of phoneme input.\n\n"
|
336 |
+
"增加了音素输入的example(米浴喘气)\n\n"
|
337 |
"2023/1/12:\n\n"
|
|
|
338 |
"Added phoneme input, which enables more precise control on output audio.\n\n"
|
339 |
+
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
340 |
"Adjusted UI arrangements.\n\n"
|
341 |
+
"调整了UI的布局。\n\n"
|
342 |
"2023/1/10:\n\n"
|
|
|
343 |
"Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
|
344 |
+
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
345 |
"2023/1/9:\n\n"
|
|
|
346 |
"Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
|
347 |
+
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
348 |
"Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
|
349 |
+
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
350 |
)
|
351 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|