Spaces:
Running
on
T4
Running
on
T4
import io | |
import json | |
import os | |
import gradio as gr | |
import requests | |
import soundfile as sf | |
# Endpoint of the local TTS inference server that synthesis requests are posted to.
API_SERVER_URL = "http://127.0.0.1:58003/tts"

# Voice-source options offered in the UI. Order matters: the rest of the file
# addresses these entries by position (0 = preset, 1 = upload, 2 = record).
RADIO_CHOICES = [
    "Preset voices",
    "Upload target voice",
    "Record your voice",
]

# Input text is cut to at most this many characters before it is synthesised.
MAX_CHARS = 220

# Display name -> URL of the reference clip for each built-in voice.
# The first entry is used as the default selection of the preset dropdown.
PRESET_VOICES = {
    # female
    "Ava": "https://cdn.themetavoice.xyz/speakers/ava.flac",
    "Bria": "https://cdn.themetavoice.xyz/speakers/bria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}
def denormalise_top_p(top_p):
    """Convert a raw slider value into a nucleus-sampling ``top_p``.

    Each slider unit adds 0.01 on top of a base of 0.9, so slider values in
    [0, 10] map to [0.9, 1.0]. The result is rounded to two decimals.
    """
    base = 0.9
    increment = top_p / 100
    return round(base + increment, 2)
def denormalise_guidance(guidance):
    """Linearly rescale a guidance value from [1, 5] onto [1.0, 3.0]."""
    in_low, in_high = 1, 5
    out_low, out_high = 1, 3
    scaled = (guidance - in_low) * (out_high - out_low)
    return out_low + scaled / (in_high - in_low)
def _handle_edge_cases(to_say, upload_target): | |
if not to_say: | |
raise gr.Error("Please provide text to synthesise") | |
def _check_file_size(path): | |
if not path: | |
return | |
filesize = os.path.getsize(path) | |
filesize_mb = filesize / 1024 / 1024 | |
if filesize_mb >= 50: | |
raise gr.Error( | |
f"Please upload a sample less than 20MB for voice cloning. Provided: {round(filesize_mb)} MB" | |
) | |
_check_file_size(upload_target) | |
def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target, record_target):
    """Synthesise speech for ``to_say`` by calling the TTS API server.

    Args:
        to_say: Text to speak; only the first ``MAX_CHARS`` characters are sent.
        top_p: Raw slider value, rescaled with ``denormalise_top_p``.
        guidance: Raw slider value, rescaled with ``denormalise_guidance``.
        toggle: The selected entry of ``RADIO_CHOICES`` (voice source).
        preset_dropdown: Key into ``PRESET_VOICES``; only read when the
            preset option is selected.
        upload_target: File path of an uploaded reference sample, or ``None``.
        record_target: File path of a recorded reference sample, or ``None``.

    Returns:
        A ``(sample_rate, waveform)`` tuple decoded from the server response
        with ``soundfile.read`` as float32.

    Raises:
        gr.Error: If the input fails validation, the server cannot be
            reached, or the server answers with a non-200 status code.
    """
    _handle_edge_cases(to_say, upload_target)

    # Slicing is a no-op when the text is already within the limit.
    to_say = to_say[:MAX_CHARS]

    # Pick the reference audio matching the selected voice source.
    custom_target_path = None
    if toggle == RADIO_CHOICES[1]:
        custom_target_path = upload_target
    elif toggle == RADIO_CHOICES[2]:
        custom_target_path = record_target

    config = {
        "text": to_say,
        "guidance": denormalise_guidance(guidance),
        "top_p": denormalise_top_p(top_p),
        "speaker_ref_path": PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else None,
    }
    # The request parameters travel in a header and the body carries the raw
    # reference audio. json.dumps escapes non-ASCII characters by default,
    # which keeps non-ASCII text representable in the header value.
    headers = {"Content-Type": "audio/wav", "X-Payload": json.dumps(config)}

    data = None
    if custom_target_path:
        with open(custom_target_path, "rb") as f:
            data = f.read()

    try:
        response = requests.post(API_SERVER_URL, headers=headers, data=data)
    except requests.RequestException as err:
        print(f"Something went wrong. request failed: {err}")
        raise gr.Error("Could not reach the speech synthesis server. Please try again later.") from err

    if response.status_code != 200:
        # Surface the failure to the user instead of handing (None, None) to
        # the audio output component, which cannot render it.
        print(f"Something went wrong. response status code: {response.status_code}")
        raise gr.Error(f"Speech synthesis failed (server returned status {response.status_code}).")

    wav, sr = sf.read(io.BytesIO(response.content), dtype="float32")
    return sr, wav
def change_voice_selection_layout(choice):
    """Build visibility updates so only the row of the chosen voice source shows.

    Returns one ``gr.update`` per entry of ``RADIO_CHOICES``, in the same
    order; the update at the position of ``choice`` is visible and all others
    are hidden. Raises ``ValueError`` if ``choice`` is not in ``RADIO_CHOICES``.
    """
    selected = RADIO_CHOICES.index(choice)
    return [
        gr.update(visible=(position == selected))
        for position, _ in enumerate(RADIO_CHOICES)
    ]
# Markdown heading rendered at the top of the demo page.
title = "# TTS by Kotoba-Speech"

# Markdown/HTML introduction shown under the title (Japanese, then English).
# Blank lines after each bullet list and between the two language sections
# keep the following paragraph from being folded into the preceding list item
# or paragraph when rendered.
description = """
<strong>Kotoba-Speech v0.1</strong>は、1.2Bのトランスフォーマーに基づく音声生成モデルです。
以下の機能をサポートしています:
\n
* 日本語における滑らかなテキスト読み上げ生成
* スピーチプロンプトを通じたOne-shot音声クローニング

Kotoba Technologiesは、公開されたモデルを商用可能なApache 2.0ライセンスで公開します。
推論およびモデルコードは、Meta-Voiceをベースに作られており、学習コードは弊社のGitHubで近日中に公開する予定です。
Kotoba Technologiesは、音声基盤モデルの開発に取り組んでおり、今後もモデルの公開を行なっていきます。是非、[Discord Community](https://discord.gg/qPVFqhGN7Z)に参加してご意見ください!

<strong>Kotoba-Speech v0.1</strong> is a 1.2B Transformer-based speech generative model. It supports the following properties:
\n
* Fluent text-to-speech generation in Japanese
* One-shot voice cloning through speech prompt

We are releasing our model under the Apache 2.0 license. Our inference and model code is adapted from Meta-Voice, and we will release our training code on our GitHub repository shortly.
Kotoba Technologies is committed to developing speech foundation models, and we’ll continue releasing our models. Please join [our discord](https://discord.gg/qPVFqhGN7Z) to contribute to our community.
"""
# Build the demo page: intro text on top, input controls on the left, the
# generated audio on the right, and the submit button underneath.
# NOTE(review): the container nesting below was reconstructed from a source
# whose indentation was lost -- confirm the layout against the deployed app.
with gr.Blocks(title="TTS by Kotoba-Speech") as demo:
    gr.Markdown(title)

    with gr.Row():
        gr.Markdown(description)

    with gr.Row():
        # Left column: text, sampling controls and voice selection.
        with gr.Column():
            to_say = gr.TextArea(
                value="コトバテクノロジーズのミッションは、音声基盤モデルを作ることです。",
                lines=4,
                label="What should I say!?",
            )

            with gr.Row():
                with gr.Column():
                    # voice settings (raw slider values are rescaled inside tts)
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=10.0,
                        step=1.0,
                        value=5.0,
                        label="Speech Stability - improves text following for a challenging speaker",
                    )
                    guidance = gr.Slider(
                        minimum=1.0,
                        maximum=5.0,
                        step=1.0,
                        value=5.0,
                        label="Speaker similarity - How closely to match speaker identity and speech style.",
                    )

                    # voice select
                    toggle = gr.Radio(
                        choices=RADIO_CHOICES,
                        value=RADIO_CHOICES[0],
                        label="Choose voice",
                    )

            # One row per voice source; only the preset row starts visible and
            # the radio toggle switches between them.
            with gr.Row(visible=True) as row_1:
                preset_dropdown = gr.Dropdown(
                    list(PRESET_VOICES),
                    value=next(iter(PRESET_VOICES)),
                    label="Preset voices",
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)

            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    type="filepath",
                    sources=["upload"],
                    min_length=10,
                    max_length=90,
                    label="Upload a clean sample to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                )

            with gr.Row(visible=False) as row_3:
                record_target = gr.Audio(
                    type="filepath",
                    sources=["microphone"],
                    min_length=10,
                    max_length=90,
                    label="Record your voice with a microphone to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                )

            # The handler returns one visibility update per row, in this order.
            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2, row_3],
            )

        # Right column: synthesised audio returned by tts as (sample_rate, array).
        with gr.Column():
            speech = gr.Audio(
                label="Kotoba-Speech says...",
                type="numpy",
            )

    submit = gr.Button("Generate Speech")
    # Input order must match the positional parameters of tts.
    submit.click(
        fn=tts,
        inputs=[
            to_say,
            top_p,
            guidance,
            toggle,
            preset_dropdown,
            upload_target,
            record_target,
        ],
        outputs=speech,
    )
# Enable request queuing with a default concurrency limit of 2.
demo.queue(default_concurrency_limit=2)

# Serve on all network interfaces at port 3000 and request a public share link.
demo.launch(
    share=True,
    server_port=3000,
    server_name="0.0.0.0",
)