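"""Gradio demo that converts text to speech with the Kokoro TTS model."""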
# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline
# Pre-Initialize
DEVICE = "auto"
if DEVICE == "auto":
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")
# Variables
SILENT_THRESHOLD = 0.01
CHAR_LIMIT = 2000
DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"
CHOICES = {
"πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️": "af_heart",
"πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯": "af_bella",
"πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧": "af_nicole",
"πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
"πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
"πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
"πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
"πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
"πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
"πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
"πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
"πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
"πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
"πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
"πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
"πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
"πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
"πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
"πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
"πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
"πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
"πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
"πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
"πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
"πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
"πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
"πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
"πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}
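# One grapheme-to-phoneme pipeline per language code ("a" = American English, "b" = British English).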
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}
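# Pin the pronunciation of "kokoro" in each pipeline's lexicon (American and British phoneme spellings).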
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkΙ™ΙΉO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkΙ™ΙΉQ"
for v in CHOICES.values():
    PIPELINES[v[0]].load_voice(v)
MODEL = KModel().eval()
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
visibility: hidden
}
'''
# Functions
def trim_silence(audio, threshold=SILENT_THRESHOLD):
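    """Remove leading and trailing samples whose amplitude is below the threshold."""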
    abs_audio = np.abs(audio)
    indices = np.where(abs_audio > threshold)[0]
    if len(indices) == 0: return audio
    start = indices[0]
    end = indices[-1] + 1
    return audio[start:end]
def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
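    """Synthesize speech for the given text, voice, and speed; returns (sample_rate, waveform)."""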
    text = text.strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]
    pack = pipeline.load_voice(voice)
    # Synthesize and return the first generated segment.
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        return (24000, trim_silence(audio.numpy()))
def cloud():
print("[CLOUD] | Space maintained.")
@spaces.GPU()
def gpu():
    return
# Initialize
with gr.Blocks(css=css) as main:
    with gr.Column():
        gr.Markdown("πŸͺ„ Instantly generate realistic voices using text input.")
    with gr.Column():
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("β–Ά")
        maintain = gr.Button("☁️")
    with gr.Column():
        output = gr.Audio(label="Output")
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    maintain.click(cloud, inputs=[], outputs=[], queue=False)
main.launch(show_api=True)