# intone_mvp / app.py
import io
import json
import os
import urllib

import gradio as gr
import numpy as np
import requests
from scipy.io import wavfile

base_url = "https://api.sandbox.deepgram.com/tts"
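# DG_TOKEN is assumed to be supplied via the environment (e.g. as a Space secret);
# os.environ[...] raises KeyError at startup if it is missing.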
token_str = os.environ['DG_TOKEN']
def tts_fn(text, prompt_audio, prompt_seconds, inference_steps, inference_temperature, pitch_steps):
    texts = [text]
    # gr.Audio supplies a (sample_rate, samples) tuple; the samples arrive as int16,
    # so rescale to float32 in [-1, 1] and reshape to (1, 1, num_samples).
    sr = prompt_audio[0]
    prompt_audio = np.reshape(prompt_audio[1], (1, 1, -1)).astype(np.float32, order='C') / 32768.0
    params = {'synthesize': 'true', 'pitch_steps': int(pitch_steps), 'soundstorm_steps': inference_steps,
              'temperature': inference_temperature, 'prompt_seconds': prompt_seconds}
    # Send the text batch and the prompt audio as JSON-encoded parts of a multipart request.
    files = [('texts', ('texts', json.dumps(texts), 'application/json')),
             ('prompt_audio', ('prompt_audio', json.dumps(prompt_audio.tolist()), 'application/json'))]
    response = requests.post(base_url, files=files, params=params,
                             headers={'Authorization': f'Token {token_str}'}).json()
    try:
        sample_rate = int(response['results'][0]['sample_rate'])
        # Rescale the returned audio to int16 for Gradio playback.
        audio = (np.array(response['results'][0]['audio']).transpose() / 1.414 * 32767).astype(np.int16)
    except Exception:
        # Log the raw API response, then re-raise so Gradio reports the failure
        # instead of hitting an unbound-variable error at the return below.
        print(response)
        raise
    return (sample_rate, audio)
demo_files = [
    'demo_files/man.wav', 'demo_files/woman.wav',
    'demo_files/man_2.wav', 'demo_files/woman_2.wav',
    'demo_files/man_3.wav', 'demo_files/woman_3.wav',
    'demo_files/woman_4.wav', 'demo_files/meditation.wav',
]
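# Optional local smoke test (a minimal sketch, not wired into the Space UI). It reuses the
# first demo prompt and mirrors the (sample_rate, samples) tuple that gr.Audio passes to
# tts_fn. The text and slider values below are illustrative assumptions, the demo WAVs are
# assumed to be int16 since tts_fn rescales by 32768, and the TTS_SMOKE_TEST environment
# variable is an arbitrary name used here only to keep this block off by default.
if os.environ.get('TTS_SMOKE_TEST'):
    prompt_sr, prompt_samples = wavfile.read(demo_files[0])
    out_sr, out_audio = tts_fn(
        "Your request has been processed and the audio is ready for playback.",
        (prompt_sr, prompt_samples),
        prompt_seconds=3.0,
        inference_steps=1,
        inference_temperature=0.9,
        pitch_steps=0,
    )
    # Save the synthesized int16 audio so it can be auditioned locally.
    wavfile.write('smoke_test_output.wav', out_sr, out_audio)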
app = gr.Blocks()
with app:
    with gr.Tab("TTS MVP"):
        with gr.Row():
            with gr.Column():
                pangram = "The beige hue on the waters of the loch impressed all, including the French queen, before she heard that symphony again, just as young Arthur wanted."
                cherry = "Your request has been processed and the audio is ready for playback."
                textbox = gr.TextArea(label="Text", placeholder="Type a sentence here", value=cherry)
                prompt_audio = gr.Audio(label="Prompt Audio", source='upload')
                examples = gr.Examples(label='Sample Speakers', examples=demo_files, inputs=prompt_audio)
                # speed = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Speed")
                # variability = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Variability")
                inference_steps = gr.Slider(minimum=1, maximum=32, value=1, step=1,
                                            label="Inference Steps: quality vs latency tradeoff. Results are sometimes unstable for values >1.")
                inference_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05,
                                                  label="Temperature: fidelity vs variability tradeoff")
                prompt_seconds = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=1.0,
                                           label="Use first N seconds of prompt audio")
                pitch_steps = gr.Slider(minimum=-24, maximum=24, value=0, step=1,
                                        label="Pitch Steps: 12 to an octave")
            with gr.Column():
                audio_output = gr.Audio(label="Output Audio", elem_id='tts-audio')
                btn = gr.Button("Generate")
                btn.click(tts_fn,
                          inputs=[textbox, prompt_audio, prompt_seconds, inference_steps, inference_temperature, pitch_steps],
                          outputs=[audio_output])
app.launch()