Spaces:

stevenhillis
/

intone_mvp

Sleeping

File size: 3,201 Bytes

6a6023c
d86f106
 
 
6a6023c
d86f106
 
 
60bea2a
d86f106
 
f3e678b
d86f106
cb2b2ed
d86f106
ccbb9b9
0270007
f3e678b
0270007
f3e678b
2ae17a3
 
1a5fb77
2ae17a3
 
d86f106
 
6a9e916
 
d86f106
 
 
 
 
 
 
 
2ae17a3
d86f106
cb2b2ed
2ae17a3
 
 
 
 
cb2b2ed
 
d86f106
 
 
 
cb2b2ed
aaca925

import io
import json
import os
import requests
import urllib

import gradio as gr
import numpy as np
from scipy.io import wavfile


# API token, read once at import time. Indexing os.environ (rather than .get)
# means a missing DG_TOKEN fails immediately with KeyError instead of sending
# unauthenticated requests later.
token_str = os.environ['DG_TOKEN']

# Endpoint that tts_fn POSTs every synthesis request to.
base_url = "https://api.sandbox.deepgram.com/tts"
def tts_fn(text, prompt_audio, prompt_seconds, inference_steps, inference_temperature, pitch_steps):
    """Synthesize ``text`` using ``prompt_audio`` as the voice prompt.

    Sends one POST request to ``base_url`` and converts the first returned
    result into a ``(sample_rate, int16 samples)`` pair.

    Args:
        text: Sentence to synthesize; sent to the API as a one-element list.
        prompt_audio: Pair whose second item holds the prompt samples
            (presumably the ``(sample_rate, ndarray)`` tuple produced by the
            ``gr.Audio`` input -- confirm). ``None`` when no clip is selected.
        prompt_seconds: Forwarded as the ``prompt_seconds`` query parameter.
        inference_steps: Forwarded as the ``soundstorm_steps`` query parameter.
        inference_temperature: Forwarded as the ``temperature`` query parameter.
        pitch_steps: Cast to ``int`` and forwarded as ``pitch_steps``.

    Returns:
        ``(sample_rate, audio)`` where ``sample_rate`` is an ``int`` taken from
        the API response and ``audio`` is an ``np.int16`` array.

    Raises:
        gr.Error: If no prompt audio was provided, the service reply is not
            JSON, or the reply lacks the expected ``results`` fields.
    """
    if prompt_audio is None:
        # Without this guard, indexing None below fails with an opaque TypeError.
        raise gr.Error("Please upload or select a prompt audio clip first.")

    # Only the samples are sent; the prompt's own sample rate (item 0) is not
    # part of the request. The samples are flattened to shape (1, 1, n) and
    # scaled by 1/32768 -- presumably int16 PCM -> [-1, 1); TODO confirm.
    # NOTE(review): a multi-channel prompt would be flattened as-is by this
    # reshape -- confirm that prompts are always mono.
    prompt = np.reshape(prompt_audio[1], (1, 1, -1)).astype(np.float32, order='C') / 32768.0

    params = {
        'synthesize': 'true',
        'pitch_steps': int(pitch_steps),
        'soundstorm_steps': inference_steps,
        'temperature': inference_temperature,
        'prompt_seconds': prompt_seconds,
    }
    # Both payloads travel as JSON-encoded multipart parts.
    files = [
        ('texts', ('texts', json.dumps([text]), 'application/json')),
        ('prompt_audio', ('prompt_audio', json.dumps(prompt.tolist()), 'application/json')),
    ]
    reply = requests.post(base_url, files=files, params=params, headers={'Authorization': f'Token {token_str}'})

    try:
        response = reply.json()
    except ValueError as err:
        # The JSON decode errors raised by requests are ValueError subclasses.
        print(reply.status_code, reply.text)
        raise gr.Error(f"TTS service returned a non-JSON response (HTTP {reply.status_code}).") from err

    try:
        result = response['results'][0]
        sample_rate = int(result['sample_rate'])
        # NOTE(review): 1.414 looks like a sqrt(2) headroom factor -- confirm
        # against the API's documented output range.
        samples = np.array(result['audio']).transpose() / 1.414 * 32767
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # Keep the raw reply in the server log, then fail explicitly instead of
        # falling through to a return statement that uses unbound names.
        print(response)
        raise gr.Error("TTS request failed; see the server log for the API response.") from err

    # Saturate before the cast so an out-of-range sample cannot wrap around.
    audio = np.clip(samples, -32768, 32767).astype(np.int16)
    return (sample_rate, audio)

# Prompt clips handed to gr.Examples as the 'Sample Speakers' list, in the
# order they are listed here.
demo_files = [
    'demo_files/man.wav',
    'demo_files/woman.wav',
    'demo_files/man_2.wav',
    'demo_files/woman_2.wav',
    'demo_files/man_3.wav',
    'demo_files/woman_3.wav',
    'demo_files/woman_4.wav',
    'demo_files/meditation.wav',
]


# Single-tab demo UI: the first column collects the text, the voice prompt and
# the synthesis settings; the second column holds the output player and the
# button that calls tts_fn.
with gr.Blocks() as app:
    with gr.Tab("TTS MVP"):
        with gr.Row():
            with gr.Column():
                # Two candidate default sentences; only `cherry` is used as the initial text.
                pangram = "The beige hue on the waters of the loch impressed all, including the French queen, before she heard that symphony again, just as young Arthur wanted."
                cherry = "Your request has been processed and the audio is ready for playback."
                textbox = gr.TextArea(
                    label="Text",
                    placeholder="Type a sentence here",
                    value=cherry,
                )
                prompt_audio = gr.Audio(label="Prompt Audio", source='upload')
                # Selecting an example fills the prompt-audio input with that clip.
                examples = gr.Examples(
                    label='Sample Speakers',
                    examples=demo_files,
                    inputs=prompt_audio,
                )
                # Disabled controls, kept for reference:
                # speed = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Speed")
                # variability = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Variability")
                inference_steps = gr.Slider(
                    minimum=1,
                    maximum=32,
                    value=1,
                    step=1,
                    label="Inference Steps: quality vs latency tradeoff. Results are sometimes unstable for values >1.",
                )
                inference_temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Temperature: fidelity vs variability tradeoff",
                )
                prompt_seconds = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=1.0,
                    label="Use first N seconds of prompt audio",
                )
                pitch_steps = gr.Slider(
                    minimum=-24,
                    maximum=24,
                    value=0,
                    step=1,
                    label="Pitch Steps: 12 to an octave",
                )

            with gr.Column():
                audio_output = gr.Audio(label="Output Audio", elem_id='tts-audio')
                btn = gr.Button("Generate")
                # The input order must match tts_fn's positional parameters.
                tts_inputs = [
                    textbox,
                    prompt_audio,
                    prompt_seconds,
                    inference_steps,
                    inference_temperature,
                    pitch_steps,
                ]
                btn.click(tts_fn, inputs=tts_inputs, outputs=[audio_output])

# Start the web server as soon as the module is executed.
app.launch()