File size: 3,319 Bytes
6a6023c
d86f106
 
 
6a6023c
d86f106
 
 
60bea2a
d86f106
 
f3e678b
d86f106
cb2b2ed
d86f106
ccbb9b9
9c1a82f
9bcb0d8
 
 
ea0162a
9c1a82f
2ae17a3
9c1a82f
 
 
 
 
 
 
2ae17a3
 
d86f106
 
6a9e916
 
d86f106
 
 
 
 
 
 
 
2ae17a3
d86f106
cb2b2ed
2ae17a3
 
 
 
 
cb2b2ed
 
d86f106
 
 
 
cb2b2ed
aaca925
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import io
import json
import os
import requests
import urllib

import gradio as gr
import numpy as np
from scipy.io import wavfile


# Endpoint of the Deepgram sandbox TTS service that tts_fn posts to.
base_url = "https://api.sandbox.deepgram.com/tts"
# API token taken from the DG_TOKEN environment variable; if the variable is
# not set this lookup raises KeyError as soon as the module is executed.
token_str = os.environ['DG_TOKEN']
def tts_fn(text, prompt_audio, prompt_seconds, inference_steps, inference_temperature, pitch_steps):
    """Synthesize ``text`` with the remote TTS service, using ``prompt_audio`` as the prompt.

    The prompt clip is re-encoded as a float32 WAV and sent as the body of a
    POST request to ``base_url``; the remaining settings are sent as query
    parameters. The WAV returned by the service is decoded in memory and
    rescaled to int16.

    Args:
        text: Text to synthesize.
        prompt_audio: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array. May be ``None`` (e.g. when no clip was supplied), in
            which case a ``gr.Error`` is raised.
        prompt_seconds: Sent to the service as ``prompt_seconds``.
        inference_steps: Sent to the service as ``soundstorm_steps``.
        inference_temperature: Sent to the service as ``temperature``.
        pitch_steps: Converted with ``int()`` and sent as ``pitch_steps``.

    Returns:
        ``(sample_rate, audio)`` where ``audio`` is an int16 NumPy array.

    Raises:
        gr.Error: If no prompt audio was given, the request could not be
            sent, the service answered with an error status, or the response
            body could not be parsed as a WAV file.
    """
    if prompt_audio is None:
        raise gr.Error("Please provide a prompt audio clip before generating.")

    # Dividing by 32768 treats the incoming samples as 16-bit PCM.
    # NOTE(review): assumes the Audio component hands over int16 data - confirm.
    prompt_wave = prompt_audio[1].astype(np.float32, order='C') / 32768.0
    wav_buffer = io.BytesIO()
    wavfile.write(wav_buffer, prompt_audio[0], prompt_wave)
    # getvalue() returns the full buffer independent of the stream position.
    prompt_audio_bytes = wav_buffer.getvalue()

    # NOTE(review): requests percent-encodes query parameters itself, so the
    # quoted text ends up encoded twice. Kept unchanged because the service's
    # expectation is not visible from this file - confirm before changing.
    params = {
        'synthesize': 'true',
        'text': urllib.parse.quote(text),
        'pitch_steps': int(pitch_steps),
        'soundstorm_steps': inference_steps,
        'temperature': inference_temperature,
        'prompt_seconds': prompt_seconds,
    }

    try:
        response = requests.post(
            base_url,
            data=prompt_audio_bytes,
            params=params,
            headers={'Authorization': f'Token {token_str}'},
        )
    except requests.RequestException as err:
        raise gr.Error(f"Could not reach the TTS service: {err}") from err

    # Surface service-side failures instead of feeding an error body to the
    # WAV parser.
    if not response.ok:
        raise gr.Error(
            f"TTS service returned HTTP {response.status_code}: {response.text[:200]}"
        )

    # Decode straight from memory: avoids a shared 'result.wav' on disk that
    # concurrent requests would overwrite.
    try:
        sample_rate, audio = wavfile.read(io.BytesIO(response.content))
    except Exception as err:  # malformed payloads may fail in more than one way
        raise gr.Error("TTS service response was not a valid WAV file.") from err

    # NOTE(review): the 1.414 divisor and 32767 scale presume the service
    # returns floating-point samples - confirm against the service's output.
    audio = (audio / 1.414 * 32767).astype(np.int16)
    return (sample_rate, audio)

# Sample speaker clips offered as selectable prompt-audio examples in the UI.
demo_files = [
    'demo_files/man.wav',
    'demo_files/woman.wav',
    'demo_files/man_2.wav',
    'demo_files/woman_2.wav',
    'demo_files/man_3.wav',
    'demo_files/woman_3.wav',
    'demo_files/woman_4.wav',
    'demo_files/meditation.wav',
]


# Build the single-tab demo: the first column holds the text, prompt audio and
# synthesis controls; the second column holds the output player and the
# button that calls tts_fn.
with gr.Blocks() as app:
    with gr.Tab("TTS MVP"):
        with gr.Row():
            with gr.Column():
                # Two canned sentences; only `cherry` is used as the default text.
                pangram = "The beige hue on the waters of the loch impressed all, including the French queen, before she heard that symphony again, just as young Arthur wanted."
                cherry = "Your request has been processed and the audio is ready for playback."
                textbox = gr.TextArea(
                    label="Text",
                    placeholder="Type a sentence here",
                    value=cherry,
                )
                prompt_audio = gr.Audio(
                    label="Prompt Audio",
                    source='upload',
                )
                examples = gr.Examples(
                    label='Sample Speakers',
                    examples=demo_files,
                    inputs=prompt_audio,
                )
                # Currently disabled controls, kept for reference:
                # speed = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Speed")
                # variability = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Variability")
                inference_steps = gr.Slider(
                    minimum=1,
                    maximum=32,
                    value=1,
                    step=1,
                    label="Inference Steps: quality vs latency tradeoff. Results are sometimes unstable for values >1.",
                )
                inference_temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Temperature: fidelity vs variability tradeoff",
                )
                prompt_seconds = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=1.0,
                    label="Use first N seconds of prompt audio",
                )
                pitch_steps = gr.Slider(
                    minimum=-24,
                    maximum=24,
                    value=0,
                    step=1,
                    label="Pitch Steps: 12 to an octave",
                )

            with gr.Column():
                audio_output = gr.Audio(label="Output Audio", elem_id='tts-audio')
                btn = gr.Button("Generate")
                # The input order must match tts_fn's positional parameters.
                btn.click(
                    tts_fn,
                    inputs=[
                        textbox,
                        prompt_audio,
                        prompt_seconds,
                        inference_steps,
                        inference_temperature,
                        pitch_steps,
                    ],
                    outputs=[audio_output],
                )
app.launch()