ChatGPT-Speech / app.py
Yusin's picture
Update app.py
e0fcf8f
import os
import json
import openai
import tempfile
import gradio as gr
import infer
import config
from neon_tts_plugin_coqui import CoquiTTS
# Heading text rendered at the top of the page.
title = "Speech to ChatGPT to Speech"
# Coqui TTS engine; chat_hf uses it for every language other than 'cn' and 'jp'.
coquiTTS = CoquiTTS()
# Choices shown in the language selector: the keys of CoquiTTS.langs plus
# 'cn' and 'jp', which chat_hf synthesizes through infer.infer instead of Coqui.
LANGUAGES = list(CoquiTTS.langs.keys())
LANGUAGES = LANGUAGES + ['cn', 'jp']
# Language pre-selected in the UI.
default_lang = "en"
# Speech-to-text backend, loaded from the hosted Space named below.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
# The OpenAI key is read from the `api_key` environment variable
# (os.environ.get yields None when it is unset) and handed to the openai module.
api_key = os.environ.get('api_key')
openai.api_key = api_key
# Checkpoint and config locations come from the local `config` module.
pth_path = config.pth_path
config_json = config.config_json
# Model objects returned by infer.load_model; chat_hf passes net_g_ms to
# infer.infer and reads hps.data.sampling_rate for the 'cn'/'jp' output.
net_g_ms, hps = infer.load_model(config_json, pth_path)
# ChatGPT
def chat_hf(audio, custom_token, language):
    """Transcribe a recording, fetch a completion for it, and speak the reply.

    Args:
        audio: Recording forwarded unchanged to ``translate``.
        custom_token: Value of the "session token" textbox. Accepted so the
            UI wiring keeps working, but not used by this function.
        language: Language code chosen in the UI. ``'cn'`` and ``'jp'`` are
            synthesized with ``infer.infer``; any other code goes to Coqui TTS.

    Returns:
        A ``(whisper_text, gpt_response, voice)`` tuple. ``voice`` is
        ``(hps.data.sampling_rate, waveform)`` for ``'cn'``/``'jp'`` and the
        path of a temporary ``.wav`` file otherwise.
    """
    missing_audio = "ERROR: You have to either use the microphone or upload an audio file"

    # Pre-set so the transcript is defined even when transcription itself fails.
    whisper_text = ""
    try:
        whisper_text = translate(audio)
        if whisper_text == missing_audio:
            gpt_response = "MISSING AUDIO: Record your voice by clicking the microphone button, do not forget to stop recording before sending your message ;)"
        else:
            gpt_response = openai_create(whisper_text)
    except Exception as exc:
        # Keep whatever transcript was already obtained instead of calling
        # translate() a second time: a repeat call doubles the remote request
        # and can raise again from inside the handler.
        print(f"chat_hf: request failed: {exc!r}")
        gpt_response = """Sorry, I'm quite busy right now, but please try again later :)"""

    # to voice
    print(language)
    if language in ['cn', 'jp']:
        # Drop all spaces and line breaks before cleaning the text for infer.
        text = gpt_response.strip().replace(' ', '').replace('\n', '').replace('\r', '')
        text = infer.clean_text(text)
        waveform = infer.infer(text, net_g_ms, 0, "demo")
        voice_out = (hps.data.sampling_rate, waveform)
        return whisper_text, gpt_response, voice_out

    # delete=False: the file must outlive this call so the caller can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(gpt_response, fp, speaker={"language": language})
    return whisper_text, gpt_response, fp.name
def translate(audio):
    """Run ``audio`` through the loaded Whisper interface and return its output.

    Despite the name, the request is made in "transcribe" mode. The result is
    printed before it is returned.
    """
    print("\nβ€”\nSending audio to Whisper ...\nβ€”\n")
    transcript = whisper(audio, None, "transcribe", fn_index=0)
    print(transcript)
    return transcript
def openai_create(prompt):
    """Request a completion for ``prompt`` and return the first choice's text.

    The text is printed before it is returned.
    """
    print("\nβ€”\nGiving response from ai ...\nβ€”\n")

    request = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0.9,
        "max_tokens": 150,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0.6,
        # Generation stops when either speaker marker is produced.
        "stop": [" Human:", " AI:"],
    }
    response = openai.Completion.create(**request)

    answer = response.choices[0].text
    print(answer)
    return answer
# Page layout: heading and language selector on top, then one row holding an
# input column (recorder, token box, submit button) and an output column
# (transcript, reply text, synthesized audio).
with gr.Blocks() as blocks:
    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    radio = gr.Radio(label="Language", choices=LANGUAGES, value=default_lang)

    with gr.Row(equal_height=True):
        # Input column.
        with gr.Column():
            audio_file = gr.Audio(source="microphone", type="filepath")
            custom_token = gr.Textbox(
                label='If it fails, use your own session token',
                placeholder="your own session token",
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")

        # Output column.
        with gr.Column():
            text1 = gr.Textbox(label="Speech to Text")
            text2 = gr.Textbox(label="ChatGPT Response")
            audio = gr.Audio(label="Output", interactive=False)

    # Clicking "Submit" feeds the three inputs to chat_hf and shows its
    # three return values in the output column.
    submit.click(
        fn=chat_hf,
        inputs=[audio_file, custom_token, radio],
        outputs=[text1, text2, audio],
    )

blocks.launch(debug=True)