# NOTE(review): the lines that originally preceded this file ("Spaces:",
# "Runtime error", file size, commit hashes, and a line-number gutter) were
# artifacts of the hosting page the code was scraped from, not program text.
# They have been commented out so the module is importable.
import os
import json
import openai
import tempfile
import gradio as gr
import infer
import config
from neon_tts_plugin_coqui import CoquiTTS
# Page title rendered in the Gradio UI header.
title = "Speech to ChatGPT to Speech"
# Coqui TTS engine used for all non-CJK languages.
coquiTTS = CoquiTTS()
# Languages offered in the UI: everything Coqui supports, plus 'cn'/'jp'
# which are routed to the local VITS model in `infer` instead of Coqui.
LANGUAGES = list(CoquiTTS.langs.keys())
LANGUAGES = LANGUAGES + ['cn', 'jp']
default_lang = "en"
# Speech-to-text backend: a remotely hosted Whisper Space loaded as a callable.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
# OpenAI key is read from the environment; may be None if unset.
api_key = os.environ.get('api_key')
#if you have OpenAI API key as a string, enable the below
openai.api_key = api_key
# VITS checkpoint + config paths come from the local `config` module;
# the model is loaded once at import time (side effect).
pth_path = config.pth_path
config_json = config.config_json
net_g_ms, hps = infer.load_model(config_json, pth_path)
# ChatGPT
# ChatGPT
def chat_hf(audio, custom_token, language):
    """Transcribe *audio* with Whisper, get a ChatGPT reply, and voice it.

    Args:
        audio: filepath of the recorded/uploaded audio clip.
        custom_token: unused fallback session token from the UI textbox.
        language: language code selected in the UI; 'cn'/'jp' use the local
            VITS model, anything else uses Coqui TTS.

    Returns:
        (whisper_text, gpt_response, voice_out) where voice_out is either a
        (sampling_rate, ndarray) tuple (VITS) or a wav filepath (Coqui).
    """
    # Transcribe once, outside the try: the original re-ran translate() in
    # the handler, which would re-raise unhandled if translate itself failed.
    whisper_text = translate(audio)
    try:
        if whisper_text == "ERROR: You have to either use the microphone or upload an audio file":
            gpt_response = "MISSING AUDIO: Record your voice by clicking the microphone button, do not forget to stop recording before sending your message ;)"
        else:
            gpt_response = openai_create(whisper_text)
    except Exception:
        # Best-effort fallback when the OpenAI call fails (rate limit, etc.).
        gpt_response = """Sorry, I'm quite busy right now, but please try again later :)"""
    # to voice
    # BUG FIX: original condition was `language == 'cn' or 'jp'`, which is
    # always truthy, so the Coqui branch below was unreachable.
    if language in ('cn', 'jp'):
        text = infer.clean_text(gpt_response)
        audio = infer.infer(text, net_g_ms, 0, "demo")
        voice_out = (hps.data.sampling_rate, audio)
    else:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(gpt_response, fp, speaker={"language": language})
        voice_out = fp.name
    return whisper_text, gpt_response, voice_out
def translate(audio):
    """Forward an audio file to the hosted Whisper Space and return its transcript."""
    print("""
—
Sending audio to Whisper ...
—
""")
    transcript = whisper(audio, None, "transcribe", fn_index=0)
    print(transcript)
    return transcript
def openai_create(prompt):
    """Query text-davinci-003 with *prompt* and return the completion text
    with all spaces, newlines and carriage returns removed."""
    print("""
—
Giving response from ai ...
—
""")
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.9,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0.6,
        stop=[" Human:", " AI:"],
    )
    # NOTE(review): deleting every space presumably targets CJK TTS output,
    # but it also mangles English/Latin-script replies — confirm intent.
    text_out = completion.choices[0].text.strip()
    for junk in (' ', '\n', '\r'):
        text_out = text_out.replace(junk, '')
    print(text_out)
    return text_out
# Build the Gradio UI: left column for input (mic + token + submit),
# right column for outputs (transcript, ChatGPT reply, synthesized audio).
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>" + title + "</h1>")
    # Language selector drives the TTS backend choice in chat_hf.
    radio = gr.Radio(label="Language", choices=LANGUAGES, value=default_lang)
    with gr.Row(equal_height=True):  # equal_height=False
        with gr.Column():  # variant="panel"
            # Microphone recording is saved to a file; chat_hf receives its path.
            audio_file = gr.Audio(source="microphone", type="filepath")
            custom_token = gr.Textbox(label='If it fails, use your own session token', placeholder="your own session token")
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            text1 = gr.Textbox(label="Speech to Text")
            text2 = gr.Textbox(label="ChatGPT Response")
            audio = gr.Audio(label="Output", interactive=False)
    # actions
    # Wire the button: inputs (audio path, token, language) -> chat_hf ->
    # outputs (transcript box, response box, audio player).
    submit.click(
        chat_hf,
        [audio_file, custom_token, radio],
        [text1, text2, audio],
    )
blocks.launch(debug=True)