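# app.py: "Speech to ChatGPT to Speech" Gradio demo.
# Pipeline: microphone audio -> Whisper (remote Space, speech-to-text)
# -> OpenAI completion -> text-to-speech (Coqui TTS, or a local model
# for Chinese/Japanese).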
import os
import tempfile

import gradio as gr
import openai
from neon_tts_plugin_coqui import CoquiTTS

# Local modules bundled with this app: inference code and checkpoint/config
# paths for the Chinese/Japanese voice model.
import infer
import config

title = "Speech to ChatGPT to Speech"
coquiTTS = CoquiTTS()

# Coqui TTS covers its built-in languages; 'cn' and 'jp' are handled by the
# local model loaded below.
LANGUAGES = list(CoquiTTS.langs.keys()) + ['cn', 'jp']
default_lang = "en"

# Speech recognition is delegated to a public Whisper Space.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")

# Read the OpenAI API key from the environment; assign a literal string here
# instead if you keep the key in the code.
api_key = os.environ.get('api_key')
openai.api_key = api_key
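# On Hugging Face Spaces, "api_key" can be stored as a repository secret, which
# is exposed to the app as an environment variable. Locally, for example:
#   export api_key="sk-..."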

# Load the cn/jp voice model once at startup; paths come from config.py.
pth_path = config.pth_path
config_json = config.config_json
net_g_ms, hps = infer.load_model(config_json, pth_path)


# Main handler: transcribe the recording, get a ChatGPT-style reply, then
# synthesize it as speech. custom_token is collected by the UI but unused here.
def chat_hf(audio, custom_token, language):
    whisper_text = translate(audio)
    if whisper_text == "ERROR: You have to either use the microphone or upload an audio file":
        gpt_response = "MISSING AUDIO: Record your voice by clicking the microphone button, do not forget to stop recording before sending your message ;)"
    else:
        try:
            gpt_response = openai_create(whisper_text)
        except Exception:
            gpt_response = "Sorry, I'm quite busy right now, but please try again later :)"
    
    # Text to speech: the local model handles 'cn'/'jp', Coqui TTS the rest.
    print(language)
    if language in ['cn', 'jp']:
        # The cn/jp model expects cleaned text without spaces or line breaks.
        text = gpt_response.strip().replace(' ', '').replace('\n', '').replace('\r', '')
        text = infer.clean_text(text)
        wav = infer.infer(text, net_g_ms, 0, "demo")
        voice_out = (hps.data.sampling_rate, wav)
        return whisper_text, gpt_response, voice_out
    else:
        # Write the synthesized speech to a temp file that Gradio can serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(gpt_response, fp, speaker={"language": language})
        return whisper_text, gpt_response, fp.name



# Despite its name, this transcribes rather than translates: the remote Whisper
# Space is called in "transcribe" mode and returns the recognized text.
def translate(audio):
    print("Sending audio to Whisper ...")
    text_result = whisper(audio, None, "transcribe", fn_index=0)
    print(text_result)
    return text_result


def openai_create(prompt):
    print("Getting a response from OpenAI ...")
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.9,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0.6,
        stop=[" Human:", " AI:"],
    )
    print(response.choices[0].text)
    return response.choices[0].text
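# Note: "text-davinci-003" is a legacy completions model. With the same
# pre-1.0 openai library, a chat-model equivalent would look roughly like:
#
#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "user", "content": prompt}],
#         temperature=0.9,
#         max_tokens=150,
#     )
#     reply = response.choices[0].message["content"]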


with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>" + title + "</h1>")
    radio = gr.Radio(label="Language", choices=LANGUAGES, value=default_lang)
    with gr.Row(equal_height=True):
        with gr.Column():
            audio_file = gr.Audio(source="microphone", type="filepath")
            custom_token = gr.Textbox(label='If it fails, use your own session token', placeholder="your own session token")
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            text1 = gr.Textbox(label="Speech to Text")
            text2 = gr.Textbox(label="ChatGPT Response")
            audio = gr.Audio(label="Output", interactive=False)
    # actions
    submit.click(
        chat_hf,
        [audio_file, custom_token, radio],
        [text1, text2, audio],
    )
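    # chat_hf returns (transcript, reply, speech); each maps onto one output above.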

# debug=True prints full tracebacks to the console, which helps while developing.
blocks.launch(debug=True)