import os
import json
import openai
import tempfile
import gradio as gr
import infer
import config
from neon_tts_plugin_coqui import CoquiTTS
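
# Speech-to-ChatGPT-to-speech demo: Whisper transcribes the user's voice, an
# OpenAI completion model writes a reply, and the reply is spoken back with
# Coqui TTS (or a VITS model for Chinese and Japanese).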
title = "Speech to ChatGPT to Speech"
coquiTTS = CoquiTTS()

LANGUAGES = list(CoquiTTS.langs.keys())
LANGUAGES += ['cn', 'jp']  # Chinese and Japanese are handled by the VITS model instead of Coqui
default_lang = "en"
# Hosted Whisper Space used for speech-to-text
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
# Read the OpenAI API key from the environment; if you have the key as a
# plain string, assign it to openai.api_key directly instead.
api_key = os.environ.get('api_key')
openai.api_key = api_key

# Load the VITS model used for Chinese/Japanese speech synthesis
pth_path = config.pth_path
config_json = config.config_json
net_g_ms, hps = infer.load_model(config_json, pth_path)


# Main pipeline: audio -> Whisper transcript -> ChatGPT response -> spoken reply
def chat_hf(audio, custom_token, language):
    try:
        whisper_text = translate(audio)
        if whisper_text == "ERROR: You have to either use the microphone or upload an audio file":
            gpt_response = "MISSING AUDIO: Record your voice by clicking the microphone button, do not forget to stop recording before sending your message ;)"
        else:
            gpt_response = openai_create(whisper_text)

    except Exception:
        # Most likely the OpenAI call failed; keep the transcription and apologize.
        whisper_text = translate(audio)
        gpt_response = "Sorry, I'm quite busy right now, but please try again later :)"
    
    # Speak the response back in the selected language.
    # Note: `language == 'cn' or 'jp'` was always truthy, so the Coqui branch
    # below was unreachable; the membership test fixes that.
    if language in ('cn', 'jp'):
        # VITS path: strip whitespace the model cannot synthesize
        text = gpt_response.strip().replace(' ', '').replace('\n', '').replace('\r', '')
        text = infer.clean_text(text)
        wav = infer.infer(text, net_g_ms, 0, "demo")
        voice_out = (hps.data.sampling_rate, wav)
    else:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(gpt_response, fp, speaker={"language": language})
            voice_out = fp.name
    return whisper_text, gpt_response, voice_out


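# Send the recorded audio to the hosted Whisper Space and return the transcript.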
def translate(audio):
    print("""

    Sending audio to Whisper ...

    """)
   
    text_result = whisper(audio, None, "transcribe", fn_index=0)
    print(text_result)
    return text_result


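# Ask the OpenAI completions endpoint for a conversational reply to the prompt.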
def openai_create(prompt):
    print("""

    Giving response from ai ...

    """)
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.9,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0.6,
        stop=[" Human:", " AI:"],
    )
    print(response.choices[0].text)
    return response.choices[0].text


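# Gradio UI: a language selector, microphone input, and text/audio outputs.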
with gr.Blocks() as blocks:
    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    radio = gr.Radio(label="Language", choices=LANGUAGES, value=default_lang)
    with gr.Row(equal_height=True):
        with gr.Column():
            audio_file = gr.Audio(source="microphone", type="filepath")
            custom_token = gr.Textbox(label='If it fails, use your own session token', placeholder="your own session token")
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            text1 = gr.Textbox(label="Speech to Text")
            text2 = gr.Textbox(label="ChatGPT Response")
            audio = gr.Audio(label="Output", interactive=False)
    # actions
    submit.click(
        chat_hf,
        [audio_file, custom_token, radio],
        [text1, text2, audio],
    )

blocks.launch(debug=True)