import os
import tempfile

import gradio as gr
import openai
import whisper
from neon_tts_plugin_coqui import CoquiTTS

# Load the speech-recognition model once at startup.
model = whisper.load_model("small")


class Dost:
    LANGUAGES = list(CoquiTTS.langs.keys())
    coquiTTS = CoquiTTS()
    # The legacy openai client (<1.0) picks up OPENAI_API_KEY from the
    # environment on its own; indexing it here makes a missing key fail fast.
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

    def __init__(self):
        self.convHistory = []
        self.voice = None
        self.result = []

    def recognize(self, audio):
        """Transcribe the recorded audio and detect its language with Whisper."""
        audio = whisper.load_audio(audio)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        _, probs = model.detect_language(mel)
        lang = max(probs, key=probs.get)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        print("-------------------RECOGNIZE---------------------")
        print(result)
        self.response(result.text, lang)

    def response(self, prompt, lang):
        """Send the transcript to GPT-3 and hand the reply to the TTS step."""
        response = openai.Completion.create(
            model="text-davinci-002",
            prompt=f"You: {prompt}\nFriend: ",  # newline separates the two turns
            temperature=0.5,
            max_tokens=60,
            top_p=1.0,
            frequency_penalty=0.5,
            presence_penalty=0.0,
            stop=["You:"],
        )
        choice = response["choices"][0]["text"]
        print("-------------------RESPONSE---------------------")
        print(choice)
        self.convHistory.append((prompt, choice))
        self.result.append(self.convHistory)  # first output: chatbot history
        print(self.convHistory[0])
        print(type(self.convHistory[0]))
        self.say(choice, lang)

    def say(self, text, language):
        """Synthesize the reply with Coqui TTS, falling back to English."""
        coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro',
                       'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
        if language not in coqui_langs:
            language = 'en'
        # delete=False keeps the wav file around for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            self.coquiTTS.get_tts(text, fp, speaker={"language": language})
            print("-------------------AUDIOOUTPUT---------------------")
            print("DONE", fp.name)
        self.result.append(fp.name)  # second output: path to the voice reply

    def start(self, audio, state):
        """Gradio entry point: run the recognize -> response -> say pipeline."""
        self.convHistory = state  # the state list is mutated in place, so it persists
        self.result = []
        self.recognize(audio)
        print(self.result)
        return tuple(self.result)  # (conversation history, wav path)


dost = Dost()

with gr.Blocks() as demo:
    state = gr.State([])
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(source="microphone", type="filepath")
            btn = gr.Button("Submit")
        conversation = gr.Chatbot(value=dost.convHistory)
        output_audio = gr.Audio(label="AI voice response")
    btn.click(dost.start, inputs=[input_audio, state],
              outputs=[conversation, output_audio])

demo.launch(debug=True)
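
# --- Environment notes (assumptions, not part of the original script) ---
# The code above targets the library versions it was written against:
#   * openai < 1.0  (openai.Completion.create with text-davinci-002 is the
#     legacy completions API, removed in openai 1.x)
#   * gradio 3.x    (gr.Audio(source="microphone") was renamed to
#     sources=[...] in gradio 4.x)
# A minimal setup sketch, assuming the script is saved as app.py:
#   pip install openai-whisper "openai<1" "gradio<4" neon-tts-plugin-coqui
#   export OPENAI_API_KEY=sk-...   # read via os.environ at class definition
#   python app.py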