|
import os |
|
import gradio as gr |
|
import whisper |
|
import openai |
|
import tempfile |
|
from neon_tts_plugin_coqui import CoquiTTS |
|
|
|
# Load the Whisper ASR model once at import time; shared by all requests.
model = whisper.load_model("small")
|
|
|
class Dost:
    """Voice assistant pipeline: microphone audio -> Whisper transcription ->
    GPT-3 text reply -> Coqui TTS spoken reply.

    Instance state:
        convHistory: list of (user_text, bot_text) pairs, as shown by gr.Chatbot.
        result:      per-call output accumulator; after a full pass it holds
                     [convHistory, path_to_reply_wav], matching the two Gradio
                     outputs wired to `start`.
        voice:       unused; kept for backward compatibility.
    """

    # Languages the Coqui TTS plugin advertises (exposed for potential UI use).
    LANGUAGES = list(CoquiTTS.langs.keys())
    coquiTTS = CoquiTTS()

    # Fail fast at import time (KeyError) if the key is absent, and wire it
    # into the openai client explicitly instead of relying on the library's
    # implicit OPENAI_API_KEY env-var lookup.
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    openai.api_key = OPENAI_API_KEY

    def __init__(self):
        self.convHistory = []  # (user_text, bot_text) pairs
        self.voice = None      # unused; kept for backward compatibility
        self.result = []       # per-call outputs: [convHistory, wav_path]

    def recognize(self, audio):
        """Transcribe the audio file at path `audio` and generate a reply.

        Side effect: delegates to self.response(), which extends self.result.
        """
        audio = whisper.load_audio(audio)
        audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30s windows

        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect the spoken language so the TTS reply can match it.
        _, probs = model.detect_language(mel)
        lang = max(probs, key=probs.get)

        options = whisper.DecodingOptions(fp16=False)  # fp32: safe on CPU
        result = whisper.decode(model, mel, options)

        print("-------------------RECOGNIZE---------------------")
        print(result)
        self.response(result.text, lang)

    def response(self, prompt, lang):
        """Ask GPT-3 for a conversational reply to `prompt`; speak it via TTS.

        Appends (prompt, reply) to self.convHistory and pushes the history
        onto self.result, then hands the reply text to self.say().
        """
        response = openai.Completion.create(
            model="text-davinci-002",
            # Newline separates the user turn from the assistant cue; without
            # it the model sees "...user textFriend:" fused into one token run.
            prompt=f"You: {prompt}\nFriend: ",
            temperature=0.5,
            max_tokens=60,
            top_p=1.0,
            frequency_penalty=0.5,
            presence_penalty=0.0,
            stop=["You:"],  # stop before the model invents the next user turn
        )
        choice = response['choices'][0]['text']
        print("-------------------RESPONSE---------------------")
        print(choice)
        self.convHistory.append((prompt, choice))
        self.result.append(self.convHistory)
        self.say(choice, lang)

    def say(self, text, language):
        """Synthesize `text` to a temp WAV file; append its path to self.result.

        Falls back to English when `language` is not supported by Coqui.
        """
        coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg',
                       'nl', 'fi', 'sl', 'lv', 'ga']
        if language not in coqui_langs:
            language = 'en'  # safe default for unsupported languages
        # delete=False: Gradio must read the file after this scope exits.
        # NOTE(review): these WAVs accumulate in the temp dir until the OS
        # cleans them — consider an explicit cleanup strategy.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            self.coquiTTS.get_tts(text, fp, speaker={"language": language})
            print("-------------------AUDIOOUTPUT---------------------")
            print("DONE", fp.name)
        self.result.append(fp.name)

    def start(self, audio, state):
        """Gradio click callback.

        Args:
            audio: filepath of the recorded clip, or None if nothing recorded.
            state: per-session chat history from gr.State.

        Returns:
            (conversation_history, reply_wav_path) for the two outputs.
        """
        self.convHistory = state
        self.result = []
        # Guard: Submit pressed with no recording — keep the session unchanged
        # instead of crashing inside whisper.load_audio(None).
        if audio is None:
            return self.convHistory, None
        self.recognize(audio)
        print(self.result)
        return tuple(self.result)
|
|
|
# Single shared assistant instance. Per-session chat history travels through
# gr.State (passed into and returned from dost.start), not this object.
dost = Dost()

with gr.Blocks() as demo:
    # Per-session conversation history: list of (user_text, bot_text) tuples.
    state = gr.State([])

    with gr.Row():
        with gr.Column():
            # Microphone recording, delivered to the callback as a WAV filepath.
            input_audio = gr.Audio(source="microphone", type="filepath")
            btn = gr.Button("Submit")
            conversation = gr.Chatbot(value=dost.convHistory)
            output_audio = gr.Audio(label="AI voice response")

    # dost.start returns (convHistory, wav_path), mapped onto the two outputs.
    btn.click(dost.start, inputs=[input_audio, state], outputs=[conversation, output_audio])

# debug=True surfaces tracebacks in the console while developing.
demo.launch(debug=True)