"""Smart GPT: a voice chat demo.

Pipeline: microphone recording -> Google speech recognition ->
OpenAI gpt-3.5-turbo -> gTTS speech synthesis -> audio reply,
all wired together with a Gradio interface.
"""
import os

import gradio as gr
import openai
import speech_recognition as sr
from gtts import gTTS
from playsound import playsound  # NOTE(review): unused here — confirm before removing
from pydub import AudioSegment  # NOTE(review): unused here — confirm before removing

# Fail fast with a KeyError if the API key is not configured.
openai.api_key = os.environ['api_key']

# Single shared recognizer instance, reused across requests.
r = sr.Recognizer()


def generate_response(prompt):
    """Send *prompt* to gpt-3.5-turbo and return the assistant's reply text.

    Uses the legacy (pre-1.0) openai ChatCompletion API, matching the
    installed library this script was written against.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0]['message']['content']


def transcribe(audio, lang):
    """Turn a spoken question into a spoken ChatGPT answer.

    Parameters:
        audio: filesystem path to the recorded audio clip (Gradio
            supplies this because the input uses type="filepath").
        lang: BCP-47-ish language code used both for recognition and
            for speech synthesis (e.g. "en", "vi", "nl").

    Returns:
        Path of the generated MP3 file for Gradio to play back.
    """
    # Use a distinct local for the recorded AudioData instead of
    # shadowing the `audio` path parameter.
    with sr.AudioFile(audio) as source:
        recording = r.record(source)
    text = r.recognize_google(recording, language=lang)
    reply = generate_response(text)
    tts = gTTS(text=reply, lang=lang)
    out = "tmp.mp3"
    tts.save(out)
    return out


with open('gradio_article.md') as f:
    article = f.read()

interface_options = {
    "title": "Smart GPT",
    "description": "Let's have a chat! Talk to me, and I'll respond in a jiffy!",
    "article": article,
    "layout": "horizontal",
    "theme": "default",
}

inputs = gr.Audio(source="microphone", type="filepath")
outputs = "audio"
# The dropdown is constructed with value="en", so the old
# `if lang.value == "": lang.value = "en"` fallback was dead code
# and has been removed.
lang = gr.Dropdown(choices=["en", "vi", "nl"], value="en", resettable=False)

gr.Interface(
    fn=transcribe,
    inputs=[inputs, lang],
    outputs=outputs,
    live=False,
    allow_clear=False,
    **interface_options,
).launch()

# TODO
# Custom voice
# VALL-E
# https://cloud.google.com/text-to-speech/custom-voice/docs/quickstart
# Mozilla TTS
# OpenSeq2Seq
# Best VN: Vbee, FPT
# Elevenlabs for English