"""Voice assistant demo.

Records microphone audio via Gradio, transcribes it with OpenAI Whisper,
generates a short reply with the text-davinci-003 completion model, and
speaks the reply through an ElevenLabs voice. Playback uses ``winsound``,
so the script is Windows-only as written.
"""

import io
import os
import winsound

import gradio as gr
import openai
from elevenlabslib import ElevenLabsUser
from pydub import AudioSegment
from pydub.playback import play  # NOTE(review): unused — kept for compatibility

# SECURITY: never hard-code live API keys in source (the original embedded
# both an OpenAI and an ElevenLabs key). Read them from the environment;
# a KeyError here is a clear, early failure if they are missing.
openai.api_key = os.environ["OPENAI_API_KEY"]
api_key = os.environ["ELEVENLABS_API_KEY"]

user = ElevenLabsUser(api_key)

# Running conversation transcript, seeded with a system-style instruction.
messages = ["Respond with voice"]


def transcribe(audio):
    """Transcribe one spoken turn, reply, speak the reply, return transcript.

    Parameters
    ----------
    audio : str
        Filepath to the recorded clip (Gradio ``type="filepath"``).

    Returns
    -------
    str
        The full chat transcript so far, one message per line.
    """
    # 'with' guarantees the upload handle is closed (original leaked it).
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
    messages.append(f"\nUser: {transcript['text']}")

    response = openai.Completion.create(
        engine="text-davinci-003",
        # NOTE(review): only the latest user turn is sent — earlier context
        # is accumulated in `messages` but never fed back to the model.
        prompt=messages[-1],
        max_tokens=60,
        n=1,
        stop=None,
        temperature=0.5,
    )
    system_message = response["choices"][0]["text"]
    messages.append(f"{system_message}")

    # Synthesize the reply; use a distinct name instead of shadowing the
    # `audio` parameter as the original did.
    voice = user.get_voices_by_name("Bella")[0]
    reply_bytes = voice.generate_audio_bytes(system_message)
    segment = AudioSegment.from_file(io.BytesIO(reply_bytes), format="mp3")
    segment.export("output.wav", format="wav")
    # winsound is Windows-only; blocks until playback finishes.
    winsound.PlaySound("output.wav", winsound.SND_FILENAME)

    return "\n".join(messages)


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Voice Assistant",
)

# Guard the launch so importing this module does not start the web server.
if __name__ == "__main__":
    iface.launch()