# app.py
# (Hosting-page residue kept as a comment so the file parses as Python:
# "akshayvkt's picture", "Update app.py", "history blame", "2.52 kB".)
import gradio as gr
import openai
import requests
import json
import os
# Credentials and identifiers are read from the environment so that no secret
# is hard-coded in this file.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Conversation history, kept at module level and appended to by `transcribe`.
# It is seeded with the system prompt that sets the persona and reply length.
messages = [
    {
        "role": "system",
        "content": 'You are Steve Jobs. Respond to all input in 25 words or less.',
    },
]

# Set up the text-to-speech API endpoint URL and headers.
url = f"https://api.elevenlabs.io/v1/text-to-speech/{os.environ.get('voice_id')}/stream"
headers = {
    "accept": "*/*",
    "xi-api-key": os.environ.get('elevenlabs_api_key'),
    "Content-Type": "application/json",
}
# Define a function to handle the Gradio input and generate the response
def transcribe(audio):
    """Run one voice-chat turn: recorded speech in, transcript and speech out.

    The recording is transcribed, the transcript is sent to the chat model
    together with the conversation so far, and the model's reply is sent to
    the text-to-speech endpoint and saved to disk.

    Args:
        audio: Filesystem path of the recorded user audio.

    Returns:
        A ``(chat_transcript, audio_path)`` tuple. ``chat_transcript`` is the
        non-system conversation history rendered as ``role: content``
        paragraphs. ``audio_path`` is ``"output.wav"`` when speech synthesis
        succeeded and ``None`` when it failed, so a stale or missing file is
        never handed back to the UI.
    """
    # API call 1: transcribe the user's audio input. The context manager
    # closes the file handle even if the API call raises.
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)

    # Append the user's message to the message history. `messages` is only
    # mutated in place, never rebound, so no `global` declaration is needed.
    messages.append({"role": "user", "content": transcript["text"]})

    # API call 2: generate a reply from the whole conversation so far.
    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    reply = completion["choices"][0]["message"]

    # Record the reply as a plain dict so it shows up in the transcript and is
    # sent back as context on later turns.
    messages.append({"role": reply["role"], "content": reply["content"]})

    # API call 3: synthesize the reply text with the voice synthesis API.
    data = {
        "text": reply["content"],
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }
    tts_response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

    # Save the streamed audio to a file, or report the failure.
    audio_path = None
    if tts_response.ok:
        audio_path = "output.wav"
        with open(audio_path, "wb") as f:
            for chunk in tts_response.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    else:
        print(f"Error: {tts_response.status_code} - {tts_response.reason}")

    # Generate a chat transcript for display in the Gradio UI; the system
    # prompt is internal and is left out.
    chat_transcript = "".join(
        f"{message['role']}: {message['content']}\n\n"
        for message in messages
        if message['role'] != 'system'
    )
    return chat_transcript, audio_path
# Build the Gradio interface: microphone audio is passed to `transcribe` as a
# file path, and the two return values are shown as text and as audio.
ui = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=['text', 'audio'],
)