# Hugging Face Space: speech-to-text -> GPT-4 content generation -> text-to-speech demo.
# --- Imports (stdlib first, then third-party) -----------------------------
import asyncio
import os

import gradio as gr
import openai
import requests
from deepgram import Deepgram
from vocode.helpers import create_streaming_microphone_input_and_speaker_output
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.models.transcriber import (
    DeepgramTranscriberConfig,
    PunctuationEndpointingConfig,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber

# --- Configuration pulled from the environment ----------------------------
# NOTE(review): os.getenv returns None when unset; the client constructors
# below will then fail at startup — confirm the Space defines all four vars.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Service clients ------------------------------------------------------
client = openai.OpenAI(api_key=OPENAI_API_KEY)  # OpenAI chat-completions client
deepgram = Deepgram(DEEPGRAM_API_KEY)           # Deepgram speech-to-text client
# --- Speech-to-text -------------------------------------------------------
async def transcribe_audio(audio_file_path):
    """Transcribe a WAV file with Deepgram's prerecorded API.

    Returns the transcript string of the first alternative on channel 0.
    """
    with open(audio_file_path, 'rb') as fh:
        payload = {"buffer": fh.read(), "mimetype": "audio/wav"}
    options = {'punctuate': True, 'language': 'en'}
    response = await deepgram.transcription.prerecorded(payload, options)
    # Deepgram nests the text under results -> channels -> alternatives.
    return response['results']['channels'][0]['alternatives'][0]['transcript']
# --- Content generation ---------------------------------------------------
def generate_content(input_text):
    """Send *input_text* to GPT-4 and return the stripped reply text."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text},
    ]
    completion = client.chat.completions.create(model="gpt-4", messages=messages)
    return completion.choices[0].message.content.strip()
# --- Text-to-speech -------------------------------------------------------
def text_to_speech(text):
    """Synthesize *text* via the Eleven Labs API and save it to output.mp3.

    Args:
        text: The text to synthesize.

    Returns:
        The path of the written MP3 file ("output.mp3").

    Raises:
        RuntimeError: if the API responds with a non-200 status. (The
            previous version returned the error message string, which the
            caller then handed to gr.Audio as if it were a file path.)
        requests.Timeout: if the API does not respond within 60 seconds.
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVEN_LABS_API_KEY
    }
    data = {
        "text": text,
        "voice_settings": {
            "stability": 0.75,
            "similarity_boost": 0.75
        }
    }
    # A timeout keeps a stalled API call from hanging the Gradio worker forever.
    response = requests.post(url, json=data, headers=headers, timeout=60)
    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code} - {response.text}")
    with open("output.mp3", "wb") as f:
        f.write(response.content)
    return "output.mp3"
# --- Full pipeline: speech -> text -> GPT-4 -> speech ---------------------
async def process_audio(audio):
    """Run transcription, content generation, and synthesis on *audio*.

    Returns a (transcript, generated_text, audio_path) triple.
    """
    transcript = await transcribe_audio(audio)
    reply = generate_content(transcript)
    speech_path = text_to_speech(reply)
    return transcript, reply, speech_path
# --- Gradio UI ------------------------------------------------------------
def _run_pipeline(audio):
    """Synchronous wrapper so Gradio can drive the async pipeline."""
    return asyncio.run(process_audio(audio))


interface = gr.Interface(
    fn=_run_pipeline,
    inputs=gr.Audio(type="filepath", label="Speak into your microphone"),
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.Textbox(label="Generated Content"),
        gr.Audio(label="Synthesized Speech"),
    ],
    title="Speech-to-Text, Content Generation, and Text-to-Speech",
    description="Speak into the microphone, and the system will transcribe your speech, generate content, and convert the generated text into speech.",
)

# Launch the Gradio interface.
interface.launch()