Spaces:

Kabilash10
/

Vocode-VoiceAI

Runtime error

App Files Files Community

Vocode-VoiceAI / app.py

Kabilash10

Update app.py

630a9eb verified 2 months ago

raw

history blame

3.67 kB

	import gradio as gr
	import requests
	import openai
	import asyncio
	import os
	from deepgram import Deepgram
	from vocode.streaming.models.transcriber import (
	DeepgramTranscriberConfig,
	PunctuationEndpointingConfig,
	)
	from vocode.streaming.models.agent import ChatGPTAgentConfig
	from vocode.streaming.models.message import BaseMessage
	from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
	from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
	from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
	from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
	from vocode.streaming.streaming_conversation import StreamingConversation
	from vocode.helpers import create_streaming_microphone_input_and_speaker_output

	# Credentials and the ElevenLabs voice ID are read from the environment.
	# A variable that is not set is read as None.
	DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
	ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
	VOICE_ID = os.environ.get("VOICE_ID")
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

	# Shared OpenAI client used by generate_content().
	client = openai.OpenAI(api_key=OPENAI_API_KEY)

	# Shared Deepgram client used by transcribe_audio().
	deepgram = Deepgram(DEEPGRAM_API_KEY)

	# Function to transcribe audio using Deepgram
	async def transcribe_audio(audio_file_path):
	    """Transcribe an audio file with Deepgram's prerecorded endpoint.

	    Args:
	        audio_file_path: Path of the audio file to read and upload.

	    Returns:
	        The transcript of the first alternative of the first channel in
	        Deepgram's response.
	    """
	    # The MIME type sent to Deepgram is chosen from the file extension so
	    # that non-WAV inputs are not mislabelled. Unknown extensions keep the
	    # previous hard-coded value, "audio/wav".
	    mimetype_by_extension = {
	        ".wav": "audio/wav",
	        ".mp3": "audio/mpeg",
	        ".ogg": "audio/ogg",
	        ".flac": "audio/flac",
	        ".m4a": "audio/mp4",
	        ".webm": "audio/webm",
	    }
	    extension = os.path.splitext(audio_file_path)[1].lower()
	    mimetype = mimetype_by_extension.get(extension, "audio/wav")

	    with open(audio_file_path, "rb") as audio_file:
	        audio_data = audio_file.read()

	    response = await deepgram.transcription.prerecorded(
	        {"buffer": audio_data, "mimetype": mimetype},
	        {"punctuate": True, "language": "en"},
	    )
	    return response["results"]["channels"][0]["alternatives"][0]["transcript"]

	# Function to generate content using OpenAI GPT-4
	def generate_content(input_text):
	    """Ask the "gpt-4" chat model to respond to ``input_text``.

	    Args:
	        input_text: Text sent as the user message.

	    Returns:
	        The content of the first returned choice with surrounding
	        whitespace stripped, or an empty string when the message carries
	        no text content.
	    """
	    response = client.chat.completions.create(
	        model="gpt-4",
	        messages=[
	            {"role": "system", "content": "You are a helpful assistant."},
	            {"role": "user", "content": input_text},
	        ],
	    )
	    # message.content may be None; guard before calling .strip().
	    content = response.choices[0].message.content
	    return (content or "").strip()

	# Function to convert text to speech using Eleven Labs
	def text_to_speech(text):
	    """Synthesize ``text`` with the Eleven Labs text-to-speech endpoint.

	    The request targets the voice named by the module-level ``VOICE_ID``
	    and authenticates with ``ELEVEN_LABS_API_KEY``.

	    Args:
	        text: Text to synthesize.

	    Returns:
	        "output.mp3" after the response body has been written to that
	        file when the API answers with status 200; otherwise a string of
	        the form "Error: <status> - <body>".

	    Raises:
	        requests.exceptions.RequestException: If the request fails at the
	            network level, including when no response arrives within the
	            timeout.
	    """
	    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
	    headers = {
	        "Accept": "audio/mpeg",
	        "Content-Type": "application/json",
	        "xi-api-key": ELEVEN_LABS_API_KEY,
	    }
	    data = {
	        "text": text,
	        "voice_settings": {
	            "stability": 0.75,
	            "similarity_boost": 0.75,
	        },
	    }
	    # requests waits indefinitely unless a timeout is given; bound the wait
	    # so a stalled connection cannot hang the caller forever.
	    response = requests.post(url, json=data, headers=headers, timeout=60)

	    if response.status_code != 200:
	        # NOTE(review): callers receive this string in the slot where they
	        # otherwise get a file path -- confirm that is the intended contract.
	        return f"Error: {response.status_code} - {response.text}"

	    # The file name is fixed and relative, so each successful call
	    # overwrites the previous output in the current working directory.
	    with open("output.mp3", "wb") as f:
	        f.write(response.content)
	    return "output.mp3"

	# Main function to handle the entire process
	async def process_audio(audio):
	    """Run transcription, text generation and speech synthesis in sequence.

	    Args:
	        audio: Path of the recorded audio file, or None / empty when no
	            audio was supplied.

	    Returns:
	        A tuple ``(transcription, generated_text, audio_file)``. When no
	        audio was supplied the tuple is ``("", "", None)`` and no external
	        service is called.
	    """
	    # Without this guard a missing recording reaches open() inside
	    # transcribe_audio() and fails there.
	    if not audio:
	        return "", "", None

	    transcription = await transcribe_audio(audio)
	    generated_text = generate_content(transcription)
	    audio_file = text_to_speech(generated_text)
	    return transcription, generated_text, audio_file

	# Gradio interface setup
	# The input component hands the handler a file path.
	microphone_input = gr.Audio(type="filepath", label="Speak into your microphone")

	# Output components, in the order process_audio() returns its values.
	result_components = [
	    gr.Textbox(label="Transcription Output"),
	    gr.Textbox(label="Generated Content"),
	    gr.Audio(label="Synthesized Speech"),
	]

	interface = gr.Interface(
	    # process_audio is a coroutine function; the lambda drives it to
	    # completion with asyncio.run and returns its result.
	    fn=lambda audio: asyncio.run(process_audio(audio)),
	    inputs=microphone_input,
	    outputs=result_components,
	    title="Speech-to-Text, Content Generation, and Text-to-Speech",
	    description="Speak into the microphone, and the system will transcribe your speech, generate content, and convert the generated text into speech.",
	)

	# Launch the Gradio interface
	interface.launch()