"""Gradio app: speech-to-text (Deepgram), content generation (GPT-4),
and text-to-speech (Eleven Labs), wired into a single microphone demo."""

import asyncio
import os

import gradio as gr
import openai
import requests
from deepgram import Deepgram

# NOTE(review): none of the vocode imports below are referenced in this file —
# they look like leftovers from an earlier streaming implementation. Kept to
# preserve any import-time side effects; confirm before deleting.
from vocode.streaming.models.transcriber import (
    DeepgramTranscriberConfig,
    PunctuationEndpointingConfig,
)
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_streaming_microphone_input_and_speaker_output

# API keys and voice ID come from the environment. os.getenv returns None when
# a variable is unset, so a missing key surfaces as an auth failure at call
# time rather than at import time.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Clients are created once at import time and shared by all requests.
client = openai.OpenAI(api_key=OPENAI_API_KEY)
deepgram = Deepgram(DEEPGRAM_API_KEY)


async def transcribe_audio(audio_file_path):
    """Transcribe a WAV file using Deepgram's prerecorded API.

    Args:
        audio_file_path: Path to a WAV file on disk.

    Returns:
        The transcript string from the first channel's best alternative.
    """
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    response = await deepgram.transcription.prerecorded(
        {"buffer": audio_data, "mimetype": "audio/wav"},
        {'punctuate': True, 'language': 'en'},
    )
    transcription = response['results']['channels'][0]['alternatives'][0]['transcript']
    return transcription


def generate_content(input_text):
    """Generate a GPT-4 chat completion for *input_text*.

    Returns:
        The assistant's reply with surrounding whitespace stripped.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_text},
        ],
    )
    generated_text = response.choices[0].message.content.strip()
    return generated_text


def text_to_speech(text):
    """Synthesize *text* with Eleven Labs and return the path to the MP3.

    Returns:
        The path "output.mp3" (overwritten on every call).

    Raises:
        gr.Error: if the Eleven Labs API responds with a non-200 status.
            (The previous version returned the error string here, which the
            caller then fed to gr.Audio as a bogus file path; raising surfaces
            the failure cleanly in the UI instead.)
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVEN_LABS_API_KEY,
    }
    data = {
        "text": text,
        "voice_settings": {
            "stability": 0.75,
            "similarity_boost": 0.75,
        },
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code != 200:
        raise gr.Error(f"Error: {response.status_code} - {response.text}")
    with open("output.mp3", "wb") as f:
        f.write(response.content)
    return "output.mp3"


async def process_audio(audio):
    """Run the full pipeline: transcribe, generate content, synthesize speech.

    Args:
        audio: Filepath of the recorded audio, or None if nothing was recorded.

    Returns:
        A (transcription, generated_text, audio_file_path) tuple matching the
        three Gradio output components.
    """
    if audio is None:
        # Gradio passes None when the user submits without recording; fail
        # with a readable UI error instead of open(None) blowing up later.
        raise gr.Error("No audio received - please record something first.")
    transcription = await transcribe_audio(audio)
    generated_text = generate_content(transcription)
    audio_file = text_to_speech(generated_text)
    return transcription, generated_text, audio_file


interface = gr.Interface(
    # process_audio is a coroutine; run it to completion per request.
    fn=lambda audio: asyncio.run(process_audio(audio)),
    inputs=gr.Audio(type="filepath", label="Speak into your microphone"),
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.Textbox(label="Generated Content"),
        gr.Audio(label="Synthesized Speech"),
    ],
    title="Speech-to-Text, Content Generation, and Text-to-Speech",
    description=(
        "Speak into the microphone, and the system will transcribe your "
        "speech, generate content, and convert the generated text into speech."
    ),
)

if __name__ == "__main__":
    # Launch only when run as a script, so importing this module (e.g. from
    # tests) does not start a web server as a side effect.
    interface.launch()