# Malcolm_X / app.py — Hugging Face Space by jeremierostan (commit fe4dcd5, 3.26 kB)
# NOTE: the lines above were non-Python page chrome from the hosted file view;
# they are preserved here as comments so the module parses.
# Standard library
import os
import tempfile
import time
from io import BytesIO
from typing import IO

# Third-party
import gradio as gr
import requests
from PIL import Image
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from openai import OpenAI
# Basic-auth credentials for the Gradio app, read from the environment
# (set as secrets on the hosting platform).
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")
# Initialize Clients
# API keys also come from the environment; the clients are created once at
# module import and shared by all requests.
openai_api_key = os.getenv("OPENAI_API_KEY")
client_openai = OpenAI(api_key=openai_api_key)
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
client_elevenlabs = ElevenLabs(api_key=elevenlabs_api_key)
def text_to_speech_stream(text: str) -> IO[bytes]:
    """Synthesize *text* with the ElevenLabs API and return the MP3 audio
    as an in-memory byte stream positioned at the start.

    Parameters
    ----------
    text : str
        The text to speak.

    Returns
    -------
    IO[bytes]
        A ``BytesIO`` holding the complete MP3 data, seeked to offset 0.
    """
    voice_config = VoiceSettings(
        stability=0.0,
        similarity_boost=1.0,
        style=0.0,
        use_speaker_boost=True,
    )
    # Request the conversion; the client yields the audio as a chunk stream.
    audio_chunks = client_elevenlabs.text_to_speech.convert(
        voice_id="VQE7uwgOwnnTI8oKNjqz",  # Digitalized voice of Malcolm X
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=voice_config,
    )
    # Concatenate the non-empty chunks into one in-memory buffer.
    buffer = BytesIO(b"".join(chunk for chunk in audio_chunks if chunk))
    buffer.seek(0)  # rewind so callers can read from the beginning
    return buffer
def generate_assistant_response(user_message):
    """Send *user_message* to the OpenAI Assistant, synthesize the reply with
    ElevenLabs, and return the path of a temporary MP3 file for Gradio.

    Parameters
    ----------
    user_message : str
        The text entered by the user.

    Returns
    -------
    str
        Filesystem path of a temporary ``.mp3`` file holding the spoken reply.

    Raises
    ------
    RuntimeError
        If the assistant run ends in a non-successful terminal state.
    """
    assistant = client_openai.beta.assistants.retrieve(
        assistant_id="asst_EzgIYI1atVqvV4tRvy6YmQni"
    )
    # A fresh thread per request: no conversation memory across calls.
    thread = client_openai.beta.threads.create()
    client_openai.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=user_message
    )
    run = client_openai.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id
    )
    # Poll until the run reaches a terminal state. Sleeping between polls
    # avoids hammering the API (the original loop busy-waited), and checking
    # failure states avoids an infinite loop when the run never completes.
    while True:
        run_status = client_openai.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id
        )
        if run_status.status == 'completed':
            break
        if run_status.status in ('failed', 'cancelled', 'expired'):
            raise RuntimeError(
                f"Assistant run ended with status: {run_status.status}"
            )
        time.sleep(0.5)
    messages = client_openai.beta.threads.messages.list(thread_id=thread.id)
    # messages.data[0] is the most recent message, i.e. the assistant's reply.
    assistant_response = messages.data[0].content[0].text.value
    # Convert to voice using ElevenLabs and persist to a temp file so the
    # Gradio Audio component can serve it by path.
    audio_stream = text_to_speech_stream(assistant_response)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        temp_file.write(audio_stream.getvalue())
        temp_audio_path = temp_file.name
    return temp_audio_path  # Return the temporary file path
# Local path of the illustrative image shown beside the chat controls.
image_url = "image.png"
with gr.Blocks() as interface:
    gr.Markdown("## Malcolm X")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(image_url, elem_id="illustrative-image")  # Add the illustrative image here
        with gr.Column(scale=3):
            input_text = gr.Textbox(label="Your message")
            output_audio = gr.Audio(label="Assistant's Response")
    btn = gr.Button("Generate Response")
    # Wire the button: user text in -> temporary MP3 path out (gr.Audio plays it).
    btn.click(generate_assistant_response, inputs=input_text, outputs=output_audio)
# Protect the app with HTTP basic auth using the env-provided credentials.
interface.launch(auth=(username, password))