File size: 3,263 Bytes
ec89123
 
 
 
 
 
 
 
 
bfb4b00
ec89123
4396409
 
 
ec89123
bfb4b00
 
ec89123
bfb4b00
 
ec89123
 
 
 
63a89c3
ec89123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe4dcd5
ec89123
 
 
 
 
 
 
 
 
 
 
 
 
4396409
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import tempfile
import time
from io import BytesIO
from typing import IO

import gradio as gr
import requests
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from openai import OpenAI
from PIL import Image

# Basic-auth credentials for the Gradio app, read from the environment.
# NOTE(review): if either var is unset these are None and launch(auth=...)
# will receive (None, None) — confirm both are set in deployment.
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")

# Initialize Clients
# OpenAI client: used below to drive the Assistants API (threads/runs).
openai_api_key = os.getenv("OPENAI_API_KEY")
client_openai = OpenAI(api_key=openai_api_key)

# ElevenLabs client: used below for text-to-speech conversion.
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
client_elevenlabs = ElevenLabs(api_key=elevenlabs_api_key)

def text_to_speech_stream(text: str) -> IO[bytes]:
    """Convert *text* to speech with ElevenLabs and return an in-memory MP3 stream.

    The returned ``BytesIO`` is positioned at the start, ready for reading.
    """
    audio_chunks = client_elevenlabs.text_to_speech.convert(
        voice_id="VQE7uwgOwnnTI8oKNjqz", # Digitalized voice of Malcolm X
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=VoiceSettings(
            stability=0.0,
            similarity_boost=1.0,
            style=0.0,
            use_speaker_boost=True,
        ),
    )

    # Collect the streamed chunks (skipping any empty ones) into a single
    # seekable in-memory buffer.
    audio_stream = BytesIO(b"".join(chunk for chunk in audio_chunks if chunk))
    audio_stream.seek(0)
    return audio_stream


def generate_assistant_response(user_message):
    """Run *user_message* through the OpenAI assistant and voice the reply.

    Creates a fresh thread, posts the message, runs the assistant, converts
    the text reply to speech via ElevenLabs, and writes the audio to a temp
    file so Gradio's Audio component can play it.

    Returns:
        str: path to a temporary ``.mp3`` file containing the spoken reply.

    Raises:
        RuntimeError: if the assistant run ends in a non-successful terminal
            state (failed / cancelled / expired).
    """
    assistant = client_openai.beta.assistants.retrieve(
        assistant_id="asst_EzgIYI1atVqvV4tRvy6YmQni"
    )

    thread = client_openai.beta.threads.create()
    client_openai.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=user_message,
    )

    run = client_openai.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id,
    )

    # Poll until the run reaches a terminal state. The previous version
    # busy-waited with no sleep and hung forever when a run ended in a
    # failure state, since only 'completed' broke the loop.
    while True:
        run_status = client_openai.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        if run_status.status == 'completed':
            break
        if run_status.status in ('failed', 'cancelled', 'expired'):
            raise RuntimeError(
                f"Assistant run ended with status: {run_status.status}"
            )
        time.sleep(0.5)  # avoid hammering the API while waiting

    # Messages are returned newest-first; the assistant's reply is first.
    messages = client_openai.beta.threads.messages.list(thread_id=thread.id)
    assistant_response = messages.data[0].content[0].text.value

    # Convert to voice using ElevenLabs; persist to a temp file because the
    # Gradio Audio output expects a file path.
    audio_stream = text_to_speech_stream(assistant_response)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        temp_file.write(audio_stream.getvalue())
        temp_audio_path = temp_file.name

    return temp_audio_path  # Return the temporary file path

# Local path of the illustrative image shown beside the chat controls.
image_url = "image.png"

# Build the Gradio UI: image on the left, message box / audio / button on
# the right. Clicking the button voices the assistant's reply.
with gr.Blocks() as interface:
    gr.Markdown("## Malcolm X")

    with gr.Row():
        with gr.Column(scale=1):
            # Illustrative image panel.
            gr.Image(image_url, elem_id="illustrative-image")
        with gr.Column(scale=3):
            message_box = gr.Textbox(label="Your message")
            reply_audio = gr.Audio(label="Assistant's Response")
            send_button = gr.Button("Generate Response")
            send_button.click(
                generate_assistant_response,
                inputs=message_box,
                outputs=reply_audio,
            )

# Protect the app with HTTP basic auth using the env-provided credentials.
interface.launch(auth=(username, password))