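"""Talk to Ultravox (Llama 3.1 8B) over WebRTC in a Gradio app.

Dependencies (assumed from the imports below; versions not pinned):
gradio, gradio_webrtc, transformers, torch, librosa, numpy, twilio.
Set TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN to fetch Twilio TURN servers;
otherwise the demo falls back to a default WebRTC configuration.
"""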
import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa

# Ultravox is a speech-language model: it consumes the raw audio together
# with the chat history and generates the assistant's text reply directly.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",
    trust_remote_code=True,
    device=torch.device("cuda"),
)
# Whisper is only used to transcribe the user's speech so it can be shown
# in the chat transcript; Ultravox never sees this transcription.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo", device=torch.device("cuda")
)

# Twilio credentials are optional. When provided, fetch ephemeral TURN
# servers so the WebRTC connection works behind restrictive NATs.
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()

    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    # Fall back to the component's default STUN-only configuration.
    rtc_configuration = None


def transcribe(
    audio: tuple[int, np.ndarray],
    transformers_chat: list[dict],
    conversation: list[dict],
):
    # Gradio delivers audio as (sample_rate, int16 samples); both models
    # expect float32 audio at 16 kHz, so normalize and resample first.
    original_sr = audio[0]
    target_sr = 16000

    audio_sr = librosa.resample(
        audio[1].astype(np.float32) / 32768.0, orig_sr=original_sr, target_sr=target_sr
    )

    # Shallow-copy the history so the pipeline cannot mutate the gr.State value.
    tf_input = list(transformers_chat)

    # Ultravox answers directly from the audio plus the prior turns.
    output = pipe(
        {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
        max_new_tokens=512,
    )
    # Whisper transcribes what the user said for display in the transcript.
    transcription = whisper({"array": audio_sr.squeeze(), "sampling_rate": target_sr})

    conversation.append({"role": "user", "content": transcription["text"]})
    conversation.append({"role": "assistant", "content": output})
    transformers_chat.append({"role": "user", "content": transcription["text"]})
    transformers_chat.append({"role": "assistant", "content": output})

    # AdditionalOutputs hands the updated histories to on_additional_outputs
    # below; no audio is streamed back to the client.
    yield AdditionalOutputs(transformers_chat, conversation)


with gr.Blocks() as demo:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Talk to Ultravox Llama 3.1 8b (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Ultravox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up, you can rejoin the conversation.
    </p>
    """
    )
    with gr.Row():
        transformers_chat = gr.State(
            value=[
                {
                    "role": "system",
                    "content": "You are a friendly and helpful character. You love to answer questions for people.",
                }
            ]
        )
        with gr.Group():
            transcript = gr.Chatbot(label="transcript", type="messages")
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )

    # ReplyOnPause invokes transcribe() each time the user pauses; the 90 s
    # time limit matches the notice in the header above.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
    # Route the histories emitted via AdditionalOutputs back into the state
    # and the visible chatbot.
    audio.on_additional_outputs(
        lambda chat, conversation: (chat, conversation),
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

if __name__ == "__main__":
    demo.launch()