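"""Talk to Ultravox Llama 3.1 8b over WebRTC.

A Gradio Space that streams microphone audio to the Ultravox speech-language
model and shows a running transcript. Each conversation is capped at 90
seconds; after the limit the user can rejoin.
"""
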
import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa
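
# Ultravox consumes raw audio together with a chat history ("turns") and
# generates the assistant's text reply directly, without a separate ASR step.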
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",
    trust_remote_code=True,
    device=torch.device("cuda"),
)
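
# Whisper is only used to transcribe the user's speech for display in the
# chat transcript; Ultravox itself works on the raw audio.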
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo", device=torch.device("cuda")
)
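
# WebRTC traffic usually needs a TURN relay when the app runs in a cloud
# environment such as a Hugging Face Space. If Twilio credentials are set,
# fetch short-lived ICE servers from Twilio; otherwise fall back to the
# browser defaults.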
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None


def transcribe(
    audio: tuple[int, np.ndarray],
    transformers_chat: list[dict],
    conversation: list[dict],
):
    """Handle one user utterance: run Ultravox on the audio, transcribe it
    with Whisper, and append both sides of the exchange to the histories."""
    original_sr = audio[0]
    target_sr = 16000

    # Convert int16 PCM to float32 in [-1, 1] and resample to the 16 kHz
    # rate both models expect.
    audio_sr = librosa.resample(
        audio[1].astype(np.float32) / 32768.0, orig_sr=original_sr, target_sr=target_sr
    )

    # Copy the history so the pipeline sees the turns up to (but not
    # including) this utterance; the utterance itself arrives as audio.
    tf_input = [d for d in transformers_chat]

    output = pipe(
        {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
        max_new_tokens=512,
    )
    transcription = whisper({"array": audio_sr.squeeze(), "sampling_rate": target_sr})

    conversation.append({"role": "user", "content": transcription["text"]})
    conversation.append({"role": "assistant", "content": output})
    transformers_chat.append({"role": "user", "content": transcription["text"]})
    transformers_chat.append({"role": "assistant", "content": output})

    yield AdditionalOutputs(transformers_chat, conversation)


with gr.Blocks() as demo:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Talk to Ultravox Llama 3.1 8b (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Ultravox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
    </p>
    """
    )
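
    # `transformers_chat` is the model-facing chat history, seeded with a
    # system prompt; `transcript` mirrors the exchange for display in the UI.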
    with gr.Row():
        transformers_chat = gr.State(
            value=[
                {
                    "role": "system",
                    "content": "You are a friendly and helpful character. You love to answer questions for people.",
                }
            ]
        )
        with gr.Group():
            transcript = gr.Chatbot(label="transcript", type="messages")
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
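
    # ReplyOnPause buffers the incoming stream and invokes `transcribe` each
    # time the speaker pauses; time_limit caps a session at 90 seconds.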
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
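
    # AdditionalOutputs yielded by `transcribe` land here; the lambda simply
    # forwards the updated history and transcript to their components.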
    audio.on_additional_outputs(
        lambda t, g: (t, g),
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

if __name__ == "__main__":
    demo.launch()