# talk-to-smolvox / app.py
# Hugging Face Space by Steveeeeeeen (commit d1fe398).
import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa
# Select the compute device once; fall back to CPU when CUDA is unavailable
# so the app can still start (slowly) on machines without a GPU instead of
# crashing at import time.
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speech-to-text-chat pipeline: consumes raw audio plus the conversation
# "turns" and generates the assistant's reply.
pipe = transformers.pipeline(
    model="reach-vb/smolvox-smollm2-whisper-turbo",
    trust_remote_code=True,
    device=_device,
)

# Separate ASR pipeline used only to transcribe the user's audio for the
# on-screen transcript.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo", device=_device
)
# Optional Twilio TURN relay for WebRTC. When credentials are present in the
# environment, mint an ephemeral token and force relayed ICE candidates
# (needed behind restrictive NATs, e.g. on hosted Spaces); otherwise fall
# back to the component's default configuration.
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    # tokens.create() returns short-lived STUN/TURN server credentials.
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    # No credentials: let WebRTC use its built-in defaults.
    rtc_configuration = None
def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
    """Handle one finished voice turn from the WebRTC stream.

    Runs the speech-chat model on the user's audio, transcribes the audio
    with Whisper for display, appends the user/assistant turns to both
    histories, and yields AdditionalOutputs so the UI components refresh.

    audio is a (sample_rate, samples) pair as delivered by the WebRTC
    component; samples are assumed to be int16 PCM (scaled by 1/32768).
    """
    source_rate, samples = audio
    # Normalize to float32 in [-1, 1) and resample to the 16 kHz rate both
    # models expect.
    resampled = librosa.resample(
        samples.astype(np.float32) / 32768.0,
        orig_sr=source_rate,
        target_sr=16000,
    )

    # Snapshot the history *before* this turn is appended — the model sees
    # the new utterance only via the audio input.
    prior_turns = list(transformers_chat)
    reply = pipe(
        {"audio": resampled, "turns": prior_turns, "sampling_rate": 16000},
        max_new_tokens=2048,
    )
    # Whisper transcript of what the user said, for the on-screen log.
    heard = whisper({"array": resampled.squeeze(), "sampling_rate": 16000})

    # Record the exchange in both histories.
    for log in (conversation, transformers_chat):
        log.append({"role": "user", "content": heard["text"]})
        log.append({"role": "assistant", "content": reply})
    yield AdditionalOutputs(transformers_chat, conversation)
def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
    """Handle a typed message from the textbox.

    Appends the user's message to both histories, generates a reply with the
    chat pipeline, records it, and returns the updated
    (transformers_chat, conversation) pair for the Gradio outputs.
    """
    # Ignore empty / whitespace-only submissions: return state unchanged.
    if not user_text.strip():
        return transformers_chat, conversation

    # Log the user turn in both histories before generation (the pipeline
    # call below therefore sees the new message inside "turns" as well).
    conversation.append({"role": "user", "content": user_text})
    transformers_chat.append({"role": "user", "content": user_text})

    # Generate a response; the pipeline accepts text input via the "text" key.
    reply = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)

    for log in (conversation, transformers_chat):
        log.append({"role": "assistant", "content": reply})
    return transformers_chat, conversation
with gr.Blocks() as demo:
    # Page header / usage instructions.
    # Fix: the original copy said "talk naturally to Ultravox", but this
    # Space serves the Smolvox model — the name is corrected below.
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Talk to Smolvox Smollm2 1.7b (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Smolvox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
    </p>
    """
    )

    # Model-side conversation history, seeded with the system prompt. Kept
    # separate from the Chatbot transcript so the system turn is not shown.
    transformers_chat = gr.State(
        value=[
            {
                "role": "system",
                "content": "You are a friendly and helpful character. You love to answer questions for people.",
            }
        ]
    )

    # Chat transcript at the top of the page.
    transcript = gr.Chatbot(label="Transcript", type="messages")

    # Lower row: text input and audio input side by side.
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                placeholder="Type your message here and press Enter...", label="Your Message"
            )
        with gr.Column(scale=1):
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )

    # Audio stream: transcribe() fires when the speaker pauses; the 90 s
    # time_limit matches the header copy above.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
    # Route transcribe()'s AdditionalOutputs back into the state/chatbot.
    audio.on_additional_outputs(
        lambda t, g: (t, g),
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

    # Text input: generate a reply when the user presses Enter...
    text_input.submit(
        respond_text,
        inputs=[text_input, transformers_chat, transcript],
        outputs=[transformers_chat, transcript],
    )
    # ...and clear the textbox after submission.
    text_input.submit(lambda: "", inputs=[], outputs=[text_input])

if __name__ == "__main__":
    demo.launch()