freddyaboulton (HF staff) committed
Commit 3202126 · Parent(s): 0471c24
Files changed (1): app.py (+41 -28)
app.py CHANGED

```diff
@@ -7,11 +7,14 @@ import os
 import torch
 import librosa
 
-
-pipe = transformers.pipeline(model='fixie-ai/ultravox-v0_4_1-llama-3_1-8b', trust_remote_code=True,
-                             device=torch.device('cuda'))
-whisper = transformers.pipeline(model="openai/whisper-large-v3-turbo",
-                                device=torch.device('cuda'))
+pipe = transformers.pipeline(
+    model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",
+    trust_remote_code=True,
+    device=torch.device("cuda"),
+)
+whisper = transformers.pipeline(
+    model="openai/whisper-large-v3-turbo", device=torch.device("cuda")
+)
 
 account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
 auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
```
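This first hunk only re-wraps the two `transformers.pipeline(...)` constructors into multi-line calls (black-style formatting); the models and the hard `torch.device("cuda")` placement are unchanged, so the Space still requires GPU hardware. A minimal sketch of a device fallback, purely illustrative and not part of the commit:

```python
# Not part of the commit: a hypothetical fallback so the same two pipelines
# can start on CPU-only hardware instead of raising when CUDA is absent.
import torch
import transformers

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",
    trust_remote_code=True,
    device=device,
)
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo", device=device
)
```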
```diff
@@ -29,29 +32,29 @@ else:
     rtc_configuration = None
 
 
-
-def transcribe(audio: tuple[int, np.ndarray], conversation: list[dict], gradio_convo: list[dict]):
+def transcribe(audio: tuple[int, np.ndarray], conversation: list[dict]):
     original_sr = audio[0]
     target_sr = 16000
 
-    audio_sr = librosa.resample(audio[1].astype(np.float32) / 32768.0,
-                                orig_sr=original_sr, target_sr=target_sr)
-
-    output = pipe({"audio": audio_sr, "turns": conversation, "sampling_rate": target_sr},
-                  max_new_tokens=512)
+    audio_sr = librosa.resample(
+        audio[1].astype(np.float32) / 32768.0, orig_sr=original_sr, target_sr=target_sr
+    )
+
+    output = pipe(
+        {"audio": audio_sr, "turns": conversation, "sampling_rate": target_sr},
+        max_new_tokens=512,
+    )
     transcription = whisper({"array": audio_sr.squeeze(), "sampling_rate": target_sr})
 
-    conversation.append({"role": "user", "content": transcription})
+    conversation.append({"role": "user", "content": transcription["text"]})
     conversation.append({"role": "assistant", "content": output})
-    gradio_convo.append({"role": "user", "content": transcription})
-    gradio_convo.append({"role": "assistant", "content": output})
 
-    yield AdditionalOutputs(conversation, gradio_convo)
+    yield AdditionalOutputs(conversation)
 
 
 with gr.Blocks() as demo:
     gr.HTML(
-        """
+        """
         <h1 style='text-align: center'>
         Talk to Ultravox Llama 3.1 8b (Powered by WebRTC ⚡️)
         </h1>
```
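This hunk carries the substantive fixes. The transformers ASR pipeline returns a dict of the form `{"text": ...}`, so the old code appended that raw dict to the conversation as the user turn; the new code appends `transcription["text"]`. The parallel `gradio_convo` list disappears along with its parameter, and the handler now yields a single `AdditionalOutputs(conversation)`. The preprocessing itself is only reformatted: Gradio hands the handler a `(sample_rate, int16 array)` tuple, so dividing by 32768.0 (2**15) maps the PCM samples into [-1.0, 1.0) floats before `librosa.resample` brings them to the 16 kHz rate both models expect. A self-contained sketch of that step:

```python
# Minimal sketch of the preprocessing inside transcribe(); to_model_audio is
# a hypothetical helper name used only for illustration.
import numpy as np
import librosa


def to_model_audio(audio: tuple[int, np.ndarray], target_sr: int = 16000) -> np.ndarray:
    original_sr, samples = audio
    # int16 PCM -> float32 in [-1.0, 1.0): divide by 2**15 = 32768
    floats = samples.astype(np.float32) / 32768.0
    # Resample to the 16 kHz rate Ultravox and Whisper expect
    return librosa.resample(floats, orig_sr=original_sr, target_sr=target_sr)


# One second recorded at 48 kHz comes back as 16,000 samples
print(to_model_audio((48000, np.zeros(48000, dtype=np.int16))).shape)  # (16000,)
```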
```diff
@@ -64,24 +67,34 @@ with gr.Blocks() as demo:
         </p>
         """
     )
-    transformers_convo = gr.State(value=[{
-        "role": "system",
-        "content": "You are a friendly and helpful character. You love to answer questions for people."
-    }])
     with gr.Row():
-        with gr.Column():
+        with gr.Group():
+            transcript = gr.Chatbot(
+                label="transcript",
+                type="messages",
+                value=[
+                    {
+                        "role": "system",
+                        "content": "You are a friendly and helpful character. You love to answer questions for people.",
+                    }
+                ],
+            )
             audio = WebRTC(
                 rtc_configuration=rtc_configuration,
                 label="Stream",
                 mode="send",
                 modality="audio",
             )
-        with gr.Column():
-            transcript = gr.Chatbot(label="transcript", type="messages")
 
-    audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript], outputs=[audio], time_limit=90)
-    audio.on_additional_outputs(lambda s,a: (s,a), outputs=[transformers_convo, transcript],
-                                queue=False, show_progress="hidden")
+    audio.stream(
+        ReplyOnPause(transcribe),
+        inputs=[audio, transcript],
+        outputs=[audio],
+        time_limit=90,
+    )
+    audio.on_additional_outputs(
+        lambda s: s, outputs=[transcript], queue=False, show_progress="hidden"
+    )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
```
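The UI hunk consolidates state. The separate `gr.State` copy of the conversation and the second `gr.Column` are gone: the `gr.Chatbot` with `type="messages"` becomes the single source of truth, seeded with the system prompt, fed into `transcribe` as an input, and written back through `on_additional_outputs`, whose callback accordingly shrinks from `lambda s, a: (s, a)` to the identity `lambda s: s`. As I read the gradio_webrtc API (app.py's imports sit above this diff), values a stream handler yields inside `AdditionalOutputs` are not sent over the WebRTC track; they are delivered to the `on_additional_outputs` callback, whose return value updates the listed components. A sketch of that round trip under those assumptions:

```python
# Hedged sketch of the AdditionalOutputs round trip; the handler body is a
# stub and the wiring mirrors app.py after this commit.
import gradio as gr
from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC


def handler(audio, messages):
    # ... run ASR and the LLM on `audio` here ...
    messages = messages + [{"role": "user", "content": "(transcribed speech)"}]
    # Wrapped values skip the audio track and go to on_additional_outputs.
    yield AdditionalOutputs(messages)


with gr.Blocks() as demo:
    transcript = gr.Chatbot(label="transcript", type="messages", value=[])
    audio = WebRTC(mode="send", modality="audio")
    audio.stream(ReplyOnPause(handler), inputs=[audio, transcript], outputs=[audio])
    # Identity callback: the yielded messages list becomes the Chatbot value.
    audio.on_additional_outputs(lambda msgs: msgs, outputs=[transcript])
```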
 