Rcarvalo committed on
Commit fc67d54 · verified · 1 Parent(s): eec54dd

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py +141 -129
app.py CHANGED
@@ -1,12 +1,15 @@
  """
- Gradio app for LFM2-Audio speech-to-speech demo
- Compatible with Hugging Face Spaces
  """

  import gradio as gr
  import numpy as np
  import torch
- import torchaudio

  from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
@@ -28,132 +31,138 @@ mimi = mimi.to(device)
  print(f"Models loaded on {device}")


- def generate_response(audio_input, temperature, top_k, chat_state):
-     """Generate speech-to-speech response"""
-
-     if audio_input is None:
-         return None, "Please record audio first", chat_state
-
-     # Parse audio input
-     rate, wav = audio_input
-
-     # Convert to torch tensor
-     if wav.dtype == np.int16:
-         wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
-     else:
-         wav_tensor = torch.tensor(wav, dtype=torch.float32)
-
-     # Ensure mono and correct shape (channels, samples)
-     if len(wav_tensor.shape) > 1:
-         wav_tensor = wav_tensor.mean(dim=-1)
-
-     # add_audio expects shape (channels, samples), so add channel dimension
-     if len(wav_tensor.shape) == 1:
-         wav_tensor = wav_tensor.unsqueeze(0)
-
-     # Initialize chat state if empty
-     if len(chat_state.text) == 1:
-         chat_state.new_turn("system")
-         chat_state.add_text("Respond with interleaved text and audio.")
-         chat_state.end_turn()
-
-     # Add user audio
-     chat_state.new_turn("user")
-     chat_state.add_audio(wav_tensor, rate)
-     chat_state.end_turn()
-
-     # Start assistant turn
-     chat_state.new_turn("assistant")
-
-     # Set generation parameters
-     temp = None if temperature == 0 else float(temperature)
-     topk = None if top_k == 0 else int(top_k)
-
-     # Generate response
-     text_out = []
-     audio_out = []
-     modality_out = []
-
-     full_text = ""
-
-     print("Generating response...")
-     with torch.no_grad():
          for t in model.generate_interleaved(
-             **chat_state,
              max_new_tokens=1024,
              audio_temperature=temp,
              audio_top_k=topk,
          ):
-             if t.numel() == 1:  # Text token
-                 text_out.append(t)
-                 modality_out.append(LFMModality.TEXT)
-                 decoded = processor.text.decode(t)
-                 full_text += decoded
-                 print(decoded, end="", flush=True)
-             elif t.numel() == 8:  # Audio token
-                 audio_out.append(t)
-                 modality_out.append(LFMModality.AUDIO_OUT)
-
-     print("\nGeneration complete")
-
-     # Clean up text
-     full_text = full_text.replace("<|text_end|>", "").strip()
-
-     # Decode audio (remove last end-of-audio token)
-     if len(audio_out) > 1:
-         mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
-         with torch.no_grad():
-             waveform = mimi.decode(mimi_codes)[0]
-
-         # Convert to numpy for Gradio
-         audio_np = waveform.cpu().numpy()
-         audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
-     else:
-         audio_output = None
-
-     # Update chat state
-     if text_out and audio_out:
-         chat_state.append(
-             text=torch.stack(text_out, 1),
-             audio_out=torch.stack(audio_out, 1),
-             modality_flag=torch.tensor(modality_out, device=device),
-         )
-
-     chat_state.end_turn()
-     chat_state.new_turn("user")
-
-     return audio_output, full_text, chat_state
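The decode step in the removed handler relies on a fixed shape contract: each audio token is an 8-element tensor (one code per Mimi codebook), so stacking T of them along dim 1 gives (8, T), and `unsqueeze(0)` adds the batch axis Mimi expects. A shape-only sketch with dummy tokens (illustrative, not from the commit):

```python
import torch

# Ten dummy audio tokens, one code per Mimi codebook (illustrative only)
audio_out = [torch.zeros(8, dtype=torch.long) for _ in range(10)]

# Drop the trailing end-of-audio token, stack frames along dim 1, add a batch axis
mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
print(mimi_codes.shape)  # torch.Size([1, 8, 9]) -> (batch, codebooks, frames)
```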


- def reset_chat():
-     """Reset chat state"""
-     return ChatState(processor), "", None


  # Create Gradio interface
- with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
      gr.Markdown("""
-     # LFM2-Audio Speech-to-Speech Chat
-
-     Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
-
      **How to use:**
-     1. Click the microphone button to record your voice
-     2. Adjust temperature and top-k parameters if needed (or leave defaults)
-     3. Click "Generate Response"
-     4. Listen to the audio response and read the text transcription
-
-     **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
      """)

      chat_state = gr.State(ChatState(processor))

      with gr.Row():
          with gr.Column():
-             audio_input = gr.Audio(
-                 sources=["microphone"],
-                 type="numpy",
-                 label="Record your voice"
              )

              with gr.Row():
@@ -163,7 +172,7 @@ with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
                      value=1.0,
                      step=0.1,
                      label="Temperature (0 for greedy)",
-                     info="Higher = more creative, lower = more deterministic"
                  )
                  top_k = gr.Slider(
                      minimum=0,
@@ -171,47 +180,50 @@ with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
                      value=4,
                      step=1,
                      label="Top-k (0 for no filtering)",
-                     info="Number of top tokens to sample from"
                  )

-             generate_btn = gr.Button("Generate Response", variant="primary")
-             reset_btn = gr.Button("Reset Chat")

          with gr.Column():
-             text_output = gr.Textbox(
-                 label="Assistant Response (Text)",
-                 lines=4,
-                 interactive=False
-             )
-             audio_output = gr.Audio(
-                 label="Assistant Response (Audio)",
-                 type="numpy",
                  interactive=False
              )

      gr.Markdown("""
-     ### About LFM2-Audio
-
-     LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
-     - Real-time speech-to-speech conversations
-     - Low-latency interleaved text and audio generation
-     - Natural flowing conversations
-
-     [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
      """)

-     # Event handlers
-     generate_btn.click(
-         fn=generate_response,
-         inputs=[audio_input, temperature, top_k, chat_state],
-         outputs=[audio_output, text_output, chat_state]
      )

-     reset_btn.click(
-         fn=reset_chat,
-         outputs=[chat_state, text_output, audio_output]
      )


  if __name__ == "__main__":
-     demo.launch()
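Both the handler removed above and its replacement below normalize Gradio microphone input the same way: int16 PCM is scaled to float32 in [-1, 1], stereo is mixed down to mono, and a leading channel axis is added because `add_audio` expects (channels, samples). A self-contained sketch of that plumbing (the helper name is illustrative, not from the repo):

```python
import numpy as np
import torch

def prepare_audio(audio: tuple[int, np.ndarray]) -> tuple[torch.Tensor, int]:
    """Illustrative helper: normalize Gradio mic input to float32 (channels, samples)."""
    rate, wav = audio
    # int16 PCM -> float32 in [-1.0, 1.0]
    if wav.dtype == np.int16:
        wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
    else:
        wav_tensor = torch.tensor(wav, dtype=torch.float32)
    # Mix stereo (samples, channels) down to mono (samples,)
    if wav_tensor.ndim > 1:
        wav_tensor = wav_tensor.mean(dim=-1)
    # add_audio expects (channels, samples), so add the channel axis
    return wav_tensor.unsqueeze(0), rate
```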
 
  """
+ Real-time WebRTC speech-to-speech demo with fastrtc
+ Based on the original liquid-audio demo
  """

+ from queue import Queue
+ from threading import Thread
+
  import gradio as gr
  import numpy as np
  import torch
+ from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC

  from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
 
  print(f"Models loaded on {device}")


+ def chat_producer(
+     q: Queue[torch.Tensor | None],
+     chat: ChatState,
+     temp: float | None,
+     topk: int | None,
+ ):
+     """Producer thread that generates tokens"""
+     print(f"Starting generation with state {chat}.")
+     with torch.no_grad(), mimi.streaming(1):
          for t in model.generate_interleaved(
+             **chat,
              max_new_tokens=1024,
              audio_temperature=temp,
              audio_top_k=topk,
          ):
+             q.put(t)
+
+             if t.numel() > 1:
+                 if (t == 2048).any():
+                     continue
+
+                 wav_chunk = mimi.decode(t[None, :, None])[0]
+                 q.put(wav_chunk)
+
+     q.put(None)
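The structural change in the new version is this producer/consumer split: generation runs on a worker thread that pushes tokens into a `Queue`, and a `None` sentinel tells the consumer the stream is finished. The pattern in isolation, with generic stand-in work rather than the model API:

```python
from queue import Queue
from threading import Thread

def producer(q: Queue) -> None:
    for token in range(5):  # stand-in for model.generate_interleaved(...)
        q.put(token)
    q.put(None)  # sentinel: no more tokens

q: Queue = Queue()
Thread(target=producer, args=(q,)).start()
while (token := q.get()) is not None:
    print("consumed", token)  # the consumer can yield downstream while generation continues
```

This keeps the consumer free to yield partial results to WebRTC while the model is still generating, instead of blocking until the full response exists.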


+ def chat_response(audio: tuple[int, np.ndarray], _id: str, chat: ChatState, temp: float | None = 1.0, topk: int | None = 4):
+     """Handle incoming audio and generate a streaming response"""
+     if temp == 0:
+         temp = None
+     if topk == 0:
+         topk = None
+
+     if temp is not None:
+         temp = float(temp)
+     if topk is not None:
+         topk = int(topk)
+
+     if len(chat.text) == 1:
+         chat.new_turn("system")
+         chat.add_text("Respond with interleaved text and audio.")
+         chat.end_turn()
+
+     chat.new_turn("user")
+
+     rate, wav = audio
+     # Convert to tensor with proper shape (channels, samples)
+     wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
+
+     # Ensure correct shape
+     if len(wav_tensor.shape) == 1:
+         wav_tensor = wav_tensor.unsqueeze(0)
+     elif len(wav_tensor.shape) > 1:
+         # If stereo, convert to mono
+         wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
+
+     chat.add_audio(wav_tensor, rate)
+     chat.end_turn()
+
+     chat.new_turn("assistant")
+
+     q: Queue[torch.Tensor | None] = Queue()
+     chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
+     chat_thread.start()
+
+     out_text: list[torch.Tensor] = []
+     out_audio: list[torch.Tensor] = []
+     out_modality: list[LFMModality] = []
+
+     while True:
+         t = q.get()
+         if t is None:
+             break
+         elif t.numel() == 1:  # text
+             out_text.append(t)
+             out_modality.append(LFMModality.TEXT)
+             print(processor.text.decode(t), end="")
+             cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
+             yield AdditionalOutputs(cur_string)
+         elif t.numel() == 8:
+             out_audio.append(t)
+             out_modality.append(LFMModality.AUDIO_OUT)
+         elif t.numel() == 1920:
+             np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
+             yield (24_000, np_chunk)
+         else:
+             raise RuntimeError(f"unexpected shape: {t.shape}")
+
+     chat.append(
+         text=torch.stack(out_text, 1),
+         audio_out=torch.stack(out_audio, 1),
+         modality_flag=torch.tensor(out_modality, device=device),
+     )

+     chat.end_turn()
+     chat.new_turn("user")


+ def clear():
+     """Clear chat history"""
+     gr.Info("Cleared chat history", duration=3)
+     return ChatState(processor), None
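The consumer loop above dispatches purely on tensor size: 1 element is a text token, 8 is one Mimi codebook frame (kept for the chat history), and 1920 is a decoded PCM chunk, i.e. 80 ms of audio at 24 kHz since 24_000 × 0.08 = 1920. Converting one such chunk into the (rate, int16 array) tuple that gets yielded looks like this (dummy chunk, for illustration):

```python
import numpy as np
import torch

chunk = torch.zeros(1920)  # one decoded 80 ms frame of float PCM at 24 kHz (dummy data)
pcm16 = (chunk.cpu().numpy() * 32_767).astype(np.int16)
frame = (24_000, pcm16)  # (sample rate, int16 samples), as yielded to the WebRTC stream
```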


  # Create Gradio interface
+ with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
      gr.Markdown("""
+     # LFM2-Audio Real-time Speech-to-Speech Chat
+
+     **Real-time WebRTC streaming** powered by fastrtc. Talk naturally and get instant responses!
+
      **How to use:**
+     1. Click "Allow" when prompted for microphone access
+     2. Start speaking - the model listens and responds in real time
+     3. The conversation flows naturally with minimal latency
+
+     **Features:**
+     - 🎙️ Real-time WebRTC streaming
+     - ⚡ Low-latency responses
+     - 💬 Interleaved text and audio output
+     - 🔄 Multi-turn conversations
      """)

      chat_state = gr.State(ChatState(processor))

      with gr.Row():
          with gr.Column():
+             webrtc = WebRTC(
+                 modality="audio",
+                 mode="send-receive",
+                 full_screen=False,
              )

              with gr.Row():
                      value=1.0,
                      step=0.1,
                      label="Temperature (0 for greedy)",
+                     info="Higher = more creative"
                  )
                  top_k = gr.Slider(
                      minimum=0,

                      value=4,
                      step=1,
                      label="Top-k (0 for no filtering)",
+                     info="Sampling diversity"
                  )

+             clear_btn = gr.Button("Reset Chat")
 

          with gr.Column():
+             text_out = gr.Textbox(
+                 lines=10,
+                 label="Conversation Text",
                  interactive=False
              )

      gr.Markdown("""
+     ### About this demo
+
+     This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech interaction with minimal latency.
+     The model processes your speech and generates both text and audio responses simultaneously.

+     **Model**: LFM2-Audio-1.5B by Liquid AI
+     **Mode**: Interleaved generation (optimized for real-time use)
+     **Audio codec**: Mimi (24 kHz)

+     [Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
      """)

+     # Set up WebRTC streaming
+     webrtc.stream(
+         ReplyOnPause(
+             chat_response,  # type: ignore[arg-type]
+             input_sample_rate=24_000,
+             output_sample_rate=24_000,
+             can_interrupt=False,
+         ),
+         inputs=[webrtc, chat_state, temperature, top_k],
+         outputs=[webrtc],
      )

+     webrtc.on_additional_outputs(
+         lambda s: s,
+         outputs=[text_out],
      )

+     clear_btn.click(clear, outputs=[chat_state, text_out])


  if __name__ == "__main__":
+     demo.launch()
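On the wiring at the bottom: `ReplyOnPause` wraps a generator handler and calls it whenever the caller stops speaking, extra Gradio inputs are passed after the audio argument, and anything yielded as `AdditionalOutputs` is routed through `on_additional_outputs` rather than the audio track. A minimal skeleton of the same setup (simplified from the code above; treat exact signatures as approximate):

```python
import gradio as gr
import numpy as np
from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC

def echo(audio: tuple[int, np.ndarray]):
    # Illustrative handler: report a side-channel value, then echo the audio back
    yield AdditionalOutputs("heard a pause")
    yield audio

with gr.Blocks() as demo:
    rtc = WebRTC(modality="audio", mode="send-receive")
    log = gr.Textbox(interactive=False)
    rtc.stream(ReplyOnPause(echo), inputs=[rtc], outputs=[rtc])
    rtc.on_additional_outputs(lambda s: s, outputs=[log])

if __name__ == "__main__":
    demo.launch()
```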