import torch
import torchaudio
import gradio as gr

from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
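
# Load the processor and model once at startup; the model stays in eval mode for inference.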
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device).eval()
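
# A single global ChatState tracks the running multimodal conversation.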
chat = ChatState(processor)


def reset_chat():
    """Start a fresh conversation and clear the UI."""
    global chat
    chat = ChatState(processor)
    return [], "Chat reset successfully."


def generate_response(audio_input, text_input, history):
    """Run one chat turn: feed the user's text and/or audio to the model and return its reply."""
    global chat
    history = history or []

    # On the first turn, prime the conversation with a system prompt.
    if not history:
        chat.new_turn("system")
        chat.add_text("You are a helpful multimodal AI assistant that can reply with both text and audio.")
        chat.end_turn()
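
    # Add the user's typed text and/or recorded audio to the conversation.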
    chat.new_turn("user")
    if text_input:
        chat.add_text(text_input)
    if audio_input:
        wav, sr = torchaudio.load(audio_input)
        chat.add_audio(wav, sr)
    chat.end_turn()

    chat.new_turn("assistant")

    text_out, audio_out, modality_out = [], [], []
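
    # Stream interleaved tokens from the model: single-element tensors are text tokens,
    # multi-element tensors are audio codebook frames.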
    for t in model.generate_interleaved(
        **chat, max_new_tokens=512, audio_temperature=1.0, audio_top_k=4
    ):
        if t.numel() == 1:
            text_out.append(t)
            modality_out.append(LFMModality.TEXT)
        else:
            audio_out.append(t)
            modality_out.append(LFMModality.AUDIO_OUT)

    decoded_text, audio_path = "", None

    # Decode the accumulated text tokens into a string.
    if text_out:
        tokens = torch.stack(text_out, 1)
        decoded_text = processor.text.decode(tokens)
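
    # Decode the Mimi audio codes into a waveform and save it as a 24 kHz WAV file.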
    if audio_out:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
        with torch.no_grad():
            waveform = processor.mimi.decode(mimi_codes)[0]
        audio_path = "assistant_reply.wav"
        torchaudio.save(audio_path, waveform.cpu(), 24000)
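
    # Record the exchange in the Gradio history and append the assistant's reply to the
    # chat state so the model sees it on the next turn.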
    history.append((text_input or "[Audio Input]", decoded_text or "[Audio Output]"))
    chat.append(
        text=torch.stack(text_out, 1) if text_out else None,
        audio_out=torch.stack(audio_out, 1) if audio_out else None,
        modality_flag=torch.tensor(modality_out),
    )
    chat.end_turn()

    return history, decoded_text, audio_path
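

# Build the Gradio UI: text/audio inputs, chat history, and text/audio outputs.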
with gr.Blocks(title="🎧 LFM2-Audio-1.5B Chat") as demo:
    gr.Markdown("## 🎧 LFM2-Audio-1.5B: Multimodal AI Chatbot")
    gr.Markdown("Chat using **text or voice** and get replies in **text and audio** form.")

    with gr.Row():
        text_inp = gr.Textbox(label="💬 Type your message", placeholder="Say something...")
        # Note: newer Gradio versions (4.x+) expect sources=["microphone"] instead of source="microphone".
        audio_inp = gr.Audio(source="microphone", type="filepath", label="🎙 Record / Upload Audio")

    with gr.Row():
        send_btn = gr.Button("Generate Response", variant="primary")
        reset_btn = gr.Button("🔄 Reset Chat")

    chatbox = gr.Chatbot(label="Conversation History", height=400)
    text_out = gr.Textbox(label="📝 Text Response")
    audio_out = gr.Audio(label="🔊 Audio Response", type="filepath")

    send_btn.click(
        generate_response,
        inputs=[audio_inp, text_inp, chatbox],
        outputs=[chatbox, text_out, audio_out],
    )

    reset_btn.click(reset_chat, outputs=[chatbox, text_out])

demo.queue().launch()