import torch
import torchaudio
import gradio as gr
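# liquid_audio is LiquidAI's companion package for LFM2-Audio; it provides the
# model/processor classes, a persistent ChatState helper, and the modality enum.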
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

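# Use a GPU when available; CPU inference works but is slow for audio generation.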
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load processor and model
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device).eval()

# Persistent chat state: accumulates system/user/assistant turns across calls
# so the conversation context carries over between Gradio callbacks
chat = ChatState(processor)

def reset_chat():
    global chat
    chat = ChatState(processor)
    return [], "Chat reset successfully."

def generate_response(audio_input, text_input, history):
    global chat
    history = history or []  # the Chatbot component may pass None on the first call

    # Initialize system prompt if first turn
    if not history:
        chat.new_turn("system")
        chat.add_text("You are a helpful multimodal AI assistant that can reply with both text and audio.")
        chat.end_turn()

    # New user turn
    chat.new_turn("user")
    if text_input:
        chat.add_text(text_input)
    if audio_input:
        wav, sr = torchaudio.load(audio_input)
        chat.add_audio(wav, sr)
    chat.end_turn()

    # Assistant generation
    chat.new_turn("assistant")

    text_out, audio_out, modality_out = [], [], []

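    # generate_interleaved streams tokens one at a time: single-element tensors are
    # text token ids, larger tensors are Mimi audio-codec frames (one code per codebook).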
    for t in model.generate_interleaved(
        **chat, max_new_tokens=512, audio_temperature=1.0, audio_top_k=4
    ):
        if t.numel() == 1:
            text_out.append(t)
            modality_out.append(LFMModality.TEXT)
        else:
            audio_out.append(t)
            modality_out.append(LFMModality.AUDIO_OUT)

    decoded_text, audio_path = "", None

    # Decode text output
    if text_out:
        tokens = torch.stack(text_out, 1)
        decoded_text = processor.text.decode(tokens)

    # Decode audio output
    if audio_out:
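        # The final frame is assumed to be an end-of-audio marker, so it is dropped
        # before Mimi decodes the remaining codes back into a 24 kHz waveform.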
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
        with torch.no_grad():
            waveform = processor.mimi.decode(mimi_codes)[0]
        audio_path = "assistant_reply.wav"
        torchaudio.save(audio_path, waveform.cpu(), 24000)

    # Add to chat history
    history.append((text_input or "[Audio Input]", decoded_text or "[Audio Output]"))
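    # Feed the generated tokens back into the persistent ChatState so the assistant's
    # reply stays in context for subsequent turns.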
    chat.append(
        text=torch.stack(text_out, 1) if text_out else None,
        audio_out=torch.stack(audio_out, 1) if audio_out else None,
        modality_flag=torch.tensor(modality_out),
    )
    chat.end_turn()

    return history, decoded_text, audio_path


# === Gradio UI ===
with gr.Blocks(title="🎧 LFM2-Audio-1.5B Chat") as demo:
    gr.Markdown("## 🎧 LFM2-Audio-1.5B β€” Multimodal AI Chatbot")
    gr.Markdown("Chat using **text or voice** β€” get replies in **text and audio** form.")

    with gr.Row():
        text_inp = gr.Textbox(label="💬 Type your message", placeholder="Say something...")
        # Gradio 4.x takes a `sources` list instead of the older `source` argument
        audio_inp = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎙 Record / Upload Audio")

    with gr.Row():
        send_btn = gr.Button("Generate Response", variant="primary")
        reset_btn = gr.Button("🔁 Reset Chat")

    chatbox = gr.Chatbot(label="Conversation History", height=400)
    text_out = gr.Textbox(label="📝 Text Response")
    audio_out = gr.Audio(label="🔊 Audio Response", type="filepath")

    send_btn.click(
        generate_response,
        inputs=[audio_inp, text_inp, chatbox],
        outputs=[chatbox, text_out, audio_out],
    )

    reset_btn.click(reset_chat, outputs=[chatbox, text_out])

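# queue() serializes requests so only one generation runs at a time; launch() starts the local Gradio server.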
demo.queue().launch()