# Audio-llm / app.py: Gradio chat demo for LiquidAI/LFM2-Audio-1.5B
import torch
import torchaudio
import gradio as gr
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load processor and model
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device).eval()
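# eval() disables training-time behavior such as dropout. On a memory-constrained
# GPU you could additionally try half precision via model.half()
# (assumption: fp16 inference works for this checkpoint; untested here).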
# Persistent chat state
chat = ChatState(processor)
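# ChatState accumulates the full multi-turn token history, so context carries
# across calls to generate_response. Because it is module-global, all Gradio
# sessions share a single conversation.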

def reset_chat():
    global chat
    chat = ChatState(processor)
    return [], "Chat reset successfully."

def generate_response(audio_input, text_input, history):
    global chat
    # Initialize the system prompt on the first turn
    if not history:
        chat.new_turn("system")
        chat.add_text("You are a helpful multimodal AI assistant that can reply with both text and audio.")
        chat.end_turn()
    # New user turn
    chat.new_turn("user")
    if text_input:
        chat.add_text(text_input)
    if audio_input:
        wav, sr = torchaudio.load(audio_input)
        chat.add_audio(wav, sr)
    chat.end_turn()
    # Assistant generation: the model emits an interleaved stream of
    # text tokens and multi-codebook audio frames
    chat.new_turn("assistant")
    text_out, audio_out, modality_out = [], [], []
    for t in model.generate_interleaved(
        **chat, max_new_tokens=512, audio_temperature=1.0, audio_top_k=4
    ):
        if t.numel() == 1:  # single-element tensor -> text token
            text_out.append(t)
            modality_out.append(LFMModality.TEXT)
        else:  # multi-element tensor -> audio code frame
            audio_out.append(t)
            modality_out.append(LFMModality.AUDIO_OUT)
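    # text_out now holds the scalar text tokens and audio_out the Mimi code
    # frames, in the order the model emitted them.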
    decoded_text, audio_path = "", None
    # Decode text output
    if text_out:
        tokens = torch.stack(text_out, 1)
        decoded_text = processor.text.decode(tokens)
    # Decode audio output
    if audio_out:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
        with torch.no_grad():
            waveform = processor.mimi.decode(mimi_codes)[0]
        audio_path = "assistant_reply.wav"
        torchaudio.save(audio_path, waveform.cpu(), 24000)
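    # The trailing audio frame is dropped ([:-1]) before decoding; 24000 is
    # the Mimi codec's output sample rate. Each turn overwrites
    # assistant_reply.wav in the working directory.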
    # Append the exchange to the UI history and to the persistent chat state
    history.append((text_input or "[Audio Input]", decoded_text or "[Audio Output]"))
    chat.append(
        text=torch.stack(text_out, 1) if text_out else None,
        audio_out=torch.stack(audio_out, 1) if audio_out else None,
        modality_flag=torch.tensor(modality_out),
    )
    chat.end_turn()
    return history, decoded_text, audio_path
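
# Optional smoke test without the UI (hypothetical usage, text-only turn):
#   history, reply, wav_path = generate_response(None, "Hello!", [])
#   print(reply, wav_path)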

# === Gradio UI ===
with gr.Blocks(title="🎧 LFM2-Audio-1.5B Chat") as demo:
    gr.Markdown("## 🎧 LFM2-Audio-1.5B: Multimodal AI Chatbot")
    gr.Markdown("Chat using **text or voice** and get replies in **text and audio** form.")
    with gr.Row():
        text_inp = gr.Textbox(label="💬 Type your message", placeholder="Say something...")
        # Gradio 4+ takes a list of sources (older releases used source="microphone")
        audio_inp = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎙 Record / Upload Audio")
    with gr.Row():
        send_btn = gr.Button("Generate Response", variant="primary")
        reset_btn = gr.Button("🔁 Reset Chat")
    chatbox = gr.Chatbot(label="Conversation History", height=400)
    text_out = gr.Textbox(label="📝 Text Response")
    audio_out = gr.Audio(label="🔊 Audio Response", type="filepath")
    send_btn.click(
        generate_response,
        inputs=[audio_inp, text_inp, chatbox],
        outputs=[chatbox, text_out, audio_out],
    )
    reset_btn.click(reset_chat, outputs=[chatbox, text_out])
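
# queue() makes requests wait their turn (Gradio defaults to one concurrent
# event), which also protects the shared, module-global ChatState above.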
demo.queue().launch()