# multisense-app / app.py
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import scipy.io.wavfile
import numpy as np
from diffusers import AudioLDMPipeline

# Load the fine-tuned TinyLlama model and its tokenizer
model = AutoModelForCausalLM.from_pretrained("kenooo/multisense-tinyllama-finetune")
tokenizer = AutoTokenizer.from_pretrained("kenooo/multisense-tinyllama-finetune")
# Generate a textual sound description for a given action
def generate_sound_description(text_input):
    prompt = f"### Instruction:\nDescribe the sound of the following action.\n{text_input}\n### Sound:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.9,
        do_sample=True,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the "### Sound:" marker, up to the next section
    result = decoded.split("### Sound:")[-1].split("###")[0].strip()
    if not result:
        return "[Error: No valid sound description returned]"
    return result
# Text-to-audio pipeline (AudioLDM), running in float32 on CPU
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch.float32).to("cpu")
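# Optional sketch (an assumption about the deployment environment, not part of
# the original app): on a machine with a CUDA GPU, loading the pipeline in half
# precision roughly halves memory use and speeds up synthesis.
# if torch.cuda.is_available():
#     pipe = AudioLDMPipeline.from_pretrained(
#         "cvssp/audioldm-m-full", torch_dtype=torch.float16
#     ).to("cuda")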
# Synthesize a waveform from the description and save it as a 16 kHz WAV file
def generate_audio_from_description(description, output_path="output.wav"):
    audio = pipe(description, num_inference_steps=50).audios[0]
    # AudioLDM returns float audio in [-1, 1]; convert to 16-bit PCM
    audio_np = (audio * 32767).astype(np.int16)
    scipy.io.wavfile.write(output_path, rate=16000, data=audio_np)
    print(f"[📝 DESCRIPTION]: {repr(description)}")
    return output_path
# Full pipeline: cooking action -> sound description -> generated audio file
# (plain def: nothing here is awaited, so async added no benefit)
def multisense_pipeline(text_input):
    description = generate_sound_description(text_input)
    audio_file = generate_audio_from_description(description)
    return description, audio_file
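# Quick local smoke test (a hypothetical usage example, not part of the app;
# the prompt string is illustrative):
# description, path = multisense_pipeline("Chopping carrots on a wooden board")
# print(description, path)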
# Gradio interface
iface = gr.Interface(
    fn=multisense_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="e.g., Stirring onions in a hot pan"),
    outputs=[
        gr.Textbox(label="Sound Description"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    title="🍳 Cooking Sound Description Generator",
    description="Enter a cooking action. The model will describe the sound it would make.",
)

if __name__ == "__main__":
    iface.launch(share=True)