# Page header captured together with this file: "Spaces: Running".
import asyncio
import threading

import gradio as gr
import numpy as np
import scipy
import scipy.io.wavfile
import torch
from diffusers import AudioLDMPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Causal language model and its tokenizer, both loaded from the same
# repository; generate_sound_description() uses them as module-level globals.
_SOUND_MODEL_REPO = "kenooo/multisense-tinyllama-finetune"
model = AutoModelForCausalLM.from_pretrained(_SOUND_MODEL_REPO)
tokenizer = AutoTokenizer.from_pretrained(_SOUND_MODEL_REPO)
# Function to generate sound description
def generate_sound_description(text_input):
    """Generate a short text description of the sound an action makes.

    Wraps ``text_input`` in the instruction prompt, samples a completion from
    the module-level ``model`` and returns the text produced for the
    ``### Sound:`` section.

    Args:
        text_input: Free-text description of the action.

    Returns:
        The stripped description, or the sentinel string
        ``"[Error: No valid sound description returned]"`` when the model
        yields no usable text.
    """
    prompt = (
        "### Instruction:\n"
        "Describe the sound of the following action.\n"
        f"{text_input}\n"
        "### Sound:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.9,
        do_sample=True,
        top_p=0.95,
        repetition_penalty=1.2,
    )

    # The generated sequence starts with the prompt tokens. Decode only the
    # continuation instead of searching the decoded text for "### Sound:":
    # that search breaks if the marker is not reproduced verbatim by the
    # tokenizer round trip, and picks the wrong segment if the model emits
    # another "### Sound:" section of its own.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # The model may go on to open a new "###" section; keep only the answer.
    result = completion.split("###")[0].strip()
    if not result:
        return "[Error: No valid sound description returned]"
    return result
# Text-to-audio pipeline used by generate_audio_from_description(); loaded in
# float32 and placed on the CPU.
pipe = AudioLDMPipeline.from_pretrained(
    "cvssp/audioldm-m-full",
    torch_dtype=torch.float32,
)
pipe = pipe.to("cpu")
def generate_audio_from_description(description, output_path="output.wav"):
    """Synthesize audio for ``description`` and save it as a 16-bit PCM WAV.

    Args:
        description: Text prompt passed to the module-level ``pipe``.
        output_path: Destination file path. Every call using the default
            writes to the same ``output.wav``.

    Returns:
        ``output_path``, so the result can be handed to a file-path consumer.
    """
    audio = pipe(description, num_inference_steps=50).audios[0]

    # Clamp to [-1, 1] before scaling. Without this, any sample that
    # overshoots the nominal range overflows int16 and wraps around to the
    # opposite sign, which is audible as a loud click.
    audio_np = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    # NOTE(review): assumes the pipeline emits 16 kHz audio — confirm against
    # the pipeline's vocoder configuration.
    scipy.io.wavfile.write(output_path, rate=16000, data=audio_np)

    # The original log prefix contained a mis-encoded glyph that could not be
    # recovered, so it is omitted here.
    print(f"[DESCRIPTION]: {description!r}")
    return output_path
# Inference runs in a worker thread (see multisense_pipeline). The model and
# audio pipeline are shared module-level objects, so requests are serialized
# rather than assuming those objects tolerate concurrent calls.
_INFERENCE_LOCK = threading.Lock()


def _run_multisense_blocking(text_input):
    """Run both blocking inference stages for one request, under the lock."""
    with _INFERENCE_LOCK:
        description = generate_sound_description(text_input)
        # generate_sound_description() signals failure with an "[Error: ...]"
        # sentinel string. Do not feed that message to the audio model as if
        # it were a prompt; report it and return no audio instead.
        if description.startswith("[Error:"):
            return description, None
        audio_file = generate_audio_from_description(description)
    return description, audio_file


async def multisense_pipeline(text_input):
    """Turn an action description into a sound description plus audio file.

    The two inference stages are long, synchronous, CPU-bound calls. Running
    them directly inside this coroutine would stall the event loop for the
    whole duration, so they are executed in the default executor instead.

    Args:
        text_input: Free-text description of the action.

    Returns:
        A ``(description, audio_file)`` tuple. ``audio_file`` is the path of
        the written WAV file, or ``None`` when no valid description was
        produced.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run_multisense_blocking, text_input)
# Gradio interface: one text input, two outputs (the generated description
# and the path of the generated audio file), both produced by
# multisense_pipeline.
iface = gr.Interface(
    fn=multisense_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="e.g., Stirring onions in a hot pan"),
    outputs=[
        gr.Textbox(label="Sound Description"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    # The title's leading emoji had been mis-decoded into "π³"; restored to
    # the frying-pan emoji (U+1F373) that those bytes encode.
    title="🍳 Cooking Sound Description Generator",
    description="Enter a cooking action. The model will describe the sound it would make.",
)
iface.launch(share=True)