# phi4_mm/app.py
import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
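
# Load the processor and base model once at startup; trust_remote_code is
# needed because Phi-4-multimodal ships custom modeling and processing code.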
MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
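
# Attach the speech LoRA adapter that lives in the "speech-lora" subfolder of
# the model repo, then make it the active adapter for speech tasks.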
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")
generation_config = GenerationConfig.from_pretrained(MODEL_ID)
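
# On Hugging Face Spaces, @spaces.GPU allocates a GPU (ZeroGPU) for the
# duration of each call to the decorated function.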
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    if not audio_path:
        return "Please upload an audio file."
    audio, samplerate = sf.read(audio_path)
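    # Phi-4 chat format: <|audio_1|> marks where the first audio clip is
    # injected into the user turn.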
    user_prompt = "<|user|>"
    assistant_prompt = "<|assistant|>"
    prompt_suffix = "<|end|>"
    prompt = f"{user_prompt}<|audio_1|>{instruction}{prompt_suffix}{assistant_prompt}"
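    # The processor tokenizes the prompt and extracts audio features in one pass.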
    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )
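    # Drop the prompt tokens so only the newly generated text is decoded.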
    output_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
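
# Gradio UI: audio upload and instruction on the left, model response on the right.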
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)
    submit_btn.click(run_phi4, [audio_input, instruction], output_text)

if __name__ == "__main__":
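    # queue() enables request queuing; ssr_mode=False disables Gradio's
    # server-side rendering.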
    demo.queue().launch(share=False, ssr_mode=False)