|
import gradio as gr |
|
import soundfile as sf |
|
from PIL import Image |
|
import spaces |
|
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig |
|
|
|
|
|
model_path = "microsoft/Phi-4-multimodal-instruct" |
|
|
|
|
|
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) |
|
model = AutoModelForCausalLM.from_pretrained( |
|
model_path, |
|
device_map="auto", |
|
torch_dtype="auto", |
|
trust_remote_code=True, |
|
attn_implementation='eager', |
|
) |
|
|
|
generation_config = GenerationConfig.from_pretrained(model_path) |
|
|
|
|
|
user_prompt = '<|user|>' |
|
assistant_prompt = '<|assistant|>' |
|
prompt_suffix = '<|end|>' |
|
|
|
@spaces.GPU |
|
def process_multimodal(input_file, query): |
|
if input_file is None: |
|
return "Please upload an image or an audio file." |
|
|
|
file_type = input_file.type |
|
prompt = f"{user_prompt}<|media_1|>{query}{prompt_suffix}{assistant_prompt}" |
|
|
|
if "image" in file_type: |
|
image = Image.open(input_file) |
|
inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0') |
|
elif "audio" in file_type: |
|
audio, samplerate = sf.read(input_file.name) |
|
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0') |
|
else: |
|
return "Unsupported file format. Please upload an image or audio file." |
|
|
|
generate_ids = model.generate( |
|
**inputs, |
|
max_new_tokens=1000, |
|
generation_config=generation_config, |
|
num_logits_to_keep=0, |
|
) |
|
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] |
|
response = processor.batch_decode( |
|
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
|
)[0] |
|
|
|
return response |
|
|
|
with gr.Blocks(theme=gr.themes.Soft()) as demo: |
|
gr.Markdown(""" |
|
# Phi-4 Multimodal Chat |
|
Upload an image or an audio file and ask questions related to it! |
|
""") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
input_file = gr.File(label="Upload Image or Audio") |
|
query = gr.Textbox(label="Ask a question") |
|
submit_btn = gr.Button("Submit") |
|
|
|
with gr.Column(): |
|
output = gr.Textbox(label="Response", interactive=False) |
|
|
|
submit_btn.click(process_multimodal, inputs=[input_file, query], outputs=output) |
|
|
|
demo.launch() |
|
|