# Phi-4 Multimodal Demo: a Hugging Face Space running on ZeroGPU
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
import io
from transformers import AutoModelForCausalLM, AutoProcessor
from urllib.request import urlopen
import spaces
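# `spaces` is Hugging Face's helper package for ZeroGPU Spaces; its
# @spaces.GPU decorator (used below) attaches a GPU only while the
# decorated function is running.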
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
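# Note: "eager" attention is the plain PyTorch implementation; it avoids a
# hard dependency on flash-attn, at some cost in speed.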
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
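# Assembled, an image prompt looks like:
# <|user|><|image_1|>What is shown in this image?<|end|><|assistant|>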
# Define inference function
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."
    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open the image from an uploaded file or a URL (the examples pass URLs)
        if isinstance(file, str) and file.startswith("http"):
            image = Image.open(urlopen(file))
        else:
            image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read the audio from an uploaded file or a URL
        # (soundfile needs a seekable object, hence the BytesIO wrapper)
        if isinstance(file, str) and file.startswith("http"):
            audio, samplerate = sf.read(io.BytesIO(urlopen(file).read()))
        else:
            audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."
    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens, keeping only the newly generated ones
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with Microsoft's `microsoft/Phi-4-multimodal-instruct` model.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Model Response",
                placeholder="Response will appear here...",
                lines=10,
                interactive=False,
            )
    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=output_text,
            fn=process_input,
            cache_examples=False,  # don't pre-run the examples at startup
        )
    # Connect the submit button
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )
# Launch the demo
demo.launch()
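# To try this locally (a rough sketch; exact dependencies and versions may vary):
#   pip install gradio spaces torch transformers pillow soundfile
#   python app.py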