|
import gradio as gr |
|
import spaces |
|
import torch |
|
from transformers import AutoProcessor, LlavaForConditionalGeneration |
|
|
|
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_id = "llava-hf/llava-1.5-7b-hf"

# Prompt template; `{}` is filled with the user's text by `inference`.
prompt_format = "USER: <image>\n{}\nASSISTANT:"

# Processor used to build model inputs and to decode generated token ids.
processor = AutoProcessor.from_pretrained(model_id)

# Load the weights as float16, then move the model to the CUDA device.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
).cuda()
|
|
|
@spaces.GPU
def inference(text, image):
    """Answer a question about an image with the loaded LLaVA model.

    Args:
        text: The user's question or instruction; substituted into
            ``prompt_format``.
        image: The image supplied by the ``gr.Image`` input component.

    Returns:
        The text generated by the model, without the echoed prompt and with
        surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Fail with a readable UI message instead of an opaque processor error.
    if image is None:
        raise gr.Error("Please provide an image.")

    prompt = prompt_format.format(text)

    # Pass text/images by keyword: the positional order of these two
    # arguments is not the same in every transformers release.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        0, torch.float16
    )
    output = model.generate(**inputs, max_new_tokens=1024)

    # Decode only the tokens produced after the prompt. Splitting the decoded
    # string on "ASSISTANT:" would cut off a reply that contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = output[0][prompt_length:]
    return processor.decode(generated_ids, skip_special_tokens=True).strip()
|
|
|
# Build the UI (text and image inputs, text output) around `inference`
# and start serving it.
demo = gr.Interface(
    fn=inference,
    inputs=[gr.Text(), gr.Image()],
    outputs=gr.Text(),
)
demo.launch()