|
import gradio as gr |
|
from transformers import VisionEncoderDecoderModel, AutoTokenizer |
|
from PIL import Image |
|
import io |
|
import torch |
|
|
|
|
|
# Load the checkpoint and its matching tokenizer once at import time so
# every request served by the Gradio app reuses the same weights.
# NOTE(review): "liuhaotian/LLaVA-1.5-7b" is a LLaVA checkpoint, and LLaVA
# is not a VisionEncoderDecoder architecture — confirm this actually loads,
# or switch to the Auto* classes and image processor the model card documents.
model = VisionEncoderDecoderModel.from_pretrained("liuhaotian/LLaVA-1.5-7b")

tokenizer = AutoTokenizer.from_pretrained("liuhaotian/LLaVA-1.5-7b")
|
|
|
|
|
def analyze_image(image_blob):
    """Run the vision-language model on an uploaded image and return its text.

    Parameters
    ----------
    image_blob : bytes or file-like
        Raw encoded image bytes (e.g. a Gradio file upload). A file-like
        object exposing ``read()`` is also accepted.

    Returns
    -------
    str
        The generated text with special tokens stripped.
    """
    # Gradio's "file" input hands a tempfile wrapper rather than bytes;
    # accept either so both callers work.
    data = image_blob.read() if hasattr(image_blob, "read") else image_blob

    # Force RGB so the channel count is fixed regardless of the upload's
    # mode (palette, grayscale, RGBA, ...).
    image = Image.open(io.BytesIO(data)).convert("RGB")

    # BUG FIX: torch.tensor() cannot consume a PIL Image directly. Go via
    # numpy, scale to [0, 1] floats, move channels first (HWC -> CHW), and
    # add a batch dimension — the BCHW float layout pixel_values expects.
    # NOTE(review): the exact resize/normalization this checkpoint needs
    # should come from its image processor — confirm against the model card.
    array = np.asarray(image, dtype=np.float32) / 255.0
    pixel_values = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0)

    prompt = tokenizer("Analyze the emotions in this image", return_tensors="pt")

    outputs = model.generate(**prompt, pixel_values=pixel_values)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
|
# Wire the analysis function into a minimal web UI.
# BUG FIX: the bare "file" shortcut passes a tempfile wrapper to the
# callback; type="binary" makes Gradio deliver the raw bytes that
# analyze_image's io.BytesIO call requires.
iface = gr.Interface(
    fn=analyze_image,
    inputs=gr.File(type="binary"),
    outputs="text",
)

iface.launch()
|
|