import io

import gradio as gr
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    LlavaForConditionalGeneration,
    VisionEncoderDecoderModel,
)


# Hugging Face port of LLaVA-1.5. The checkpoint id used previously is not a
# repository that `transformers` can load, and the loader classes it was
# paired with do not exist in the library.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# LLaVA-1.5 chat format: the "<image>" placeholder marks where the processor
# splices the image features into the text prompt.
PROMPT = "USER: <image>\nAnalyze the emotions in this image ASSISTANT:"

# Explicit cap on the answer length so generation does not depend on the
# library's default length limit.
MAX_NEW_TOKENS = 200

model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Kept so that code referring to the module-level `tokenizer` keeps working.
tokenizer = processor.tokenizer


def _load_image(image_blob):
    """Return *image_blob* as an RGB ``PIL.Image.Image``.

    Accepts an already-open PIL image, raw encoded bytes, or anything
    ``Image.open`` understands (a filesystem path or a binary file object).
    """
    if isinstance(image_blob, Image.Image):
        return image_blob.convert("RGB")
    if isinstance(image_blob, (bytes, bytearray)):
        image_blob = io.BytesIO(image_blob)
    # `convert` loads the pixel data and returns a new image, so the
    # underlying file can be closed as soon as the `with` block exits.
    with Image.open(image_blob) as opened:
        return opened.convert("RGB")


def analyze_image(image_blob):
    """Describe the emotions visible in an uploaded image.

    Args:
        image_blob: The uploaded image. Normally the raw bytes of the file
            (what the interface below delivers); a path, binary file object
            or PIL image is accepted as well.

    Returns:
        The model's answer as plain text, without the prompt or special
        tokens.

    Raises:
        ValueError: If no image was supplied.
    """
    if image_blob is None:
        raise ValueError("No image provided")

    image = _load_image(image_blob)
    # The processor tokenizes the prompt and preprocesses the image together;
    # the model needs both in the same batch of inputs.
    inputs = processor(images=image, text=PROMPT, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # `generate` returns the prompt tokens followed by the answer; drop the
    # prompt so only newly generated text is returned.
    prompt_length = inputs["input_ids"].shape[1]
    answer_ids = outputs[0][prompt_length:]
    return processor.decode(answer_ids, skip_special_tokens=True).strip()


# type="binary" makes Gradio hand the upload to `analyze_image` as bytes; the
# bare "file" shortcut passes a path or temp-file wrapper instead.
iface = gr.Interface(
    fn=analyze_image,
    inputs=gr.File(type="binary"),
    outputs="text",
)

iface.launch()