maxcushion / app.py
colt12's picture
Create app.py
46e68f0 verified
raw
history blame
1.07 kB
import io
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVisionEncoderDecoder
# Load the model and processor once, at import time, so that every call to
# predict() reuses the same objects instead of reloading them per request.
# `model_name` is passed unchanged to both from_pretrained() calls
# (presumably a Hugging Face Hub repo id — verify).
# NOTE(review): `AutoModelForVisionEncoderDecoder` does not look like a class
# exported by transformers (the documented ones are `VisionEncoderDecoderModel`
# and `AutoModelForVision2Seq`) — confirm the import at the top resolves.
model_name = "colt12/maxcushion"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVisionEncoderDecoder.from_pretrained(model_name)
def predict(image_bytes: bytes, *, max_length: int = 50) -> str:
    """Generate a caption for an encoded image.

    Args:
        image_bytes: Raw bytes of an image file in a format PIL can open.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 50, the value that used to be hard-coded here.

    Returns:
        The first caption decoded from the generated ids, with special
        tokens skipped.

    Raises:
        Whatever ``Image.open``, the processor or the model raise; nothing
        is caught here (``run`` is the caller that converts errors).
    """
    # Open inside a context manager so PIL releases the decoder as soon as the
    # pixel data has been copied out by convert().
    with Image.open(io.BytesIO(image_bytes)) as source:
        # Normalise to three-channel RGB so grayscale, palette and RGBA
        # uploads are handled the same way as ordinary photos.
        # NOTE(review): assumes the checkpoint's processor expects RGB input
        # (the usual case for vision encoders) — confirm.
        image = source.convert("RGB")

    # Preprocess the image into the tensor the model consumes.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Generate token ids and decode them; only one image was supplied, so the
    # first decoded string is the caption.
    generated_ids = model.generate(pixel_values, max_length=max_length)
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0]
def run(raw_image):
    """Caption an image and wrap the outcome in a response dict.

    Args:
        raw_image: The encoded image payload handed on to ``predict``.

    Returns:
        ``{"caption": <text>}`` when captioning succeeds, or
        ``{"error": <message>}`` when ``predict`` raises any ``Exception``.

    Raises:
        ValueError: If ``raw_image`` is falsy (e.g. ``None`` or empty bytes).
            This check runs before the try block, so it is not converted
            into an error dict.
    """
    # Guard clause: refuse missing/empty payloads up front.
    if not raw_image:
        raise ValueError("No image provided")

    try:
        caption = predict(raw_image)
    except Exception as exc:
        # Report the failure to the caller as data instead of propagating it.
        response = {"error": str(exc)}
    else:
        response = {"caption": caption}
    return response