|
import io
import logging

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisionEncoderDecoder
|
|
|
|
|
# Hub identifier of the captioning checkpoint shared by the processor and model.
model_name = "colt12/maxcushion"

# Both objects are created once, at import time, and reused by every call to
# predict(); importing this module therefore triggers the checkpoint load.
processor = AutoProcessor.from_pretrained(model_name)

# NOTE(review): confirm that the installed transformers release actually
# exports `AutoModelForVisionEncoderDecoder`; the documented entry points for
# this architecture are `VisionEncoderDecoderModel` / `AutoModelForVision2Seq`.
model = AutoModelForVisionEncoderDecoder.from_pretrained(model_name)
|
|
|
def predict(image_bytes, *, max_length=50):
    """Generate a caption for an encoded image.

    Args:
        image_bytes: Raw bytes of an image file in any format PIL can open.
        max_length: Upper bound passed to ``model.generate`` as
            ``max_length``. Defaults to 50, the previously hard-coded value.

    Returns:
        The first decoded caption, with special tokens stripped.

    Raises:
        Whatever PIL raises when the bytes cannot be opened as an image, or
        whatever the processor/model raise during preprocessing/generation.
    """
    # Decode inside a context manager so the underlying image file object is
    # released, and normalise to RGB so grayscale, palette and RGBA inputs
    # reach the processor with three channels.
    # NOTE(review): assumes the checkpoint's processor expects RGB input.
    with Image.open(io.BytesIO(image_bytes)) as decoded:
        image = decoded.convert("RGB")

    # Preprocess into PyTorch tensors and keep only the pixel tensor.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    generated_ids = model.generate(pixel_values, max_length=max_length)

    # batch_decode returns one string per generated sequence; a single image
    # was submitted, so the first entry is the caption.
    generated_caption = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    return generated_caption
|
|
|
def run(raw_image):
    """Caption an image and wrap the outcome in a result dictionary.

    Args:
        raw_image: Raw bytes of the image to caption.

    Returns:
        ``{"caption": <text>}`` on success, or ``{"error": <message>}`` when
        captioning raises an exception.

    Raises:
        ValueError: If ``raw_image`` is empty or otherwise falsy. This check
            runs before the ``try`` block, so it propagates to the caller
            instead of being converted into an error dictionary.
    """
    if not raw_image:
        raise ValueError("No image provided")

    try:
        result = predict(raw_image)
    except Exception as e:
        # Boundary handler: the caller still receives the error payload, but
        # the traceback is logged so failures are not silently discarded.
        logging.getLogger(__name__).exception("Caption generation failed")
        return {"error": str(e)}

    return {"caption": result}
|
|