File size: 1,771 Bytes
5d906c8
197f38e
 
 
5d906c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197f38e
5d906c8
 
 
 
 
 
197f38e
5d906c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import base64
import json
from io import BytesIO

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the model and tokenizer once at import time (e.g. on a Lambda
# cold start) so every subsequent handler invocation reuses the loaded weights.
model_id = "HuggingFaceM4/idefics2-8b"
# NOTE(review): Idefics2 is a vision-language model. The handler below calls
# model.encode_image() / model.answer_question(), which are not methods of a
# standard AutoModelForCausalLM (they resemble the Moondream checkpoint API).
# Confirm this checkpoint actually exposes them — loading idefics2 normally
# requires AutoModelForVision2Seq plus an AutoProcessor, not a bare tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Check if CUDA (GPU support) is available and then set the device to GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def preprocess_image(encoded_image):
    """Decode a base64-encoded image payload into an RGB PIL image.

    Args:
        encoded_image: Base64-encoded image data (``str`` or ``bytes``).

    Returns:
        A ``PIL.Image.Image`` converted to RGB mode.
    """
    raw_bytes = base64.b64decode(encoded_image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer).convert("RGB")

def handler(event, context):
    """Handle an incoming visual-question-answering request.

    Expects ``event['body']`` to contain a base64-encoded ``image`` and an
    optional ``question``. Returns a dict with ``statusCode`` and ``body``;
    on success the body holds the model's ``answer``, on failure an ``error``
    string with status 500.
    """
    try:
        body = event['body']
        # API Gateway proxy integrations deliver the body as a JSON string;
        # parse it so both raw-dict and string payloads work.
        if isinstance(body, str):
            body = json.loads(body)

        # Extract the base64-encoded image and question from the request body
        input_image = body['image']
        question = body.get('question', "What is this image about?")

        # Preprocess the image
        img = preprocess_image(input_image)

        # Inference only — disable gradient tracking to save memory and time.
        # NOTE(review): encode_image/answer_question are not standard
        # AutoModelForCausalLM methods; confirm the checkpoint provides them.
        with torch.no_grad():
            enc_image = model.encode_image(img).to(device)
            answer = model.answer_question(enc_image, question, tokenizer)

        # If the output is a tensor, move it back to CPU and convert to list
        # so it is JSON-serializable.
        if isinstance(answer, torch.Tensor):
            answer = answer.cpu().numpy().tolist()

        return {
            "statusCode": 200,
            "body": {
                "answer": answer
            }
        }
    except Exception as e:
        # Boundary catch-all: report the failure to the caller instead of
        # letting the runtime surface a raw traceback.
        return {
            "statusCode": 500,
            "body": {
                "error": str(e)
            }
        }