from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
from io import BytesIO
import base64

# Initialize the processor and model.
# Idefics2 is a vision-language model, so it is loaded with AutoModelForVision2Seq
# and paired with an AutoProcessor (which prepares both the image and text inputs)
# rather than AutoModelForCausalLM + AutoTokenizer.
model_id = "HuggingFaceM4/idefics2-8b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(model_id)

# Check if CUDA (GPU support) is available and set the device to GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def preprocess_image(encoded_image):
    """Decode the base64-encoded input image and convert it to RGB."""
    decoded_image = base64.b64decode(encoded_image)
    img = Image.open(BytesIO(decoded_image)).convert("RGB")
    return img


def handler(event, context):
    """Handle the incoming request."""
    try:
        # Extract the base64-encoded image and question from the event
        input_image = event['body']['image']
        question = event['body'].get('question', "What is this image about?")

        # Preprocess the image
        img = preprocess_image(input_image)

        # Build a chat-style prompt that interleaves the image with the question,
        # using the standard Idefics2 chat template
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Encode the prompt and image together, then run generation
        inputs = processor(text=prompt, images=[img], return_tensors="pt").to(device)
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=256)

        # Keep only the newly generated tokens and decode them into the answer string
        generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
        answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        # Create the response
        response = {
            "statusCode": 200,
            "body": {
                "answer": answer
            }
        }
        return response

    except Exception as e:
        # Handle any errors
        response = {
            "statusCode": 500,
            "body": {
                "error": str(e)
            }
        }
        return response
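

# --- Example local invocation (illustrative sketch, not part of the handler above) ---
# A minimal way to exercise the handler outside of Lambda, assuming a local image
# file named "test.jpg" (placeholder path) exists next to this script.
if __name__ == "__main__":
    with open("test.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")

    # Mock event mirroring the structure the handler expects
    test_event = {
        "body": {
            "image": encoded,
            "question": "What is this image about?",
        }
    }
    print(handler(test_event, None))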