# Example: instance segmentation with Mask2Former (Swin-Small backbone, COCO).
#
# Downloads a sample COCO validation image, runs the model without gradient
# tracking, and post-processes the raw query outputs into a 2-D map holding
# one instance id per pixel.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation

# Load Mask2Former trained on COCO instance segmentation dataset
image_processor = AutoImageProcessor.from_pretrained(
    "facebook/mask2former-swin-small-coco-instance"
)
model = Mask2FormerForUniversalSegmentation.from_pretrained(
    "facebook/mask2former-swin-small-coco-instance"
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting PIL fail
# later with an obscure "cannot identify image file" error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = image_processor(image, return_tensors="pt")

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# Model predicts class_queries_logits of shape
# `(batch_size, num_queries, num_labels + 1)` (the extra label is "no object")
# and masks_queries_logits of shape `(batch_size, num_queries, height, width)`
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits

# Perform post-processing to get instance segmentation map.
# The instance post-processor must be used here: the semantic variant returns
# per-pixel class ids and would merge separate instances of the same class.
# It returns one dict per image with keys "segmentation" (the id map) and
# "segments_info" (per-segment metadata such as label id and score).
# PIL reports size as (width, height); target_sizes expects (height, width).
pred_instance_map = image_processor.post_process_instance_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]["segmentation"]
print(pred_instance_map.shape)