Tobias Cornille committed
Commit a884db1
1 Parent(s): f8636a4
Make more robust
app.py
CHANGED
@@ -107,7 +107,7 @@ def dino_detection(
         visualization = Image.fromarray(annotated_frame)
         return boxes, category_ids, visualization
     else:
-        return boxes, category_ids
+        return boxes, category_ids, phrases
 
 
 def sam_masks_from_dino_boxes(predictor, image_array, boxes, device):
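
Note: before this change, dino_detection returned two values in the no-visualization branch but three in the visualization branch, so any caller unpacking three values crashed when visualize was off. Returning phrases as the third element keeps the arity consistent. A minimal standalone sketch of the failure mode (dummy values, not the app's real signature):

def detect(visualize):
    boxes, ids, phrases = [0], [1], ["cat"]
    if visualize:
        return boxes, ids, "visualization"
    return boxes, ids, phrases  # previously `return boxes, ids`: ValueError on 3-way unpacking

boxes, ids, _ = detect(visualize=False)  # now unpacks safely in both branches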
@@ -153,13 +153,16 @@ def clipseg_segmentation(
     ).to(device)
     with torch.no_grad():
         outputs = model(**inputs)
+    logits = outputs.logits
+    if len(logits.shape) == 2:
+        logits = logits.unsqueeze(0)
     # resize the outputs
-    upscaled_logits = nn.functional.interpolate(
-        outputs.logits.unsqueeze(1),
+    upscaled_logits = nn.functional.interpolate(
+        logits.unsqueeze(1),
         size=(image.size[1], image.size[0]),
         mode="bilinear",
     )
-    preds = torch.sigmoid(upscaled_logits.squeeze())
+    preds = torch.sigmoid(upscaled_logits.squeeze(dim=1))
     semantic_inds = preds_to_semantic_inds(preds, background_threshold)
     return preds, semantic_inds
 
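
Note: with a single category prompt, CLIPSeg's outputs.logits comes back 2-D as (H, W) rather than (N, H, W), which is what the new `len(logits.shape) == 2` guard handles: unsqueeze(0) restores the batch axis, and squeeze(dim=1) later removes only the channel axis instead of every singleton one. A minimal shape sketch with dummy tensors standing in for the CLIPSeg logits (the 352x352 resolution and 480x640 target are assumptions for illustration):

import torch
import torch.nn as nn

logits = torch.randn(352, 352)        # single-prompt case: (H, W)
if len(logits.shape) == 2:
    logits = logits.unsqueeze(0)      # -> (1, H, W)
upscaled = nn.functional.interpolate(
    logits.unsqueeze(1), size=(480, 640), mode="bilinear"
)                                     # -> (1, 1, 480, 640)
preds = torch.sigmoid(upscaled.squeeze(dim=1))
print(preds.shape)  # torch.Size([1, 480, 640]); a bare .squeeze() would give (480, 640)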
@@ -192,7 +195,7 @@ def clip_and_shrink_preds(semantic_inds, preds, shrink_kernel_size, num_categori
         torch.sum(bool_masks[i].int()).item() for i in range(1, bool_masks.size(0))
     ]
     max_size = max(sizes)
-    relative_sizes = [size / max_size for size in sizes]
+    relative_sizes = [size / max_size for size in sizes] if max_size > 0 else sizes
 
     # use bool masks to clip preds
     clipped_preds = torch.zeros_like(preds)
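
Note: max(sizes) is 0 whenever every category mask is empty, and the old comprehension then raised ZeroDivisionError; the guard falls back to the all-zero sizes list. The edge case in isolation:

sizes = [0, 0, 0]
max_size = max(sizes)
relative_sizes = [size / max_size for size in sizes] if max_size > 0 else sizes
print(relative_sizes)  # [0, 0, 0] instead of ZeroDivisionError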
@@ -237,7 +240,7 @@ def upsample_pred(pred, image_source):
     else:
         target_height = int(upsampled_tensor.shape[2] * aspect_ratio)
         upsampled_tensor = upsampled_tensor[:, :, :target_height, :]
-    return upsampled_tensor.squeeze()
+    return upsampled_tensor.squeeze(dim=1)
 
 
 def sam_mask_from_points(predictor, image_array, points):
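
Note: the same squeeze() pitfall as in clipseg_segmentation: for a single-category prediction the tensor is (1, 1, H, W), and a bare squeeze() drops both singleton axes, silently changing the output rank. squeeze(dim=1) drops only the channel axis:

import torch

t = torch.zeros(1, 1, 480, 640)  # (categories, channel, H, W), one category
print(t.squeeze().shape)         # torch.Size([480, 640]) -- category axis lost
print(t.squeeze(dim=1).shape)    # torch.Size([1, 480, 640]) -- rank preserved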
@@ -335,67 +338,82 @@ def generate_panoptic_mask(
     image = image.convert("RGB")
     image_array = np.asarray(image)
 
-    # detect boxes for "thing" categories using Grounding DINO
-    thing_boxes, thing_category_ids = dino_detection(
-        dino_model,
-        image,
-        image_array,
-        thing_category_names,
-        category_name_to_id,
-        dino_box_threshold,
-        dino_text_threshold,
-        device,
-    )
     # compute SAM image embedding
     sam_predictor.set_image(image_array)
-    # get segmentation masks for the thing boxes
-    thing_masks = sam_masks_from_dino_boxes(
-        sam_predictor, image_array, thing_boxes, device
-    )
-    # get rough segmentation masks for "stuff" categories using CLIPSeg
-    clipseg_preds, clipseg_semantic_inds = clipseg_segmentation(
-        clipseg_processor,
-        clipseg_model,
-        image,
-        stuff_category_names,
-        segmentation_background_threshold,
-        device,
-    )
-    # remove things from stuff masks
-    combined_things_mask = torch.any(thing_masks, dim=0)
-    clipseg_semantic_inds_without_things = clipseg_semantic_inds.clone()
-    clipseg_semantic_inds_without_things[combined_things_mask[0]] = 0
-    # clip CLIPSeg preds based on non-overlapping semantic segmentation inds (+ optionally shrink the mask of each category)
-    # also returns the relative size of each category
-    clipsed_clipped_preds, relative_sizes = clip_and_shrink_preds(
-        clipseg_semantic_inds_without_things,
-        clipseg_preds,
-        shrink_kernel_size,
-        len(stuff_category_names) + 1,
-    )
-    # get finer segmentation masks for the "stuff" categories using SAM
-    sam_preds = torch.zeros_like(clipsed_clipped_preds)
-    for i in range(clipsed_clipped_preds.shape[0]):
-        clipseg_pred = clipsed_clipped_preds[i]
-        # for each "stuff" category, sample points in the rough segmentation mask
-        num_samples = int(relative_sizes[i] * num_samples_factor)
-        points = sample_points_based_on_preds(
-            clipseg_pred.cpu().numpy(), num_samples
-        )
-        if len(points) == 0:
-            continue
-        # use SAM to get mask for points
-        pred = sam_mask_from_points(sam_predictor, image_array, points)
-        sam_preds[i] = pred
-    sam_semantic_inds = preds_to_semantic_inds(
-        sam_preds, segmentation_background_threshold
-    )
+
+    # detect boxes for "thing" categories using Grounding DINO
+    thing_category_ids = []
+    thing_masks = []
+    thing_boxes = []
+    if len(thing_category_names) > 0:
+        thing_boxes, thing_category_ids, _ = dino_detection(
+            dino_model,
+            image,
+            image_array,
+            thing_category_names,
+            category_name_to_id,
+            dino_box_threshold,
+            dino_text_threshold,
+            device,
+        )
+        if len(thing_boxes) > 0:
+            # get segmentation masks for the thing boxes
+            thing_masks = sam_masks_from_dino_boxes(
+                sam_predictor, image_array, thing_boxes, device
+            )
+    if len(stuff_category_names) > 0:
+        # get rough segmentation masks for "stuff" categories using CLIPSeg
+        clipseg_preds, clipseg_semantic_inds = clipseg_segmentation(
+            clipseg_processor,
+            clipseg_model,
+            image,
+            stuff_category_names,
+            segmentation_background_threshold,
+            device,
+        )
+        # remove things from stuff masks
+        clipseg_semantic_inds_without_things = clipseg_semantic_inds.clone()
+        if len(thing_boxes) > 0:
+            combined_things_mask = torch.any(thing_masks, dim=0)
+            clipseg_semantic_inds_without_things[combined_things_mask[0]] = 0
+        # clip CLIPSeg preds based on non-overlapping semantic segmentation inds (+ optionally shrink the mask of each category)
+        # also returns the relative size of each category
+        clipsed_clipped_preds, relative_sizes = clip_and_shrink_preds(
+            clipseg_semantic_inds_without_things,
+            clipseg_preds,
+            shrink_kernel_size,
+            len(stuff_category_names) + 1,
+        )
+        # get finer segmentation masks for the "stuff" categories using SAM
+        sam_preds = torch.zeros_like(clipsed_clipped_preds)
+        for i in range(clipsed_clipped_preds.shape[0]):
+            clipseg_pred = clipsed_clipped_preds[i]
+            # for each "stuff" category, sample points in the rough segmentation mask
+            num_samples = int(relative_sizes[i] * num_samples_factor)
+            if num_samples == 0:
+                continue
+            points = sample_points_based_on_preds(
+                clipseg_pred.cpu().numpy(), num_samples
+            )
+            if len(points) == 0:
+                continue
+            # use SAM to get mask for points
+            pred = sam_mask_from_points(sam_predictor, image_array, points)
+            sam_preds[i] = pred
+        sam_semantic_inds = preds_to_semantic_inds(
+            sam_preds, segmentation_background_threshold
+        )
+
     # combine the thing inds and the stuff inds into panoptic inds
-    panoptic_inds = sam_semantic_inds.clone()
+    panoptic_inds = (
+        sam_semantic_inds.clone()
+        if len(stuff_category_names) > 0
+        else torch.zeros(image_array.shape[0], image_array.shape[1], dtype=torch.long)
+    )
     ind = len(stuff_category_names) + 1
     for thing_mask in thing_masks:
         # overlay thing mask on panoptic inds
-        panoptic_inds[thing_mask.squeeze()] = ind
+        panoptic_inds[thing_mask.squeeze(dim=0)] = ind
         ind += 1
 
     segmentation_bitmap, annotations = inds_to_segments_format(
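
Note: generate_panoptic_mask now degrades gracefully: Grounding DINO runs only when thing categories are given, SAM box refinement only when boxes were actually found, the whole CLIPSeg/SAM "stuff" branch only when stuff categories are given, and point sampling is skipped when num_samples rounds to 0. When there are no stuff categories, panoptic_inds starts as an all-background zero tensor. A standalone sketch of that fallback (dummy image array; in app.py the shape comes from the input image):

import numpy as np
import torch

image_array = np.zeros((480, 640, 3), dtype=np.uint8)
panoptic_inds = torch.zeros(
    image_array.shape[0], image_array.shape[1], dtype=torch.long
)  # every pixel starts as background (0); thing masks are overlaid on top
print(panoptic_inds.shape)  # torch.Size([480, 640])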