Ashoka74 committed on
Commit
9db138f
1 Parent(s): 274c822

Update app_3.py

Files changed (1): app_3.py +231 -0
app_3.py CHANGED
@@ -1303,6 +1303,237 @@ def process_image(input_image, input_text):
 
         return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
     return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
+
+
+@spaces.GPU(duration=60)
+@torch.inference_mode()
+def process_image(input_image, input_text):
+    """Main processing function for the Gradio interface"""
+
+    if isinstance(input_image, Image.Image):
+        input_image = np.array(input_image)
+
+    # Initialize configs
+    API_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720"
+    SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
+    SAM2_MODEL_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs/sam2_hiera_l.yaml")
+    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+    OUTPUT_DIR = Path("outputs/grounded_sam2_dinox_demo")
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    HEIGHT = 768
+    WIDTH = 768
+
+    # Initialize DDS client
+    config = Config(API_TOKEN)
+    client = Client(config)
+
+    # Build class mappings from the period-separated text prompt
+    classes = [x.strip().lower() for x in input_text.split('.') if x]
+    class_name_to_id = {name: id for id, name in enumerate(classes)}
+    class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+    # Save input image to a temp file and upload it to get a URL
+    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
+        cv2.imwrite(tmpfile.name, input_image)
+        image_url = client.upload_file(tmpfile.name)
+    os.remove(tmpfile.name)
+
+    # Accumulators for detection results
+    input_boxes = []
+    masks = []
+    confidences = []
+    class_names = []
+    class_ids = []
+
+    if len(input_text) == 0:
+        task = DinoxTask(
+            image_url=image_url,
+            prompts=[TextPrompt(text="<prompt_free>")],
+            # targets=[DetectionTarget.BBox, DetectionTarget.Mask]
+        )
+
+        client.run_task(task)
+        predictions = task.result.objects
+        classes = [pred.category for pred in predictions]
+        classes = list(set(classes))
+        class_name_to_id = {name: id for id, name in enumerate(classes)}
+        class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+        for idx, obj in enumerate(predictions):
+            input_boxes.append(obj.bbox)
+            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # decode RLE mask to np.array via the DDS API
+            confidences.append(obj.score)
+            cls_name = obj.category.lower().strip()
+            class_names.append(cls_name)
+            class_ids.append(class_name_to_id[cls_name])
+
+        boxes = np.array(input_boxes)
+        masks = np.array(masks)
+        class_ids = np.array(class_ids)
+        labels = [
+            f"{class_name} {confidence:.2f}"
+            for class_name, confidence
+            in zip(class_names, confidences)
+        ]
+        detections = sv.Detections(
+            xyxy=boxes,
+            mask=masks.astype(bool),
+            class_id=class_ids
+        )
+
+        box_annotator = sv.BoxAnnotator()
+        label_annotator = sv.LabelAnnotator()
+        mask_annotator = sv.MaskAnnotator()
+
+        annotated_frame = input_image.copy()
+        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
+        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+
+        # Create a semi-transparent cutout around the first detected object
+        if len(detections) > 0:
+            # Get first mask
+            first_mask = detections.mask[0]
+
+            # Get original RGB image
+            img = input_image.copy()
+            H, W, C = img.shape
+
+            # Build alpha channel: opaque foreground, semi-transparent background
+            alpha = np.zeros((H, W, 1), dtype=np.uint8)
+            alpha[~first_mask] = 128  # Set semi-transparency for background
+            alpha[first_mask] = 255  # Make the foreground opaque
+
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
+
+            # Get the bounding box of the non-transparent region
+            y, x, _ = np.where(alpha > 0)
+            y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
+            x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
+
+            image_center = rgba[y0:y1, x0:x1]
+            # Resize the longer side to 0.9 * target size
+            H, W, _ = image_center.shape
+            if H > W:
+                W = int(W * (HEIGHT * 0.9) / H)
+                H = int(HEIGHT * 0.9)
+            else:
+                H = int(H * (WIDTH * 0.9) / W)
+                W = int(WIDTH * 0.9)
+
+            image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS))
+            # Pad to HEIGHT x WIDTH, then composite over mid-gray using alpha
+            start_h = (HEIGHT - H) // 2
+            start_w = (WIDTH - W) // 2
+            image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8)
+            image[start_h : start_h + H, start_w : start_w + W] = image_center
+            image = image.astype(np.float32) / 255.0
+            image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
+            image = (image * 255).clip(0, 255).astype(np.uint8)
+            image = Image.fromarray(image)
+
+            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
+        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
+    else:
+        # Run DINO-X detection
+        task = DinoxTask(
+            image_url=image_url,
+            prompts=[TextPrompt(text=input_text)],
+            targets=[DetectionTarget.BBox, DetectionTarget.Mask]
+        )
+
+        client.run_task(task)
+        result = task.result
+        objects = result.objects
+
+        predictions = task.result.objects
+        classes = [x.strip().lower() for x in input_text.split('.') if x]
+        class_name_to_id = {name: id for id, name in enumerate(classes)}
+        class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+        boxes = []
+        masks = []
+        confidences = []
+        class_names = []
+        class_ids = []
+
+        for idx, obj in enumerate(predictions):
+            boxes.append(obj.bbox)
+            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # decode RLE mask to np.array via the DDS API
+            confidences.append(obj.score)
+            cls_name = obj.category.lower().strip()
+            class_names.append(cls_name)
+            class_ids.append(class_name_to_id[cls_name])
+
+        boxes = np.array(boxes)
+        masks = np.array(masks)
+        class_ids = np.array(class_ids)
+        labels = [
+            f"{class_name} {confidence:.2f}"
+            for class_name, confidence
+            in zip(class_names, confidences)
+        ]
+
+        detections = sv.Detections(
+            xyxy=boxes,
+            mask=masks.astype(bool),
+            class_id=class_ids,
+        )
+
+        box_annotator = sv.BoxAnnotator()
+        label_annotator = sv.LabelAnnotator()
+        mask_annotator = sv.MaskAnnotator()
+
+        annotated_frame = input_image.copy()
+        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
+        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+
+        # Create a semi-transparent cutout around the first detected object
+        if len(detections) > 0:
+            # Get first mask
+            first_mask = detections.mask[0]
+
+            # Get original RGB image
+            img = input_image.copy()
+            H, W, C = img.shape
+
+            # Build alpha channel: opaque foreground, semi-transparent background
+            alpha = np.zeros((H, W, 1), dtype=np.uint8)
+            alpha[~first_mask] = 128  # Set semi-transparency for background
+            alpha[first_mask] = 255  # Make the foreground opaque
+
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
+
+            # Get the bounding box of the non-transparent region
+            y, x, _ = np.where(alpha > 0)
+            y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
+            x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
+
+            image_center = rgba[y0:y1, x0:x1]
+            # Resize the longer side to 0.9 * target size
+            H, W, _ = image_center.shape
+            if H > W:
+                W = int(W * (HEIGHT * 0.9) / H)
+                H = int(HEIGHT * 0.9)
+            else:
+                H = int(H * (WIDTH * 0.9) / W)
+                W = int(WIDTH * 0.9)
+
+            image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS))
+            # Pad to HEIGHT x WIDTH, then composite over mid-gray using alpha
+            start_h = (HEIGHT - H) // 2
+            start_w = (WIDTH - W) // 2
+            image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8)
+            image[start_h : start_h + H, start_w : start_w + W] = image_center
+            image = image.astype(np.float32) / 255.0
+            image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
+            image = (image * 255).clip(0, 255).astype(np.uint8)
+            image = Image.fromarray(image)
+
+            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
+        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
 
 
 block = gr.Blocks().queue()
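Review note: the new process_image returns four values (the annotated frame, the object cutout, and two gr.update(visible=False) updates), so it has to be bound to four output components on the gr.Blocks app built below this hunk. A minimal wiring sketch follows; the component names (input_img, prompt, annotated, cutout, hint_a, hint_b) are assumptions for illustration and do not appear in this diff.

    # Sketch only: component names are hypothetical; process_image is the
    # function added in this commit.
    import gradio as gr

    block = gr.Blocks().queue()
    with block:
        input_img = gr.Image(type="numpy", label="Input image")
        prompt = gr.Textbox(label="Classes, period-separated (leave empty for prompt-free detection)")
        run_btn = gr.Button("Detect")
        annotated = gr.Image(label="Annotated frame")
        cutout = gr.Image(label="First-object cutout")
        hint_a = gr.Markdown("Upload an image to begin.")  # hidden by gr.update(visible=False)
        hint_b = gr.Markdown("Results appear here.")       # hidden by gr.update(visible=False)

        # Four return values -> four output components, matching the new function.
        run_btn.click(fn=process_image, inputs=[input_img, prompt],
                      outputs=[annotated, cutout, hint_a, hint_b])

    block.launch()

Separately, the commit hardcodes the DDS API_TOKEN in the function body; reading it from an environment variable instead (e.g. os.getenv("DDS_API_TOKEN"), variable name assumed) would keep the credential out of the repository.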