pablovela5620 committed
Commit f0d4b35
1 Parent(s): 48f587e

initial working demo

Files changed (4)
  1. .gitignore +2 -1
  2. gradio_demo.py +97 -1
  3. main.py +178 -0
  4. models.py +234 -0
.gitignore CHANGED
@@ -159,4 +159,5 @@ cython_debug/
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
  #.idea/
  .vscode/*
- static/*
+ static/*
+ model/*
gradio_demo.py CHANGED
@@ -4,10 +4,27 @@ from fastapi.staticfiles import StaticFiles
  import uvicorn
  import gradio as gr
  from datetime import datetime
+ from typing import Union, List
+
+ import cv2
+ import torch
+ from main import grounding_dino_detect
+ from models import (
+     run_segmentation,
+     resize_img,
+     load_image,
+     get_downloaded_model_path,
+     load_grounding_model,
+     create_sam,
+     CONFIG_PATH,
+ )
+ from segment_anything import SamPredictor
+ from segment_anything.modeling import Sam
+ from groundingdino.models import GroundingDINO

  import rerun as rr

- rr.init("cube")
+ rr.init("GroundingSAM")

  # create a FastAPI app
  app = FastAPI()
@@ -20,12 +37,91 @@ static_dir.mkdir(parents=True, exist_ok=True)
  app.mount("/static", StaticFiles(directory=static_dir), name="static")


+ def log_video_segmentation(
+     video_path: Path,
+     prompt: str,
+     model: GroundingDINO,
+     predictor: Sam,
+     device: str = "cpu",
+ ):
+     assert video_path.exists()
+     cap = cv2.VideoCapture(str(video_path))
+
+     idx = 0
+     while cap.isOpened():
+         ret, bgr = cap.read()
+         if not ret or idx > 20:
+             break
+         rr.set_time_sequence("frame", idx)
+         rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+         rgb = resize_img(rgb, 512)
+         rr.log_image("image", rgb)
+
+         detections, phrases, id_from_phrase = grounding_dino_detect(
+             model, device, rgb, prompt
+         )
+
+         predictor.set_image(rgb)
+         run_segmentation(predictor, rgb, detections, phrases, id_from_phrase)
+
+         idx += 1
+
+
+ def log_images_segmentation(
+     images: list[Union[str, Path]],
+     prompt: str,
+     model: GroundingDINO,
+     predictor: Sam,
+     device: str = "cpu",
+ ):
+     for n, image_uri in enumerate(images):
+         rr.set_time_sequence("image", n)
+         image = load_image(image_uri)
+         rr.log_image("image", image)
+
+         detections, phrases, id_from_phrase = grounding_dino_detect(
+             model, device, image, prompt
+         )
+
+         predictor.set_image(image)
+         run_segmentation(predictor, image, detections, phrases, id_from_phrase)
+
+
  # Gradio stuff
  def predict():
      file_name = f"{datetime.utcnow().strftime('%s')}.html"
      file_path = static_dir / file_name
      rec = rr.memory_recording()

+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # load model
+     grounded_checkpoint = get_downloaded_model_path("grounding")
+     model = load_grounding_model(CONFIG_PATH, grounded_checkpoint, device=device)
+     sam = create_sam("vit_b", device)
+
+     predictor = SamPredictor(sam)
+
+     # prompt = "tires"
+
+     log_video_segmentation(
+         Path("dog_and_woman.mp4"),
+         "dog, woman",
+         model,
+         predictor,
+         device=device,
+     )
+
+     # log_images_segmentation(
+     #     [
+     #         "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg"
+     #     ],
+     #     prompt,
+     #     model,
+     #     predictor,
+     #     device=device,
+     # )
+
      with open(file_path, "w") as f:
          f.write(rec.as_html())
      iframe = f"""<iframe src="/static/{file_name}" width="950" height="712"></iframe>"""
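Note that the hunk above ends partway through predict(); the Gradio UI wiring at the bottom of gradio_demo.py is unchanged by this commit and therefore not displayed. Purely for orientation, here is a minimal sketch of how an iframe-returning predict() is typically hooked up to the same FastAPI app that serves /static. The names app, predict, gr, and uvicorn come from the file above; everything else is an assumption, and the actual wiring in this Space may differ.

# --- sketch only, not part of this commit ---
with gr.Blocks() as demo:
    button = gr.Button("Run GroundingSAM")
    html = gr.HTML()
    # predict() writes the Rerun recording to static/<timestamp>.html and
    # returns an <iframe> string pointing at /static/<timestamp>.html
    button.click(fn=predict, inputs=[], outputs=[html])

# serve the Gradio UI from the FastAPI app that already mounts /static
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)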
main.py ADDED
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python3
+ """
+ Example of using Rerun to log and visualize the output of Grounded DINO + Segment Anything.
+
+ See: [segment_anything](https://github.com/IDEA-Research/Grounded-Segment-Anything).
+
+ Can be used to test mask-generation on one or more images, as well as videos. Images can be local file-paths
+ or remote urls. Videos must be local file-paths. Can use multiple prompts.
+ """
+
+
+ import argparse
+ import logging
+ import rerun as rr
+ import torch
+ import cv2
+ from pathlib import Path
+ from models import CONFIG_PATH, MODEL_URLS, get_downloaded_model_path
+ from models import load_grounding_model, create_sam, load_image, image_to_tensor
+ from models import get_grounding_output, run_segmentation, resize_img
+ from segment_anything import SamPredictor
+ from segment_anything.modeling import Sam
+ from groundingdino.models import GroundingDINO
+
+
+ def log_images_segmentation(args, model: GroundingDINO, predictor: Sam):
+     for n, image_uri in enumerate(args.images):
+         rr.set_time_sequence("image", n)
+         image = load_image(image_uri)
+         rr.log_image("image", image)
+
+         detections, phrases, id_from_phrase = grounding_dino_detect(
+             model, args.device, image, args.prompt
+         )
+
+         predictor.set_image(image)
+         run_segmentation(predictor, image, detections, phrases, id_from_phrase)
+
+
+ def grounding_dino_detect(model, device, image, prompt):
+     image_tensor = image_to_tensor(image)
+     logging.info(f"Running GroundedDINO with DETECTION PROMPT {prompt}.")
+     boxes_filt, box_phrases = get_grounding_output(
+         model, image_tensor, prompt, 0.3, 0.25, device=device
+     )
+     logging.info(f"Grounded output with prediction phrases: {box_phrases}")
+
+     # denormalize boxes (from [0, 1] to image size)
+     H, W, _ = image.shape
+     for i in range(boxes_filt.size(0)):
+         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+         boxes_filt[i][2:] += boxes_filt[i][:2]
+
+     id_from_phrase = {phrase: i for i, phrase in enumerate(set(box_phrases), start=1)}
+     box_ids = [id_from_phrase[phrase] for phrase in box_phrases]  # One mask per box
+
+     # Make sure we have an AnnotationInfo present for every class-id used in this image
+     rr.log_annotation_context(
+         "image",
+         [
+             rr.AnnotationInfo(id=id, label=phrase)
+             for phrase, id in id_from_phrase.items()
+         ],
+         timeless=False,
+     )
+
+     rr.log_rects(
+         "image/detections",
+         rects=boxes_filt.numpy(),
+         class_ids=box_ids,
+         rect_format=rr.RectFormat.XYXY,
+     )
+
+     return boxes_filt, box_phrases, id_from_phrase
+
+
+ def log_video_segmentation(args, model: GroundingDINO, predictor: Sam):
+     video_path = args.video_path
+     assert video_path.exists()
+     cap = cv2.VideoCapture(str(video_path))
+
+     idx = 0
+     while cap.isOpened():
+         ret, bgr = cap.read()
+         if not ret:
+             break
+         rr.set_time_sequence("frame", idx)
+         rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+         rgb = resize_img(rgb, 512)
+         rr.log_image("image", rgb)
+
+         detections, phrases, id_from_phrase = grounding_dino_detect(
+             model, args.device, rgb, args.prompt
+         )
+
+         predictor.set_image(rgb)
+         run_segmentation(predictor, rgb, detections, phrases, id_from_phrase)
+
+         idx += 1
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="Run IDEA Research Grounded Dino + SAM example.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+     parser.add_argument(
+         "--model",
+         action="store",
+         default="vit_b",
+         choices=MODEL_URLS.keys(),
+         help="Which model to use."
+         "(See: https://github.com/facebookresearch/segment-anything#model-checkpoints)",
+     )
+     parser.add_argument(
+         "--device",
+         action="store",
+         default="cpu",
+         help="Which torch device to use, e.g. cpu or cuda. "
+         "(See: https://pytorch.org/docs/stable/tensor_attributes.html#torch.device)",
+     )
+
+     parser.add_argument(
+         "--prompt",
+         default="tires and windows",
+         type=str,
+         help="List of prompts to use for bounding box detection.",
+     )
+
+     parser.add_argument(
+         "images", metavar="N", type=str, nargs="*", help="A list of images to process."
+     )
+
+     parser.add_argument(
+         "--bbox-threshold",
+         default=0.3,
+         type=float,
+         help="Threshold for a bounding box to be considered.",
+     )
+
+     parser.add_argument(
+         "--video-path",
+         default=None,
+         type=Path,
+         help="Path to video to run segmentation on",
+     )
+
+     rr.script_add_args(parser)
+     args = parser.parse_args()
+
+     rr.script_setup(args, "grounded_sam")
+     logging.getLogger().addHandler(rr.LoggingHandler("logs"))
+     logging.getLogger().setLevel(logging.INFO)
+
+     # load model
+     grounded_checkpoint = get_downloaded_model_path("grounding")
+     model = load_grounding_model(CONFIG_PATH, grounded_checkpoint, device=args.device)
+     sam = create_sam(args.model, args.device)
+
+     predictor = SamPredictor(sam)
+
+     if len(args.images) == 0 and args.video_path is None:
+         logging.info("No image provided. Using default.")
+         args.images = [
+             "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg"
+         ]
+
+     if len(args.images) > 0:
+         log_images_segmentation(args, model, predictor)
+     elif args.video_path is not None:
+         log_video_segmentation(args, model, predictor)
+
+     rr.script_teardown(args)
+
+
+ if __name__ == "__main__":
+     main()
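For reference, main.py as added above can be run from the command line with the flags it defines. For example (the truck.jpg URL is the script's built-in default image; dog_and_woman.mp4 is assumed to be a local file, as used in gradio_demo.py):

    python main.py --prompt "tires and windows" https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg
    python main.py --device cuda --prompt "dog, woman" --video-path dog_and_woman.mp4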
models.py ADDED
@@ -0,0 +1,234 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Final, List, Mapping
+ from urllib.parse import urlparse
+
+ import cv2
+ from PIL import Image
+ import numpy as np
+ import requests
+ import rerun as rr
+ import torch
+ import torchvision
+ from cv2 import Mat
+ from segment_anything import SamPredictor, sam_model_registry
+ from segment_anything.modeling import Sam
+ from tqdm import tqdm
+
+ # Grounding DINO
+ import GroundingDINO.groundingdino.datasets.transforms as T
+ from GroundingDINO.groundingdino.models import build_model
+ from GroundingDINO.groundingdino.util.slconfig import SLConfig
+ from GroundingDINO.groundingdino.util.utils import (
+     clean_state_dict,
+     get_phrases_from_posmap,
+ )
+ from groundingdino.models import GroundingDINO
+
+
+ CONFIG_PATH: Final = (
+     Path(os.path.dirname(__file__))
+     / "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
+ )
+ MODEL_DIR: Final = Path(os.path.dirname(__file__)) / "model"
+ MODEL_URLS: Final = {
+     "vit_h": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+     "vit_l": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
+     "vit_b": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
+     "grounding": "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth",
+ }
+
+
+ def download_with_progress(url: str, dest: Path) -> None:
+     """Download file with tqdm progress bar."""
+     chunk_size = 1024 * 1024
+     resp = requests.get(url, stream=True)
+     total_size = int(resp.headers.get("content-length", 0))
+     with open(dest, "wb") as dest_file:
+         with tqdm(
+             desc="Downloading model",
+             total=total_size,
+             unit="iB",
+             unit_scale=True,
+             unit_divisor=1024,
+         ) as progress:
+             for data in resp.iter_content(chunk_size):
+                 dest_file.write(data)
+                 progress.update(len(data))
+
+
+ def get_downloaded_model_path(model_name: str) -> Path:
+     """Fetch the segment-anything model to a local cache directory."""
+     model_url = MODEL_URLS[model_name]
+
+     model_location = MODEL_DIR / model_url.split("/")[-1]
+     if not model_location.exists():
+         os.makedirs(MODEL_DIR, exist_ok=True)
+         download_with_progress(model_url, model_location)
+
+     return model_location
+
+
+ def create_sam(model: str, device: str) -> Sam:
+     """Load the segment-anything model, fetching the model-file as necessary."""
+     model_path = get_downloaded_model_path(model)
+
+     logging.info("PyTorch version: {}".format(torch.__version__))
+     logging.info("Torchvision version: {}".format(torchvision.__version__))
+     logging.info("CUDA is available: {}".format(torch.cuda.is_available()))
+
+     logging.info("Building sam from: {}".format(model_path))
+     sam = sam_model_registry[model](checkpoint=model_path)
+     return sam.to(device=device)
+
+
+ def run_segmentation(
+     predictor: SamPredictor,
+     image: Mat,
+     detections,
+     phrases: List[str],
+     id_from_phrase: Mapping[str, int],
+ ) -> None:
+     """Run segmentation on a single image."""
+     if detections.shape[0] == 0:
+         return
+     logging.info("Finding masks")
+     transformed_boxes = predictor.transform.apply_boxes_torch(
+         detections, image.shape[:2]
+     )
+
+     masks, _, _ = predictor.predict_torch(
+         point_coords=None,
+         point_labels=None,
+         boxes=transformed_boxes.to(predictor.device),
+         multimask_output=False,
+     )
+
+     logging.info("Found {} masks".format(len(masks)))
+
+     # Layer all of the masks that belong to a single phrase together
+     segmentation_img = np.zeros((image.shape[0], image.shape[1]))
+     for phrase, mask in zip(phrases, masks):
+         segmentation_img[mask.squeeze()] = id_from_phrase[phrase]
+
+     rr.log_segmentation_image("image/segmentation", segmentation_img)
+
+
+ def is_url(path: str) -> bool:
+     """Check if a path is a url or a local file."""
+     try:
+         result = urlparse(path)
+         return all([result.scheme, result.netloc])
+     except ValueError:
+         return False
+
+
+ def resize_img(img: Mat, max_dimension: int = 512) -> Mat:
+     height, width = img.shape[:2]
+     # Check if either dimension is larger than the maximum
+     if max(height, width) > max_dimension:
+         # Calculate the new dimensions while maintaining the aspect ratio
+         if height > width:
+             new_height = max_dimension
+             new_width = int((new_height * width) / height)
+         else:
+             new_width = max_dimension
+             new_height = int((new_width * height) / width)
+
+         # Resize the image
+         resized_image = cv2.resize(
+             img, (new_width, new_height), interpolation=cv2.INTER_AREA
+         )
+         return resized_image
+     # Image is already within the size limit; return it unchanged
+     return img
+
+
+ def image_to_tensor(image: Mat) -> torch.Tensor:
+     """
+     Assumes an RGB OpenCV image; this is required for the DINO model.
+     """
+     image_pil = Image.fromarray(image)
+     transform = T.Compose(
+         [
+             T.RandomResize([800], max_size=1333),
+             T.ToTensor(),
+             T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+         ]
+     )
+     image_tensor, _ = transform(image_pil, None)  # 3, h, w
+     return image_tensor
+
+
+ def load_image(image_uri: str) -> Mat:
+     """Conditionally download an image from URL or load it from disk."""
+     logging.info("Loading: {}".format(image_uri))
+     if is_url(image_uri):
+         response = requests.get(image_uri)
+         response.raise_for_status()
+         image_data = np.asarray(bytearray(response.content), dtype="uint8")
+         image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
+     else:
+         image = cv2.imread(image_uri, cv2.IMREAD_COLOR)
+
+     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+     return image
+
+
+ def load_grounding_model(
+     model_config_path: Path, model_checkpoint_path: Path, device: str
+ ) -> GroundingDINO:
+     args = SLConfig.fromfile(model_config_path)
+     args.device = device
+     model = build_model(args)
+     checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+     _ = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+     _ = model.eval()
+     return model
+
+
+ def get_grounding_output(
+     model: GroundingDINO,
+     image: torch.Tensor,
+     caption: str,
+     box_threshold: float,
+     text_threshold: float,
+     with_logits: bool = False,
+     device: str = "cpu",
+ ):
+     caption = caption.lower()
+     caption = caption.strip()
+     if not caption.endswith("."):
+         caption = caption + "."
+     model = model.to(device)
+     image = image.to(device)
+     with torch.no_grad():
+         outputs = model(image[None], captions=[caption])
+     logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+     boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+
+     # filter output
+     logits_filt = logits.clone()
+     boxes_filt = boxes.clone()
+     filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+     logits_filt = logits_filt[filt_mask]  # num_filt, 256
+     boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
+
+     # get phrase
+     tokenlizer = model.tokenizer
+     tokenized = tokenlizer(caption)
+     # build pred
+     pred_phrases = []
+     for logit, box in zip(logits_filt, boxes_filt):
+         pred_phrase = get_phrases_from_posmap(
+             logit > text_threshold, tokenized, tokenlizer
+         )
+
+         if with_logits:
+             pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
+         else:
+             pred_phrases.append(pred_phrase)
+
+     return boxes_filt, pred_phrases