atwang committed
Commit 5ceacf4
1 Parent(s): 01664b3

semi-working demo for one part

Files changed (6)
  1. .gitignore +3 -0
  2. app.py +60 -24
  3. inference.py +23 -10
  4. mask2former/__init__.py +3 -0
  5. requirements.txt +1 -0
  6. utilities.py +102 -0
.gitignore CHANGED
@@ -1,3 +1,6 @@
 build/
+dist/
+*.egg-info
 venv/
 __pycache__/
+.output/
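(The new entries line up with the rest of this commit: dist/ and *.egg-info are presumably produced by the editable package install added to requirements.txt, and .output/ is the folder the demo writes its predictions to.)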
app.py CHANGED
@@ -1,29 +1,36 @@
 import os
 import re
+import shutil
+import time
 from types import SimpleNamespace
 from typing import Any
 
 import gradio as gr
 import numpy as np
 from detectron2 import engine
+from PIL import Image
 
 from inference import main, setup_cfg
 
 # internal settings
 NUM_PROCESSES = 1
-CROP = False
+CROP = True
 SCORE_THRESHOLD = 0.8
 MAX_PARTS = 5
 ARGS = SimpleNamespace(
     config_file="configs/coco/instance-segmentation/swin/opd_v1_real.yaml",
-    model="...",
+    model="../data/models/motion_state_pred_opdformerp_rgb.pth",
     input_format="RGB",
     output=".output",
     cpu=True,
 )
 
+outputs = []
+
 
 def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
+    global outputs
+
     def find_gifs(path: str) -> list[str]:
         """Scrape folders for all generated gif files."""
         for file in os.listdir(path):
@@ -33,6 +40,36 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
             if re.match(r".*\.gif$", image_file):
                 yield os.path.join(sub_path, image_file)
 
+    def find_images(path: str) -> dict[str, list[str]]:
+        """Scrape folders for all generated png frames, grouped by part."""
+        images = {}
+        for file in os.listdir(path):
+            sub_path = os.path.join(path, file)
+            if os.path.isdir(sub_path):
+                images[file] = []
+                for image_file in sorted(os.listdir(sub_path)):
+                    if re.match(r".*\.png$", image_file):
+                        images[file].append(os.path.join(sub_path, image_file))
+        return images
+
+    def get_generator(images):
+        def gen():
+            while True:
+                for im in images:
+                    time.sleep(0.025)
+                    yield im
+                time.sleep(3)
+
+        return gen
+
+    # clear old predictions
+    for path in os.listdir(ARGS.output):
+        full_path = os.path.join(ARGS.output, path)
+        if os.path.isdir(full_path):
+            shutil.rmtree(full_path)
+        else:
+            os.remove(full_path)
+
     cfg = setup_cfg(ARGS)
 
     engine.launch(
@@ -48,25 +85,22 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
             SCORE_THRESHOLD,
         ),
     )
 
     # process output
     # TODO: may want to select these in decreasing order of score
-    pre_outputs = list(find_gifs(ARGS.output))
-
-    outputs = []
-    for idx in range(MAX_PARTS):  # hide unused components
-        if idx < len(pre_outputs):
-            outputs.append(gr.update(value=pre_outputs[idx], visible=True))
-        else:
-            outputs.append(gr.update(visible=False))
-    return outputs
-
+    image_files = find_images(ARGS.output)
+    output = []
+    for count, part in enumerate(image_files):
+        if count < MAX_PARTS:
+            # output.append(gr.update(value=get_generator([Image.open(im) for im in image_files[part]]), visible=True))
+            output.append(get_generator([Image.open(im) for im in image_files[part]]))
+    # while len(output) < MAX_PARTS:
+    #     output.append(gr.update(visible=False))
 
-def variable_outputs(idx):
-    idx = int(idx)
+    yield from output[0]()
 
 
-with gr.Blocks() as app:
+with gr.Blocks() as demo:
     gr.Markdown(
         """
         # OPDMulti Demo
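With these changes predict is a generator rather than a plain function: each yielded PIL frame replaces the contents of the demo's single gr.Image output, and for now only the first detected part is streamed (yield from output[0](), matching the "semi-working demo for one part" commit message; the module-level outputs / global outputs pair is left unused by this revision). A minimal, self-contained sketch of the same streaming pattern, with synthetic frames standing in for the rendered part animation:

    import time

    import gradio as gr
    from PIL import Image


    def stream_frames():
        # Synthetic stand-in for one part's rendered animation frames.
        frames = [Image.new("RGB", (64, 64), (8 * i, 0, 0)) for i in range(30)]
        while True:  # loop the animation, like get_generator's gen()
            for frame in frames:
                time.sleep(0.025)  # pace the playback
                yield frame  # each yield updates the gr.Image in place
            time.sleep(3)  # hold the last frame before restarting


    with gr.Blocks() as demo:
        play = gr.Button("Play")
        image = gr.Image(type="pil")
        play.click(fn=stream_frames, outputs=image)

    demo.queue()  # generator handlers only stream when the queue is enabled
    demo.launch()

The remaining hunks wire up the UI side of this change.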
 
 
@@ -81,7 +115,7 @@ with gr.Blocks() as app:
         image_mode="RGB", source="upload", type="filepath", label="RGB Image", show_label=True, interactive=True
     )
     depth_image = gr.Image(
-        image_mode="L", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
+        image_mode="I;16", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
     )
 
     intrinsics = gr.Dataframe(
@@ -89,16 +123,16 @@ with gr.Blocks() as app:
             [
                 214.85935872395834,
                 0.0,
-                0.0,
+                125.90160319010417,
             ],
             [
                 0.0,
                 214.85935872395834,
-                0.0,
+                95.13726399739583,
             ],
             [
-                125.90160319010417,
-                95.13726399739583,
+                0.0,
+                0.0,
                 1.0,
             ],
         ],
@@ -124,11 +158,13 @@ with gr.Blocks() as app:
 
     # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
     # identified.
-    outputs = [gr.Image(type="filepath", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    # images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    image = gr.Image(type="pil", visible=True)
 
     # TODO: maybe need to use a queue here so we don't overload the instance
     submit_btn.click(
-        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=outputs, api_name="run_model"
+        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=image, api_name="run_model"
     )
 
-app.launch()
+demo.queue(api_open=False)
+demo.launch()
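One more app.py fix worth noting: the default intrinsics Dataframe now holds the conventional row-major pinhole matrix, with the focal lengths on the diagonal and the principal point (cx, cy) in the last column; the old default was effectively its transpose, with (cx, cy) in the bottom row. A quick sketch of how that matrix projects a camera-space point to pixels (the 3D point is hypothetical; the intrinsics are the demo defaults):

    import numpy as np

    # Demo defaults: fx = fy ≈ 214.86, cx ≈ 125.90, cy ≈ 95.14.
    K = np.array(
        [
            [214.85935872395834, 0.0, 125.90160319010417],
            [0.0, 214.85935872395834, 95.13726399739583],
            [0.0, 0.0, 1.0],
        ]
    )

    point_cam = np.array([0.10, -0.05, 0.80])  # hypothetical point in camera coordinates
    u, v, w = K @ point_cam
    print(u / w, v / w)  # pixel coordinates, about (152.8, 81.7)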
inference.py CHANGED
@@ -40,8 +40,9 @@ from mask2former import (
     add_maskformer2_config,
     add_motionnet_config,
 )
+from utilities import prediction_to_json
 
-# import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint.py, in order to
+# import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint, in order to
 # replicate model loading without the overhead of setting up an OPDTrainer
 
 TORCH_VERSION: tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
@@ -63,6 +64,7 @@ TYPE_CLASSIFICATION = {
 }
 
 POINT_COLOR = [1, 0, 0]  # red for demonstration
+ARROW_COLOR = [0, 1, 0]  # green
 IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
 
 
@@ -614,7 +616,7 @@ def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
         optimal_box = None
 
         # load all images
-        for image_file in os.listdir(images_path):
+        for image_file in sorted(os.listdir(images_path)):
             if image_file.endswith(IMAGE_EXTENSIONS):
                 image_path = os.path.join(images_path, image_file)
                 images.append(Image.open(image_path))
@@ -636,10 +638,10 @@ def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
         )
 
         # apply cropping, if optimal box was found
-        if optimal_box:
-            for im in images:
-                im.crop(optimal_box)
-                im.close()
+        for idx, im in enumerate(images):
+            cropped = im.crop(optimal_box)  # crop() returns a new image rather than modifying im
+            cropped.save(os.path.join(save_path, f"{idx}.png"))
+            im.close()
 
     else:  # trim each image separately
         for image_file in os.listdir(images_path):
@@ -665,6 +667,9 @@ def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "ou
 
     # Read the images using imageio
    images = [imageio.imread(image_file) for image_file in image_files]
+    assert all(
+        images[0].shape == im.shape for im in images
+    ), f"Found some images with a different shape: {[im.shape for im in images]}"
 
     # Save images as a gif
     gif_output_path = f"{image_folder_path}/{gif_filename}"
@@ -710,9 +715,16 @@ def main(
     # run model on data
     logger.info("Running model.")
     prediction = predict(model, inp)[0]  # index 0 since there is only one image
+    pred_instances = prediction["instances"]
+
+    # log results
+    image_id = os.path.splitext(os.path.basename(rgb_image))[0]
+    pred_dict = {"image_id": image_id}
+    instances = pred_instances.to(torch.device("cpu"))
+    pred_dict["instances"] = prediction_to_json(instances, image_id)
+    torch.save(pred_dict, os.path.join(cfg.OUTPUT_DIR, f"{image_id}_prediction.pth"))
 
     # select best prediction to visualize
-    pred_instances = prediction["instances"]
     score_ranking = np.argsort([-1 * pred_instances[i].scores.item() for i in range(len(pred_instances))])
     score_ranking = [idx for idx in score_ranking if pred_instances[int(idx)].scores.item() > score_threshold]
     if len(score_ranking) == 0:
@@ -756,7 +768,7 @@ def main(
 
     # Create a LineSet to visualize the direction vector
     axis_arrow = draw_line(origin, axis_vector + origin)
-    axis_arrow.paint_uniform_color([0, 1, 0])
+    axis_arrow.paint_uniform_color(ARROW_COLOR)
 
     # if USE_GT:
     #     anno_path = f"/localhome/atw7/projects/opdmulti/data/data_demo_dev/59-4860.json"
@@ -807,9 +819,10 @@ def main(
         if not os.path.isdir(output_dir_cropped):
            os.makedirs(output_dir_cropped)
         batch_trim(output_dir, output_dir_cropped, identical=True)
-        create_gif(output_dir_cropped, num_samples)
+        # create_gif(output_dir_cropped, num_samples)
     else:  # leave original dimensions of image as-is
-        create_gif(output_dir, num_samples)
+        # create_gif(output_dir, num_samples)
+        pass
 
 
 if __name__ == "__main__":
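main() now also serializes its raw predictions next to the rendered frames, so results can be inspected without re-running the model. A small sketch of reading one of these dumps back, assuming cfg.OUTPUT_DIR is the demo's .output folder; the image id is hypothetical, and the file name follows the torch.save call above:

    import torch

    # File name follows f"{image_id}_prediction.pth"; "59-4860" is an assumed image id.
    pred_dict = torch.load(".output/59-4860_prediction.pth")

    print(pred_dict["image_id"])
    for inst in pred_dict["instances"]:  # COCO-style dicts from prediction_to_json
        print(inst["category_id"], round(inst["score"], 3), inst["mtype"], inst["maxis"])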
mask2former/__init__.py CHANGED
@@ -4,8 +4,11 @@ from . import modeling
 # config
 from .config import add_maskformer2_config, add_motionnet_config
 
+from .maskformer_model import MaskFormer
+
 __all__ = [
     "modeling",
     "add_maskformer2_config",
     "add_motionnet_config",
+    "MaskFormer",
 ]
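Re-exporting MaskFormer at package level matters mainly for its import side effect: loading the module presumably registers the class with detectron2's META_ARCH_REGISTRY, so importing the package is enough for build_model to resolve the architecture named in the config. A sketch of that flow, following the standard detectron2 pattern with the config path from app.py:

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    # Importing the package pulls in MaskFormer, which registers the meta-architecture.
    from mask2former import add_maskformer2_config, add_motionnet_config

    cfg = get_cfg()
    add_maskformer2_config(cfg)  # add the Mask2Former config keys
    add_motionnet_config(cfg)  # add the OPD motion-prediction config keys
    cfg.merge_from_file("configs/coco/instance-segmentation/swin/opd_v1_real.yaml")

    model = build_model(cfg)  # looks up cfg.MODEL.META_ARCHITECTURE in the registry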
requirements.txt CHANGED
@@ -9,3 +9,4 @@ scikit-learn==1.3.0
 scipy==1.11.2
 timm==0.9.7
 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a
+-e mask2former/modeling/pixel_decoder/ops/
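The new -e line makes pip build and install the custom pixel-decoder ops (multi-scale deformable attention) in editable mode during pip install -r requirements.txt, which is also what produces the dist/ and *.egg-info artifacts now ignored in .gitignore. A smoke test, assuming the extension keeps the module name used by Mask2Former's ops package:

    # Raises ImportError if the editable install did not build the extension.
    import MultiScaleDeformableAttention  # assumed extension module name

    print("deformable attention ops available")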
utilities.py ADDED
@@ -0,0 +1,102 @@
+import numpy as np
+import pycocotools.mask as mask_util
+from detectron2.structures import BoxMode
+
+
+# MotionNet: based on instances_to_coco_json and relevant code in densepose
+def prediction_to_json(instances, img_id: str):
+    """
+    Args:
+        instances (Instances): the output of the model
+        img_id (str): the image id in COCO
+
+    Returns:
+        list[dict]: the results in densepose evaluation format
+    """
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+    # Prediction for MotionNet
+    # mtype = instances.mtype.squeeze(axis=1).tolist()
+
+    # 2.0.3
+    if instances.has("pdim"):
+        pdim = instances.pdim.tolist()
+    if instances.has("ptrans"):
+        ptrans = instances.ptrans.tolist()
+    if instances.has("prot"):
+        prot = instances.prot.tolist()
+
+    mtype = instances.mtype.tolist()
+    morigin = instances.morigin.tolist()
+    maxis = instances.maxis.tolist()
+    mstate = instances.mstate.tolist()
+    mstatemax = instances.mstatemax.tolist()
+    if instances.has("mextrinsic"):
+        mextrinsic = instances.mextrinsic.tolist()
+
+    # if motionstate:
+    #     mstate = instances.mstate.tolist()
+
+    # MotionNet has masks in the annotation
+    # use RLE to encode the masks, because they are too large and take memory
+    # since this evaluator stores outputs of the entire dataset
+    rles = [mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks]
+    for rle in rles:
+        # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+        # json writer which always produces strings cannot serialize a bytestream
+        # unless you decode it. Thankfully, utf-8 works out (which is also what
+        # the pycocotools/_mask.pyx does).
+        rle["counts"] = rle["counts"].decode("utf-8")
+
+    results = []
+    for k in range(len(instances)):
+        if instances.has("pdim"):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "pdim": pdim[k],
+                "ptrans": ptrans[k],
+                "prot": prot[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        elif instances.has("mextrinsic"):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mextrinsic": mextrinsic[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        else:
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        # if motionstate:
+        #     result["mstate"] = mstate[k]
+        results.append(result)
+    return results
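prediction_to_json stores each mask as COCO RLE with the counts byte-stream decoded to utf-8 so the dicts stay json-serializable. Recovering a binary mask from a stored result just reverses that step; a small sketch (rle_to_mask is a hypothetical helper, not part of this commit):

    import numpy as np
    import pycocotools.mask as mask_util


    def rle_to_mask(segmentation: dict) -> np.ndarray:
        """Invert the RLE encoding produced by prediction_to_json."""
        rle = dict(segmentation)  # shallow copy so the stored dict is untouched
        if isinstance(rle["counts"], str):
            rle["counts"] = rle["counts"].encode("utf-8")  # undo the json-friendly decode
        return mask_util.decode(rle)  # (H, W) uint8 array, 1 inside the mask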