atwang committed
Commit 5ceacf4
1 Parent(s): 01664b3

semi-working demo for one part

Files changed (6)
  1. .gitignore +3 -0
  2. app.py +60 -24
  3. inference.py +23 -10
  4. mask2former/__init__.py +3 -0
  5. requirements.txt +1 -0
  6. utilities.py +102 -0
.gitignore CHANGED
@@ -1,3 +1,6 @@
 build/
+dist/
+*.egg-info
 venv/
 __pycache__/
+.output/
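(The new entries line up with the rest of this commit: dist/ and *.egg-info are presumably produced by the editable package install added to requirements.txt, and .output/ is the folder the demo writes its predictions to.)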
app.py CHANGED
@@ -1,29 +1,36 @@
 import os
 import re
+import shutil
+import time
 from types import SimpleNamespace
 from typing import Any
 
 import gradio as gr
 import numpy as np
 from detectron2 import engine
+from PIL import Image
 
 from inference import main, setup_cfg
 
 # internal settings
 NUM_PROCESSES = 1
-CROP = False
+CROP = True
 SCORE_THRESHOLD = 0.8
 MAX_PARTS = 5
 ARGS = SimpleNamespace(
     config_file="configs/coco/instance-segmentation/swin/opd_v1_real.yaml",
-    model="...",
+    model="../data/models/motion_state_pred_opdformerp_rgb.pth",
     input_format="RGB",
     output=".output",
     cpu=True,
 )
 
+outputs = []
+
 
 def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
+    global outputs
+
     def find_gifs(path: str) -> list[str]:
         """Scrape folders for all generated gif files."""
         for file in os.listdir(path):
@@ -33,6 +40,36 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
             if re.match(r".*\.gif$", image_file):
                 yield os.path.join(sub_path, image_file)
 
+    def find_images(path: str) -> dict[str, list[str]]:
+        """Scrape folders for all generated png frames, grouped by part."""
+        images = {}
+        for file in os.listdir(path):
+            sub_path = os.path.join(path, file)
+            if os.path.isdir(sub_path):
+                images[file] = []
+                for image_file in sorted(os.listdir(sub_path)):
+                    if re.match(r".*\.png$", image_file):
+                        images[file].append(os.path.join(sub_path, image_file))
+        return images
+
+    def get_generator(images):
+        def gen():
+            while True:
+                for im in images:
+                    time.sleep(0.025)
+                    yield im
+                time.sleep(3)
+
+        return gen
+
+    # clear old predictions
+    for path in os.listdir(ARGS.output):
+        full_path = os.path.join(ARGS.output, path)
+        if os.path.isdir(full_path):
+            shutil.rmtree(full_path)
+        else:
+            os.remove(full_path)
+
     cfg = setup_cfg(ARGS)
 
     engine.launch(
@@ -48,25 +85,22 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
             SCORE_THRESHOLD,
         ),
     )
 
     # process output
     # TODO: may want to select these in decreasing order of score
-    pre_outputs = list(find_gifs(ARGS.output))
-
-    outputs = []
-    for idx in range(MAX_PARTS):  # hide unused components
-        if idx < len(pre_outputs):
-            outputs.append(gr.update(value=pre_outputs[idx], visible=True))
-        else:
-            outputs.append(gr.update(visible=False))
-    return outputs
-
+    image_files = find_images(ARGS.output)
+    output = []
+    for count, part in enumerate(image_files):
+        if count < MAX_PARTS:
+            # output.append(gr.update(value=get_generator([Image.open(im) for im in image_files[part]]), visible=True))
+            output.append(get_generator([Image.open(im) for im in image_files[part]]))
+    # while len(output) < MAX_PARTS:
+    #     output.append(gr.update(visible=False))
 
-def variable_outputs(idx):
-    idx = int(idx)
+    yield from output[0]()
 
 
-with gr.Blocks() as app:
+with gr.Blocks() as demo:
     gr.Markdown(
         """
         # OPDMulti Demo
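With these changes predict is a generator rather than a plain function: each yielded PIL frame replaces the contents of the demo's single gr.Image output, and for now only the first detected part is streamed (yield from output[0](), matching the "semi-working demo for one part" commit message; the module-level outputs / global outputs pair is left unused by this revision). A minimal, self-contained sketch of the same streaming pattern, with synthetic frames standing in for the rendered part animation:

    import time

    import gradio as gr
    from PIL import Image


    def stream_frames():
        # Synthetic stand-in for one part's rendered animation frames.
        frames = [Image.new("RGB", (64, 64), (8 * i, 0, 0)) for i in range(30)]
        while True:  # loop the animation, like get_generator's gen()
            for frame in frames:
                time.sleep(0.025)  # pace the playback
                yield frame  # each yield updates the gr.Image in place
            time.sleep(3)  # hold the last frame before restarting


    with gr.Blocks() as demo:
        play = gr.Button("Play")
        image = gr.Image(type="pil")
        play.click(fn=stream_frames, outputs=image)

    demo.queue()  # generator handlers only stream when the queue is enabled
    demo.launch()

The remaining hunks wire up the UI side of this change.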
 
 
@@ -81,7 +115,7 @@ with gr.Blocks() as app:
         image_mode="RGB", source="upload", type="filepath", label="RGB Image", show_label=True, interactive=True
     )
     depth_image = gr.Image(
-        image_mode="L", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
+        image_mode="I;16", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
     )
 
     intrinsics = gr.Dataframe(
@@ -89,16 +123,16 @@ with gr.Blocks() as app:
             [
                 214.85935872395834,
                 0.0,
-                0.0,
+                125.90160319010417,
             ],
             [
                 0.0,
                 214.85935872395834,
-                0.0,
+                95.13726399739583,
             ],
             [
-                125.90160319010417,
-                95.13726399739583,
+                0.0,
+                0.0,
                 1.0,
             ],
         ],
@@ -124,11 +158,13 @@ with gr.Blocks() as app:
 
     # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
     # identified.
-    outputs = [gr.Image(type="filepath", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    # images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    image = gr.Image(type="pil", visible=True)
 
     # TODO: maybe need to use a queue here so we don't overload the instance
     submit_btn.click(
-        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=outputs, api_name="run_model"
+        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=image, api_name="run_model"
     )
 
-app.launch()
+demo.queue(api_open=False)
+demo.launch()
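One more app.py fix worth noting: the default intrinsics Dataframe now holds the conventional row-major pinhole matrix, with the focal lengths on the diagonal and the principal point (cx, cy) in the last column; the old default was effectively its transpose, with (cx, cy) in the bottom row. A quick sketch of how that matrix projects a camera-space point to pixels (the 3D point is hypothetical; the intrinsics are the demo defaults):

    import numpy as np

    # Demo defaults: fx = fy ≈ 214.86, cx ≈ 125.90, cy ≈ 95.14.
    K = np.array(
        [
            [214.85935872395834, 0.0, 125.90160319010417],
            [0.0, 214.85935872395834, 95.13726399739583],
            [0.0, 0.0, 1.0],
        ]
    )

    point_cam = np.array([0.10, -0.05, 0.80])  # hypothetical point in camera coordinates
    u, v, w = K @ point_cam
    print(u / w, v / w)  # pixel coordinates, about (152.8, 81.7)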
inference.py CHANGED
@@ -40,8 +40,9 @@ from mask2former import (
     add_maskformer2_config,
     add_motionnet_config,
 )
+from utilities import prediction_to_json
 
-# import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint.py, in order to
+# import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint, in order to
 # replicate model loading without the overhead of setting up an OPDTrainer
 
 TORCH_VERSION: tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
@@ -63,6 +64,7 @@ TYPE_CLASSIFICATION = {
 }
 
 POINT_COLOR = [1, 0, 0]  # red for demonstration
+ARROW_COLOR = [0, 1, 0]  # green
 IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
 
 
@@ -614,7 +616,7 @@ def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
         optimal_box = None
 
         # load all images
-        for image_file in os.listdir(images_path):
+        for image_file in sorted(os.listdir(images_path)):
             if image_file.endswith(IMAGE_EXTENSIONS):
                 image_path = os.path.join(images_path, image_file)
                 images.append(Image.open(image_path))
@@ -636,10 +638,10 @@ def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
         )
 
         # apply cropping, if optimal box was found
-        if optimal_box:
-            for im in images:
-                im.crop(optimal_box)
-                im.close()
+        for idx, im in enumerate(images):
+            cropped = im.crop(optimal_box)  # crop() returns a new image rather than modifying im
+            cropped.save(os.path.join(save_path, f"{idx}.png"))
+            im.close()
 
     else:  # trim each image separately
         for image_file in os.listdir(images_path):
@@ -665,6 +667,9 @@ def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "ou
 
     # Read the images using imageio
    images = [imageio.imread(image_file) for image_file in image_files]
+    assert all(
+        images[0].shape == im.shape for im in images
+    ), f"Found some images with a different shape: {[im.shape for im in images]}"
 
     # Save images as a gif
     gif_output_path = f"{image_folder_path}/{gif_filename}"
@@ -710,9 +715,16 @@ def main(
     # run model on data
     logger.info("Running model.")
     prediction = predict(model, inp)[0]  # index 0 since there is only one image
+    pred_instances = prediction["instances"]
+
+    # log results
+    image_id = os.path.splitext(os.path.basename(rgb_image))[0]
+    pred_dict = {"image_id": image_id}
+    instances = pred_instances.to(torch.device("cpu"))
+    pred_dict["instances"] = prediction_to_json(instances, image_id)
+    torch.save(pred_dict, os.path.join(cfg.OUTPUT_DIR, f"{image_id}_prediction.pth"))
 
     # select best prediction to visualize
-    pred_instances = prediction["instances"]
     score_ranking = np.argsort([-1 * pred_instances[i].scores.item() for i in range(len(pred_instances))])
     score_ranking = [idx for idx in score_ranking if pred_instances[int(idx)].scores.item() > score_threshold]
     if len(score_ranking) == 0:
@@ -756,7 +768,7 @@ def main(
 
     # Create a LineSet to visualize the direction vector
     axis_arrow = draw_line(origin, axis_vector + origin)
-    axis_arrow.paint_uniform_color([0, 1, 0])
+    axis_arrow.paint_uniform_color(ARROW_COLOR)
 
     # if USE_GT:
     #     anno_path = f"/localhome/atw7/projects/opdmulti/data/data_demo_dev/59-4860.json"
@@ -807,9 +819,10 @@ def main(
         if not os.path.isdir(output_dir_cropped):
            os.makedirs(output_dir_cropped)
         batch_trim(output_dir, output_dir_cropped, identical=True)
-        create_gif(output_dir_cropped, num_samples)
+        # create_gif(output_dir_cropped, num_samples)
     else:  # leave original dimensions of image as-is
-        create_gif(output_dir, num_samples)
+        # create_gif(output_dir, num_samples)
+        pass
 
 
 if __name__ == "__main__":
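main() now also serializes its raw predictions next to the rendered frames, so results can be inspected without re-running the model. A small sketch of reading one of these dumps back, assuming cfg.OUTPUT_DIR is the demo's .output folder; the image id is hypothetical, and the file name follows the torch.save call above:

    import torch

    # File name follows f"{image_id}_prediction.pth"; "59-4860" is an assumed image id.
    pred_dict = torch.load(".output/59-4860_prediction.pth")

    print(pred_dict["image_id"])
    for inst in pred_dict["instances"]:  # COCO-style dicts from prediction_to_json
        print(inst["category_id"], round(inst["score"], 3), inst["mtype"], inst["maxis"])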
mask2former/__init__.py CHANGED
@@ -4,8 +4,11 @@ from . import modeling
 # config
 from .config import add_maskformer2_config, add_motionnet_config
 
+from .maskformer_model import MaskFormer
+
 __all__ = [
     "modeling",
     "add_maskformer2_config",
     "add_motionnet_config",
+    "MaskFormer",
 ]
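Re-exporting MaskFormer at package level matters mainly for its import side effect: loading the module presumably registers the class with detectron2's META_ARCH_REGISTRY, so importing the package is enough for build_model to resolve the architecture named in the config. A sketch of that flow, following the standard detectron2 pattern with the config path from app.py:

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    # Importing the package pulls in MaskFormer, which registers the meta-architecture.
    from mask2former import add_maskformer2_config, add_motionnet_config

    cfg = get_cfg()
    add_maskformer2_config(cfg)  # add the Mask2Former config keys
    add_motionnet_config(cfg)  # add the OPD motion-prediction config keys
    cfg.merge_from_file("configs/coco/instance-segmentation/swin/opd_v1_real.yaml")

    model = build_model(cfg)  # looks up cfg.MODEL.META_ARCHITECTURE in the registry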
requirements.txt CHANGED
@@ -9,3 +9,4 @@ scikit-learn==1.3.0
 scipy==1.11.2
 timm==0.9.7
 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a
+-e mask2former/modeling/pixel_decoder/ops/
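The new -e line makes pip build and install the custom pixel-decoder ops (multi-scale deformable attention) in editable mode during pip install -r requirements.txt, which is also what produces the dist/ and *.egg-info artifacts now ignored in .gitignore. A smoke test, assuming the extension keeps the module name used by Mask2Former's ops package:

    # Raises ImportError if the editable install did not build the extension.
    import MultiScaleDeformableAttention  # assumed extension module name

    print("deformable attention ops available")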
utilities.py ADDED
@@ -0,0 +1,102 @@
+import numpy as np
+import pycocotools.mask as mask_util
+from detectron2.structures import BoxMode
+
+
+# MotionNet: based on instances_to_coco_json and relevant code in densepose
+def prediction_to_json(instances, img_id: str):
+    """
+    Args:
+        instances (Instances): the output of the model
+        img_id (str): the image id in COCO
+
+    Returns:
+        list[dict]: the results in densepose evaluation format
+    """
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+    # Prediction for MotionNet
+    # mtype = instances.mtype.squeeze(axis=1).tolist()
+
+    # 2.0.3
+    if instances.has("pdim"):
+        pdim = instances.pdim.tolist()
+    if instances.has("ptrans"):
+        ptrans = instances.ptrans.tolist()
+    if instances.has("prot"):
+        prot = instances.prot.tolist()
+
+    mtype = instances.mtype.tolist()
+    morigin = instances.morigin.tolist()
+    maxis = instances.maxis.tolist()
+    mstate = instances.mstate.tolist()
+    mstatemax = instances.mstatemax.tolist()
+    if instances.has("mextrinsic"):
+        mextrinsic = instances.mextrinsic.tolist()
+
+    # if motionstate:
+    #     mstate = instances.mstate.tolist()
+
+    # MotionNet has masks in the annotation
+    # use RLE to encode the masks, because they are too large and take memory
+    # since this evaluator stores outputs of the entire dataset
+    rles = [mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks]
+    for rle in rles:
+        # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+        # json writer which always produces strings cannot serialize a bytestream
+        # unless you decode it. Thankfully, utf-8 works out (which is also what
+        # the pycocotools/_mask.pyx does).
+        rle["counts"] = rle["counts"].decode("utf-8")
+
+    results = []
+    for k in range(len(instances)):
+        if instances.has("pdim"):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "pdim": pdim[k],
+                "ptrans": ptrans[k],
+                "prot": prot[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        elif instances.has("mextrinsic"):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mextrinsic": mextrinsic[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        else:
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+                "segmentation": rles[k],
+                "mtype": mtype[k],
+                "morigin": morigin[k],
+                "maxis": maxis[k],
+                "mstate": mstate[k],
+                "mstatemax": mstatemax[k],
+            }
+        # if motionstate:
+        #     result["mstate"] = mstate[k]
+        results.append(result)
+    return results
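prediction_to_json stores each mask as COCO RLE with the counts byte-stream decoded to utf-8 so the dicts stay json-serializable. Recovering a binary mask from a stored result just reverses that step; a small sketch (rle_to_mask is a hypothetical helper, not part of this commit):

    import numpy as np
    import pycocotools.mask as mask_util


    def rle_to_mask(segmentation: dict) -> np.ndarray:
        """Invert the RLE encoding produced by prediction_to_json."""
        rle = dict(segmentation)  # shallow copy so the stored dict is untouched
        if isinstance(rle["counts"], str):
            rle["counts"] = rle["counts"].encode("utf-8")  # undo the json-friendly decode
        return mask_util.decode(rle)  # (H, W) uint8 array, 1 inside the mask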