atwang committed on
Commit 17456cf
1 Parent(s): 92d915f

finish documentation of repo

Files changed (3)
  1. README.md +44 -1
  2. app.py +92 -26
  3. requirements.txt +1 -0
README.md CHANGED
@@ -9,7 +9,13 @@ pinned: false
 license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# OPDMulti: Openable Part Detection for Multiple Objects
+[Xiaohao Sun*](https://sun-xh.github.io/), [Hanxiao Jiang*](https://jianghanxiao.github.io/), [Manolis Savva](https://msavva.github.io/), [Angel Xuan Chang](http://angelxuanchang.github.io/)
+
+This repository is intended as a deployment of a demo for the [OPDMulti](https://github.com/3dlg-hcvc/OPDMulti) project.
+Please refer there for more information about the project and implementation.
+
+[arXiv](https://arxiv.org/abs/2303.14087)  [Website](https://3dlg-hcvc.github.io/OPDMulti/)
 
 ## Installation
 
@@ -90,3 +96,40 @@ gradio app.py
 You can view the app on the specified port (usually 7860). To run over an ssh connection, set up port forwarding using
 `-L 7860:localhost:7860` when you create your ssh connection. Note that you will need to install Open3D with headless
 rendering for this to work, as described above.
+
+## Citation
+If you find this code useful, please consider citing:
+```bibtex
+@article{sun2023opdmulti,
+  title={OPDMulti: Openable Part Detection for Multiple Objects},
+  author={Sun, Xiaohao and Jiang, Hanxiao and Savva, Manolis and Chang, Angel Xuan},
+  journal={arXiv preprint arXiv:2303.14087},
+  year={2023}
+}
+
+@article{mao2022multiscan,
+  title={MultiScan: Scalable RGBD scanning for 3D environments with articulated objects},
+  author={Mao, Yongsen and Zhang, Yiming and Jiang, Hanxiao and Chang, Angel and Savva, Manolis},
+  journal={Advances in Neural Information Processing Systems},
+  volume={35},
+  pages={9058--9071},
+  year={2022}
+}
+
+@inproceedings{jiang2022opd,
+  title={OPD: Single-view 3D openable part detection},
+  author={Jiang, Hanxiao and Mao, Yongsen and Savva, Manolis and Chang, Angel X},
+  booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXIX},
+  pages={410--426},
+  year={2022},
+  organization={Springer}
+}
+
+@inproceedings{cheng2022masked,
+  title={Masked-attention mask transformer for universal image segmentation},
+  author={Cheng, Bowen and Misra, Ishan and Schwing, Alexander G and Kirillov, Alexander and Girdhar, Rohit},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={1290--1299},
+  year={2022}
+}
+```
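As a concrete example of the port-forwarding note above (username and hostname hypothetical): connecting with `ssh -L 7860:localhost:7860 user@remote-host` makes the remote Gradio server reachable at `http://localhost:7860` on the local machine.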
app.py CHANGED
@@ -3,11 +3,12 @@ import re
 import shutil
 import time
 from types import SimpleNamespace
-from typing import Any
+from typing import Any, Callable, Generator, Optional
 
 import gradio as gr
 import numpy as np
 from detectron2 import engine
+from natsort import natsorted
 from PIL import Image
 
 from inference import main, setup_cfg
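The `natsorted` import added above matters for ordering generated frames: Python's built-in `sorted` is lexicographic, so `frame_10.png` sorts before `frame_2.png`. A quick illustration (filenames hypothetical):

```python
from natsort import natsorted

frames = ["frame_10.png", "frame_2.png", "frame_1.png"]
print(sorted(frames))     # ['frame_1.png', 'frame_10.png', 'frame_2.png'] (lexicographic)
print(natsorted(frames))  # ['frame_1.png', 'frame_2.png', 'frame_10.png'] (natural order)
```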
@@ -16,7 +17,7 @@ from inference import main, setup_cfg
 NUM_PROCESSES = 1
 CROP = False
 SCORE_THRESHOLD = 0.8
-MAX_PARTS = 5
+MAX_PARTS = 5  # TODO: we can replace this by having a slider and a single image visualization component rather than multiple components
 ARGS = SimpleNamespace(
     config_file="configs/coco/instance-segmentation/swin/opd_v1_real.yaml",
     model=".data/models/motion_state_pred_opdformerp_rgb.pth",
@@ -26,25 +27,49 @@ ARGS
 )
 NUM_SAMPLES = 10
 
-outputs = {}
-
-
-def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
+# This variable holds the current state of results, as the user will need to be able to "reload" the results in order
+# to visualize the demo again. The output images are cached by the temporary path of the image, meaning that multiple
+# users should be able to run the demo simultaneously. Gradio should be able to handle the case where multiple distinct
+# images are uploaded with the same name, as I believe the caching of the temp path is based on the base64 encoding,
+# not the filename itself.
+# TODO: right now there is no gc system for outputs, which means that if there is enough traffic per unit time (such
+# that all outputs are generated on the same system instantiation of the code), the RAM could max out; note also that
+# this is not designed to run on GPU, so the model will also need to be stored in CPU memory. Solutions could include
+# 1. a caching design to remove old results periodically, especially if the image is reset;
+# 2. caching results on disk rather than in memory, since the cap is higher; or
+# 3. figuring out some way to cache results in the browser instead of the backend (couldn't figure out a way to do
+#    this earlier).
+outputs: dict[str, list[list[Image.Image]]] = {}
+
+
+def predict(rgb_image: str, depth_image: str, intrinsic: np.ndarray, num_samples: int) -> list[Any]:
+    """
+    Run model on input image and generate output visualizations.
+
+    :param rgb_image: local path to RGB image file, used for model prediction and visualization
+    :param depth_image: local path to depth image file, used for visualization
+    :param intrinsic: array of dimension (3, 3) representing the intrinsic matrix of the camera
+    :param num_samples: number of visualization states to generate
+    :return: list of updates to image components, each showing the first image of a part's visualization sequence or
+        otherwise hiding the component
+    """
     global outputs
 
     def find_images(path: str) -> dict[str, list[str]]:
-        """Scrape folders for all generated gif files."""
+        """Scrape folders for all generated image files."""
         images = {}
         for file in os.listdir(path):
             sub_path = os.path.join(path, file)
             if os.path.isdir(sub_path):
                 images[file] = []
-                for image_file in sorted(os.listdir(sub_path)):
+                for image_file in natsorted(os.listdir(sub_path)):
                     if re.match(r".*\.png$", image_file):
                         images[file].append(os.path.join(sub_path, image_file))
         return images
 
     # clear old predictions
+    # TODO: might be a better place for this than at the beginning of every invocation
     os.makedirs(ARGS.output, exist_ok=True)
     for path in os.listdir(ARGS.output):
         full_path = os.path.join(ARGS.output, path)
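The first TODO option above (periodic eviction) could look roughly like the sketch below; `OUTPUT_TTL_S`, the `last_access` bookkeeping, and `evict_stale_outputs` are hypothetical names, not part of the app:

```python
import time

OUTPUT_TTL_S = 15 * 60  # hypothetical: drop cached results after 15 minutes
last_access: dict[str, float] = {}  # rgb_image temp path -> last time its results were viewed


def evict_stale_outputs(outputs: dict) -> None:
    """Remove cached image sequences that have not been touched recently."""
    now = time.time()
    for key in [k for k, t in last_access.items() if now - t > OUTPUT_TTL_S]:
        outputs.pop(key, None)
        last_access.pop(key, None)
```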
@@ -61,8 +86,8 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
         gr.Error("You must provide a depth image before running the model.")
         return [None] * 5
 
+    # run model
     cfg = setup_cfg(ARGS)
-
     engine.launch(
         main,
         NUM_PROCESSES,
@@ -70,7 +95,7 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
         cfg,
         rgb_image,
         depth_image,
-        intrinsics,
+        intrinsic,
         num_samples,
         CROP,
         SCORE_THRESHOLD,
@@ -82,7 +107,7 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
     outputs[rgb_image] = []
     image_files = find_images(ARGS.output)
     for count, part in enumerate(image_files):
-        if count < MAX_PARTS:
+        if count < MAX_PARTS:  # only visualize up to MAX_PARTS parts
             outputs[rgb_image].append([Image.open(im) for im in image_files[part]])
 
     return [
@@ -91,19 +116,47 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
     ]
 
 
-def get_trigger(idx: int, fps: int = 40, oscillate: bool = True):
-    def iter_images(rgb_image: str):
+def get_trigger(
+    idx: int, fps: int = 25, oscillate: bool = True
+) -> Callable[[str], Generator[Image.Image, None, None]]:
+    """
+    Return event listener trigger function for image component to animate image sequence.
+
+    :param idx: index of part to animate from output
+    :param fps: approximate rate at which images should be cycled through, in frames per second. Note that the fps
+        cannot be higher than the rate at which images can be returned and rendered. Defaults to 25
+    :param oscillate: if True, animates part in reverse after running from start to end. Defaults to True
+    """
+
+    def iter_images(rgb_image: str) -> Generator[Image.Image, None, None]:
+        """Iterator yielding the sequence of images to render, keyed by the temp RGB image path."""
+        start_time = time.time()
+
+        def wait_until_next_frame(frame_count: int) -> None:
+            """Wait until the appropriate time per the specified fps, relative to the start time of iteration."""
+            time_to_sleep = frame_count / fps - (time.time() - start_time)
+            if time_to_sleep <= 0:
+                print("[WARNING] frames cannot be rendered at the specified FPS due to processing/rendering time.")
+            else:
+                time.sleep(time_to_sleep)
+
         if not rgb_image or rgb_image not in outputs:
             gr.Warning("You must upload an image and run the model before you can view the output.")
 
         elif idx < len(outputs[rgb_image]):
+            frame_count = 0
+
+            # iterate forward
             for im in outputs[rgb_image][idx]:
-                time.sleep(1.0 / fps)
+                wait_until_next_frame(frame_count)
                 yield im
+                frame_count += 1
+
+            # iterate in reverse
             if oscillate:
                 for im in reversed(outputs[rgb_image][idx]):
-                    time.sleep(1.0 / fps)
+                    wait_until_next_frame(frame_count)
                     yield im
+                    frame_count += 1
 
         else:
             gr.Error("Could not find any images to load into this module.")
@@ -112,6 +165,9 @@ def get_trigger(idx: int, fps: int = 40, oscillate: bool = True):
 
 
-def clear_outputs():
+def clear_outputs(_rgb_image: str):
+    """
+    Remove images from image components. The input value wired in by the change events below is ignored.
+    """
     return [gr.update(value=None, visible=(idx == 0)) for idx in range(MAX_PARTS)]
 
 
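The pattern here, also used in `predict`'s return value, is Gradio's dictionary-style component update: `gr.update(...)` returns a set of property changes rather than a new component, which is how the app reveals only as many Part slots as there are detected parts. A minimal sketch of the same idea:

```python
import gradio as gr

MAX_PARTS = 5


def show_n_slots(n: int):
    """Return per-component updates: reveal the first n image slots, clear and hide the rest."""
    return [gr.update(value=None, visible=(idx < n)) for idx in range(MAX_PARTS)]
```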
@@ -119,12 +175,19 @@ with gr.Blocks() as demo:
     gr.Markdown(
         """
         # OPDMulti Demo
-        Upload an image to see its range of motion.
+        We tackle the openable-part-detection (OPD) problem, where we identify openable parts and their motion parameters in a single-view image. Our OPDFORMER architecture outputs segmentations for openable parts on potentially multiple objects, along with each part's motion parameters: motion type (translation or rotation, indicated by a blue or purple mask) and motion axis and origin (see the green arrows and points). For each openable part, we predict the motion parameters (axis and origin) in object coordinates, along with an object pose prediction to convert to camera coordinates.
+
+        More information about the project, including code, can be found [here](https://3dlg-hcvc.github.io/OPDMulti/).
+
+        Upload an image to see a visualization of its range of motion below. Only the RGB image is needed for the model itself, but the depth image is currently required to visualize the motion.
+
+        If you know the intrinsic matrix of your camera, you can specify it here; otherwise, the default matrix will work with any of the provided examples.
+
+        You can also change the number of samples, which defines the number of states in the generated visualization.
         """
     )
 
-    # TODO: add gr.Examples
-
+    # inputs
     with gr.Row():
         rgb_image = gr.Image(
             image_mode="RGB", source="upload", type="filepath", label="RGB Image", show_label=True, interactive=True
@@ -133,7 +196,7 @@
         image_mode="I;16", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
     )
 
-    intrinsics = gr.Dataframe(
+    intrinsic = gr.Dataframe(
         value=[
             [
                 214.85935872395834,
@@ -155,7 +218,7 @@
         col_count=(3, "fixed"),
         datatype="number",
         type="numpy",
-        label="Intrinsics matrix",
+        label="Intrinsic matrix",
         show_label=True,
         interactive=True,
     )
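For reference, a camera intrinsic matrix has the standard 3x3 form with focal lengths fx, fy and principal point cx, cy. The demo's default Dataframe uses fx = 214.85935872395834; the values below are placeholders for illustration, not the app's defaults:

```python
import numpy as np

fx, fy = 214.85935872395834, 214.85935872395834  # fy assumed equal to fx for illustration
cx, cy = 128.0, 96.0                             # hypothetical principal point

K = np.array([
    [fx, 0.0, cx],
    [0.0, fy, cy],
    [0.0, 0.0, 1.0],
])
```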
@@ -169,6 +232,7 @@
         maximum=20,
     )
 
+    # specify examples which can be used to start
     examples = gr.Examples(
         examples=[
             ["examples/59-4860.png", "examples/59-4860_d.png"],
@@ -182,10 +246,12 @@
     )
 
     submit_btn = gr.Button("Run model")
-    explanation = gr.Markdown(value="# Output\nClick on an image to see an animation of the part motion.")
 
-    # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
-    # identified.
+    # output
+    explanation = gr.Markdown(
+        value=f"# Output\nClick on an image to see an animation of the part motion. As of now, only up to {MAX_PARTS} parts can be visualized due to limitations of the visualizer."
+    )
+
     images = [
         gr.Image(type="pil", label=f"Part {idx + 1}", show_download_button=False, visible=(idx == 0))
         for idx in range(MAX_PARTS)
@@ -194,11 +260,11 @@
         image_comp.select(get_trigger(idx), inputs=rgb_image, outputs=image_comp, api_name=False)
 
     # if user changes input, clear output images
-    rgb_image.change(clear_outputs, inputs=[], outputs=images, api_name=False)
-    depth_image.change(clear_outputs, inputs=[], outputs=images, api_name=False)
+    rgb_image.change(clear_outputs, inputs=rgb_image, outputs=images, api_name=False)
+    depth_image.change(clear_outputs, inputs=rgb_image, outputs=images, api_name=False)
 
     submit_btn.click(
-        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=images, api_name=False
+        fn=predict, inputs=[rgb_image, depth_image, intrinsic, num_samples], outputs=images, api_name=False
     )
 
 demo.queue(api_open=False)
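One non-obvious dependency in this wiring: `demo.queue()` is what allows the generator returned by `get_trigger` to stream frames, since Gradio 3.x only supports generator event handlers when queuing is enabled. A minimal sketch of the same streaming setup (component names hypothetical):

```python
import time

import gradio as gr


def count_up():
    """Generator handler: each yield replaces the textbox content live in the browser."""
    for i in range(5):
        time.sleep(0.2)
        yield str(i)


with gr.Blocks() as sketch:
    out = gr.Textbox(label="Stream")
    btn = gr.Button("Go")
    btn.click(count_up, inputs=[], outputs=out)

sketch.queue()  # generator handlers require queuing in Gradio 3.x
# sketch.launch()
```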
requirements.txt CHANGED
@@ -16,3 +16,4 @@ timm==0.9.7
 black==23.9.1
 gradio==3.44.3
 huggingface-hub==0.17.2
+natsort==8.4.0