"""
Demo for template-free reconstruction

python demo.py model=ho-attn run.image_path=/BS/xxie-2/work/HDM/outputs/000000017450/k1.color.jpg run.job=sample model.predict_binary=True dataset.std_coverage=3.0
"""
import sys, os
import os.path as osp

from tqdm import tqdm
from glob import glob

sys.path.append(os.getcwd())
import hydra
import torch
import numpy as np
import imageio
from torch.utils.data import DataLoader
from pytorch3d.datasets import collate_batched_meshes
from pytorch3d.structures import Pointclouds
from pytorch3d.renderer import PerspectiveCameras, look_at_view_transform
from pytorch3d.io import IO
import torchvision.transforms.functional as TVF
from huggingface_hub import hf_hub_download

import training_utils
from configs.structured import ProjectConfig
from dataset.demo_dataset import DemoDataset
from model import CrossAttenHODiffusionModel, ConditionalPCDiffusionSeparateSegm
from render.pyt3d_wrapper import PcloudRenderer


class DemoRunner:
    def __init__(self, cfg: ProjectConfig):
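        # Two-stage pipeline: stage 1 diffuses the combined human+object point cloud
        # and additionally predicts a per-point binary human/object label; stage 2
        # refines the human and object clouds separately via cross attention.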
        cfg.model.model_name, cfg.model.predict_binary = 'pc2-diff-ho-sepsegm', True
        model_stage1 = ConditionalPCDiffusionSeparateSegm(**cfg.model)
        cfg.model.model_name, cfg.model.predict_binary = 'diff-ho-attn', False # stage 2 does not predict segmentation
        model_stage2 = CrossAttenHODiffusionModel(**cfg.model)

        # Load pretrained checkpoints from the Hugging Face Hub.
        # To load local checkpoints instead, use e.g.:
        # ckpt_file1 = os.path.join(cfg.run.code_dir_abs, f'outputs/{cfg.run.stage1_name}/single/checkpoint-latest.pth')
        # self.load_checkpoint(ckpt_file1, model_stage1)
        # ckpt_file2 = os.path.join(cfg.run.code_dir_abs, f'outputs/{cfg.run.stage2_name}/single/checkpoint-latest.pth')
        # self.load_checkpoint(ckpt_file2, model_stage2)
        ckpt_file1 = hf_hub_download("xiexh20/HDM-models", f'{cfg.run.stage1_name}.pth')
        self.load_checkpoint(ckpt_file1, model_stage1)
        ckpt_file2 = hf_hub_download("xiexh20/HDM-models", f'{cfg.run.stage2_name}.pth')
        self.load_checkpoint(ckpt_file2, model_stage2)

        self.model_stage1, self.model_stage2 = model_stage1, model_stage2
        self.model_stage1.eval()
        self.model_stage2.eval()
        self.model_stage1.to('cuda')
        self.model_stage2.to('cuda')

        self.cfg = cfg
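        # PyTorch3D IO helper, used to save predicted point clouds as .ply files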
        self.io_pc = IO()

        # For visualization
        self.renderer = PcloudRenderer(image_size=cfg.dataset.image_size, radius=0.0075)
        self.rend_size = cfg.dataset.image_size
        self.device = 'cuda'

    def load_checkpoint(self, ckpt_file, model, device='cpu'):
        checkpoint = torch.load(ckpt_file, map_location=device)
        state_dict = checkpoint['model']
        if any(k.startswith('module.') for k in state_dict.keys()):
            # strip the DataParallel/DistributedDataParallel prefix
            state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
            print('Removed "module." from checkpoint state dict')
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        print(f'Loaded model checkpoint from {ckpt_file}')
        if len(missing_keys):
            print(f' - Missing keys: {missing_keys}')
        if len(unexpected_keys):
            print(f' - Unexpected keys: {unexpected_keys}')

    def reload_checkpoint(self, cat_name):
        """Load checkpoints of models fine-tuned on a specific category."""
        ckpt_file1 = hf_hub_download("xiexh20/HDM-models", f'{self.cfg.run.stage1_name}-{cat_name}.pth')
        self.load_checkpoint(ckpt_file1, self.model_stage1, device=self.device)
        ckpt_file2 = hf_hub_download("xiexh20/HDM-models", f'{self.cfg.run.stage2_name}-{cat_name}.pth')
        self.load_checkpoint(ckpt_file2, self.model_stage2, device=self.device)

    @torch.no_grad()
    def run(self):
        "simply run the demo on given images, and save the results"
        # Set random seed
        training_utils.set_seed(self.cfg.run.seed)

        outdir = osp.join(self.cfg.run.code_dir_abs, 'outputs/demo')
        os.makedirs(outdir, exist_ok=True)
        cfg = self.cfg

        # Init data: cfg.run.image_path may be a glob pattern matching multiple images
        image_files = sorted(glob(cfg.run.image_path))
        data = DemoDataset(image_files,
                           (cfg.dataset.image_size, cfg.dataset.image_size),
                           cfg.dataset.std_coverage)
        dataloader = DataLoader(data, batch_size=cfg.dataloader.batch_size,
                                collate_fn=collate_batched_meshes,
                                num_workers=1, shuffle=False)
        progress_bar = tqdm(dataloader)
        for batch_idx, batch in enumerate(progress_bar):
            progress_bar.set_description(f'Processing batch {batch_idx:4d} / {len(dataloader):4d}')

            out_stage1, out_stage2 = self.forward_batch(batch, cfg)

            bs = len(out_stage1)
            camera_full = PerspectiveCameras(
                R=torch.stack(batch['R']),
                T=torch.stack(batch['T']),
                K=torch.stack(batch['K']),
                device='cuda',
                in_ndc=True)
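            # collate_batched_meshes keeps per-sample camera tensors in Python lists,
            # hence the torch.stack calls above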

            # save output
            for i in range(bs):
                image_path = str(batch['image_path'][i])
                folder, fname = osp.basename(osp.dirname(image_path)), osp.splitext(osp.basename(image_path))[0]
                out_i = osp.join(outdir, folder)
                os.makedirs(out_i, exist_ok=True)
                self.io_pc.save_pointcloud(data=out_stage1[i],
                                           path=osp.join(out_i, f'{fname}_stage1.ply'))
                self.io_pc.save_pointcloud(data=out_stage2[i],
                                           path=osp.join(out_i, f'{fname}_stage2.ply'))
                TVF.to_pil_image(batch['images'][i]).save(osp.join(out_i, f'{fname}_input.png'))

                # Save metadata as well
                metadata = dict(index=i,
                                camera=camera_full[i],
                                image_size_hw=batch['image_size_hw'][i],
                                image_path=batch['image_path'][i])
                torch.save(metadata, osp.join(out_i, f'{fname}_meta.pth'))

                # Visualize
                pc_comb = Pointclouds([out_stage1[i].points_packed(), out_stage2[i].points_packed()],
                                      features=[out_stage1[i].features_packed(), out_stage2[i].features_packed()])
                video_file = osp.join(out_i, f'{fname}_360view.mp4')
                video_writer = imageio.get_writer(video_file, format='FFMPEG', mode='I', fps=1)

                # first render front view
                rend_stage1, _ = self.renderer.render(out_stage1[i], camera_full[i], mode='mask')
                rend_stage2, _ = self.renderer.render(out_stage2[i], camera_full[i], mode='mask')
                comb = np.concatenate([batch['images'][i].permute(1, 2, 0).cpu().numpy(), rend_stage1, rend_stage2], 1)
                video_writer.append_data((comb*255).astype(np.uint8))

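                # then orbit around the scene: azimuth 180..510 deg in 30-deg steps,
                # with the stage-1 and stage-2 results rendered side by side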
                for azim in range(180, 180 + 360, 30):
                    R, T = look_at_view_transform(1.7, 0, azim, up=((0, -1, 0),))
                    # repeat R and T once per cloud in pc_comb (stage 1 + stage 2)
                    side_camera = PerspectiveCameras(
                        image_size=((self.rend_size, self.rend_size),),
                        device=self.device,
                        R=R.repeat(2, 1, 1), T=T.repeat(2, 1),
                        focal_length=self.rend_size * 1.5,
                        principal_point=((self.rend_size / 2., self.rend_size / 2.),),
                        in_ndc=False)
                    rend, _ = self.renderer.render(pc_comb, side_camera, mode='mask')

                    imgs = [batch['images'][i].permute(1, 2, 0).cpu().numpy()]
                    imgs.extend([rend[0], rend[1]])
                    video_writer.append_data((np.concatenate(imgs, 1)*255).astype(np.uint8))
                print(f"Visualization saved to {out_i}")

    @torch.no_grad()
    def forward_batch(self, batch, cfg):
        """
        forward one batch
        :param batch:
        :param cfg:
        :return: predicted point clouds of stage 1 and 2
        """
        camera_full = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(batch['T']),
            K=torch.stack(batch['K']),
            device='cuda',
            in_ndc=True)
        out_stage1 = self.model_stage1.forward_sample(num_points=cfg.dataset.max_points,
                                                      camera=camera_full,
                                                      image_rgb=torch.stack(batch['images']).to('cuda'),
                                                      mask=torch.stack(batch['masks']).to('cuda'),
                                                      scheduler=cfg.run.diffusion_scheduler,
                                                      num_inference_steps=cfg.run.num_inference_steps,
                                                      )
        # segment and normalize human/object
        bs = len(out_stage1)
        pred_hum, pred_obj = [], []  # predicted human/object points
        cent_hum_pred, cent_obj_pred = [], []
        radius_hum_pred, radius_obj_pred = [], []
        T_hum, T_obj = [], []
        num_samples = cfg.dataset.max_points // 2  # half of the point budget each for human and object
        for i in range(bs):
            pc: Pointclouds = out_stage1[i]
            vc = pc.features_packed().cpu()  # (P, 3), human is light blue [0.1, 1.0, 1.0], object light green [0.5, 1.0, 0]
            points = pc.points_packed().cpu()  # (P, 3)
            mask_hum = vc[:, 2] > 0.5  # blue channel is ~1.0 for human, ~0.0 for object
            pc_hum, pc_obj = points[mask_hum], points[~mask_hum]
            # Up/Down-sample the points
            pc_obj = self.upsample_predicted_pc(num_samples, pc_obj)
            pc_hum = self.upsample_predicted_pc(num_samples, pc_hum)

            # Normalize: center each cloud and scale it so the farthest point lies at radius 0.5
            cent_hum, cent_obj = torch.mean(pc_hum, 0, keepdim=True), torch.mean(pc_obj, 0, keepdim=True)
            scale_hum = torch.sqrt(torch.sum((pc_hum - cent_hum) ** 2, -1).max())
            scale_obj = torch.sqrt(torch.sum((pc_obj - cent_obj) ** 2, -1).max())
            pc_hum = (pc_hum - cent_hum) / (2 * scale_hum)
            pc_obj = (pc_obj - cent_obj) / (2 * scale_obj)
            # Also update camera parameters for separate human + object
            T_hum_scaled = (batch['T_ho'][i] + cent_hum.squeeze(0)) / (2 * scale_hum)
            T_obj_scaled = (batch['T_ho'][i] + cent_obj.squeeze(0)) / (2 * scale_obj)

            pred_hum.append(pc_hum)
            pred_obj.append(pc_obj)
            cent_hum_pred.append(cent_hum.squeeze(0))
            cent_obj_pred.append(cent_obj.squeeze(0))
            T_hum.append(T_hum_scaled * torch.tensor([-1, -1, 1]))  # apply opencv to pytorch3d transform: flip x and y
            T_obj.append(T_obj_scaled * torch.tensor([-1, -1, 1]))
            radius_hum_pred.append(scale_hum)
            radius_obj_pred.append(scale_obj)
        # Pack data into a new batch dict
        camera_hum = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(T_hum),
            K=torch.stack(batch['K_hum']),
            device='cuda',
            in_ndc=True
        )
        camera_obj = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(T_obj),
            K=torch.stack(batch['K_obj']),  # intrinsics must be human/object specific
            device='cuda',
            in_ndc=True
        )
        # use pc from predicted
        pc_hum = Pointclouds([x.to('cuda') for x in pred_hum])
        pc_obj = Pointclouds([x.to('cuda') for x in pred_obj])
        # use center and radius from predicted
        cent_hum = torch.stack(cent_hum_pred, 0).to('cuda')
        cent_obj = torch.stack(cent_obj_pred, 0).to('cuda')  # B, 3
        radius_hum = torch.stack(radius_hum_pred, 0).to('cuda')  # B, 1
        radius_obj = torch.stack(radius_obj_pred, 0).to('cuda')
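        # Stage 2 samples from an intermediate noise step (sample_from_interm=True,
        # cfg.run.sample_noise_step), conditioned on the stage-1 prediction rather
        # than starting from pure Gaussian noise.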
        out_stage2: Pointclouds = self.model_stage2.forward_sample(
            num_points=num_samples,
            camera=camera_hum,
            image_rgb=torch.stack(batch['images_hum'], 0).to('cuda'),
            mask=torch.stack(batch['masks_hum'], 0).to('cuda'),
            gt_pc=pc_hum,
            rgb_obj=torch.stack(batch['images_obj'], 0).to('cuda'),
            mask_obj=torch.stack(batch['masks_obj'], 0).to('cuda'),
            pc_obj=pc_obj,
            camera_obj=camera_obj,
            cent_hum=cent_hum,
            cent_obj=cent_obj,
            radius_hum=radius_hum.unsqueeze(-1),
            radius_obj=radius_obj.unsqueeze(-1),
            sample_from_interm=True,
            noise_step=cfg.run.sample_noise_step)
        return out_stage1, out_stage2

    def upsample_predicted_pc(self, num_samples, pc):
        """
        Up/downsample a point cloud to the given number of points
        :param num_samples: the target number of points
        :param pc: (N, 3)
        :return: (num_samples, 3)
        """
        if len(pc) > num_samples:
            # downsample without replacement
            ind = np.random.choice(len(pc), num_samples, replace=False)
        else:
            # keep all points, then pad by resampling with replacement
            ind = np.concatenate([np.arange(len(pc)),
                                  np.random.choice(len(pc), num_samples - len(pc))])
        return pc.clone()[torch.from_numpy(ind).long().to(pc.device)]


@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
def main(cfg: ProjectConfig):
    runner = DemoRunner(cfg)
    runner.run()


if __name__ == '__main__':
    main()