# Spaces: liruiw/hma (Running on Zero)
#
# File size: 19,902 Bytes

import cv2
import torch
import numpy as np
import einops
import skimage
import time

from genie.st_mask_git import STMaskGIT
from genie.st_mar import STMAR
from datasets.utils import get_image_encoder
from data import DATA_FREQ_TABLE
from train_diffusion import SVD_SCALE

from typing import Optional, Tuple, Callable, Dict


class Simulator:
    """
    Common interface of every simulator in this file.

    Each member below is a placeholder that raises `NotImplementedError`;
    concrete simulators override the ones they support.
    """

    def set_initial_state(self, state):
        """
        Provide the initial state of the simulated scene, e.g.
        1. in robomimic, it's the scene state vector
        2. in genie, it's the initial frames to prompt the model
        """
        raise NotImplementedError()

    @torch.inference_mode()
    def step(self, action):
        """Advance the simulation by one step given `action`."""
        raise NotImplementedError()

    def reset(self):
        """Reset the simulator (subclasses in this file return the current frame)."""
        raise NotImplementedError()

    def close(self):
        """Release whatever the simulator holds."""
        raise NotImplementedError()

    @property
    def dt(self):
        """Time step reported by the simulator."""
        raise NotImplementedError()


class PhysicsSimulator(Simulator):
    """
    Interface of simulators backed by a physics engine.

    On top of `Simulator`, it declares a settable time step, access to the
    raw scene state and the action dimension. All of them are placeholders
    raising `NotImplementedError`.
    """

    def __init__(self):
        super().__init__()

    def set_dt(self, dt):
        """Update the time step; a physics engine should be able to do so."""
        raise NotImplementedError()

    def get_raw_state(self, port: Optional[str] = None):
        """
        Return the scene state, e.g., robot joint positions, object
        positions, etc.
        """
        raise NotImplementedError()

    @property
    def action_dimension(self):
        """Dimension of a single action."""
        raise NotImplementedError()
    

class LearnedSimulator(Simulator):
    """Marker base class for simulators driven by a learned model."""

    def __init__(self):
        # nothing to set up here; defer to the `Simulator` constructor
        super().__init__()


# Replayed data respects physics, so ReplaySimulator inherits from PhysicsSimulator;
# it can be considered a special case of PhysicsSimulator.
class ReplaySimulator(PhysicsSimulator):
    """
    Simulator that replays a pre-recorded sequence of frames.

    The first `prompt_horizon` frames form the prompt; every `step` call
    returns the next recorded frame and ignores the action it is given.
    """

    def __init__(self, 
        frames, 
        prompt_horizon: int = 0, 
        dt: Optional[float] = None
    ):
        """
        frames: indexable, sliceable sequence of frames with a length
        prompt_horizon: number of leading frames that form the prompt
        dt: time step reported by the `dt` property (may be None)
        """
        super().__init__()
        self.frames = frames
        self.frame_idx = prompt_horizon
        assert self.frame_idx < len(self.frames), \
            "prompt_horizon must be smaller than the number of frames."
        self._dt = dt
        self.prompt_horizon = prompt_horizon
    
    def __len__(self):
        # number of frames that can be replayed after the prompt
        return len(self.frames) - self.prompt_horizon
    
    def step(self, action):
        """
        Return the next recorded frame as {'pred_next_frame': frame}.

        `action` is ignored. Raises IndexError once the recording is
        exhausted.
        """
        # check before indexing so an exhausted replay fails with a clear message
        if self.frame_idx >= len(self.frames):
            raise IndexError(
                f"Replay exhausted: no frame at index {self.frame_idx} "
                f"(only {len(self.frames)} frames recorded)."
            )
        frame = self.frames[self.frame_idx]
        self.frame_idx = self.frame_idx + 1
        return {
            'pred_next_frame': frame
        }
    
    def reset(self):    # return current frame = last frame of prompt
        """
        Rewind to the end of the prompt and return the current frame.

        The current frame is the last prompt frame. With an empty prompt
        (the default `prompt_horizon=0`) there is no such frame, so the first
        recorded frame is returned instead of failing on an empty slice.
        """
        self.frame_idx = self.prompt_horizon
        prompt_frames = self.prompt()
        if len(prompt_frames) == 0:
            return self.frames[0]
        return prompt_frames[-1]
    
    def prompt(self):
        """Return the leading `prompt_horizon` frames."""
        return self.frames[:self.prompt_horizon]
    
    @property
    def dt(self):
        """Time step given at construction."""
        return self._dt
    

 

class GenieSimulator(LearnedSimulator):
    """
    Learned simulator that predicts the next frame with a generative backbone.

    Frames are encoded to latents by an image encoder, the backbone
    (`STMaskGIT` or `STMAR`) generates the next latent from a fixed-length
    history of latents and actions, and the result is decoded back to an image.
    An optional ground-truth physics simulator can be stepped alongside to
    provide reference frames, PSNR metrics and teacher forcing.
    """

    # number of random-action rollouts averaged when computing delta PSNR
    average_delta_psnr_over = 5

    def __init__(self,
        # image preprocessing
        max_image_resolution: int = 1024,
        resize_image: bool = True,
        resize_image_resolution: int = 256,
        # tokenizer setting
        image_encoder_type: str = "temporalvae",
        image_encoder_ckpt: str = "stabilityai/stable-video-diffusion-img2vid",
        quantize: bool = False,
        quantization_slice_size: int = 16,
        # dynamics backbone setting
        backbone_type: str = "stmar",
        backbone_ckpt: str = "data/mar_ckpt/robomimic",
        prompt_horizon: int = 11,
        inference_iterations: Optional[int] = None,
        sampling_temperature: float = 0.0,
        action_stride: Optional[int] = None,
        domain: str = "robomimic",
        genie_frequency: int = 2,
        # misc
        measure_step_time: bool = False,
        compute_psnr: bool = False,
        compute_delta_psnr: bool = False,   # act as a signal for controlability
        gaussian_action_perturbation_scale: Optional[float] = None,
        device: str = 'cuda',
        physics_simulator: Optional[PhysicsSimulator] = None,
        physics_simulator_teacher_force: Optional[int] = None,
        post_processor: Optional[Callable] = None, # on the predicted image, e.g., add action
        allow_external_prompt: bool = False
    ):
        super().__init__()

        assert quantize == (image_encoder_type == "magvit"), \
            "Currently quantization if and only if magvit is the image encoder."
        assert image_encoder_type in ["magvit", "temporalvae"], \
            "Image encoder type must be either 'magvit' or 'temporalvae'."
        assert not quantize or image_encoder_type == "magvit", \
            "If quantize is enabled, image encoder type must be 'magvit'."
        assert backbone_type in ["stmaskgit", "stmar"], \
            "Backbone type must be either 'stmaskgit' or 'stmar'."
        if physics_simulator is None:
            assert physics_simulator_teacher_force is None, \
                "Physics simulator teacher force is only available when physics simulator is provided."
            assert compute_psnr is False, \
                "PSNR computation is only available when physics simulator is provided."
            assert compute_delta_psnr is False, \
                "Delta PSNR computation is only available when physics simulator is provided."

        if action_stride is None:
            action_stride = DATA_FREQ_TABLE[domain] // genie_frequency
        if compute_delta_psnr:
            compute_psnr = True # to compute delta psnr, psnr must be computed
        if inference_iterations is None:
            if backbone_type == "stmaskgit":
                inference_iterations = 2
            elif backbone_type == "stmar":
                inference_iterations = 2

        # misc     
        self.device = torch.device(device)
        self.measure_step_time = measure_step_time
        self.compute_psnr = compute_psnr
        self.compute_delta_psnr = compute_delta_psnr
        self.allow_external_prompt = allow_external_prompt

        # image preprocessing
        self.max_image_resolution = max_image_resolution
        self.resize_image = resize_image
        self.resize_image_resolution = resize_image_resolution

        # load image encoder
        self.image_encoding_dtype = torch.bfloat16
        self.quantize = quantize
        self.quant_slice_size = quantization_slice_size
        self.image_encoder_type = image_encoder_type
        self.image_encoder = get_image_encoder(
            image_encoder_type,
            image_encoder_ckpt
            ).to(device=self.device, dtype=self.image_encoding_dtype).eval()

        # load STMaskGIT model (STMAR is inherited from STMaskGIT)
        self.prompt_horizon = prompt_horizon
        self.domain = domain
        self.genie_frequency = genie_frequency
        self.inference_iterations = inference_iterations
        self.sampling_temperature = sampling_temperature
        self.action_stride = action_stride
        self.gauss_act_perturb_scale = gaussian_action_perturbation_scale
        self.backbone_type = backbone_type
        if backbone_type == "stmaskgit":
            self.backbone = STMaskGIT.from_pretrained(backbone_ckpt)
        else:
            self.backbone = STMAR.from_pretrained(backbone_ckpt)
        self.backbone = self.backbone.to(device=self.device).eval()
        
        self.post_processor = post_processor
        
        # load physics simulator if available
        # the phys sim to get ground truth image, 
        # assume the phys sim has aligned prompt frames
        self.gt_phys_sim = physics_simulator
        self.gt_teacher_force = physics_simulator_teacher_force

        # history buffer, i.e., the input to the model
        self.cached_actions = None          # (prompt_horizon, action_stride, A)
        self.cached_latent_frames = None    # (prompt_horizon, ...)
        self.init_prompt = None             # (prompt_frames, prompt_actions)

        self.step_count = 0

        # report model size
        print(
            "================ Model Size Report ================\n"
            f"    encoder size: {sum(p.numel() for p in self.image_encoder.parameters()) / 1e6:.3f}M \n"
            f"    backbone size: {sum(p.numel() for p in self.backbone.parameters()) / 1e6:.3f}M\n"
            "==================================================="
        )


    def set_initial_state(self, state: Tuple[np.ndarray, np.ndarray]):
        if not self.allow_external_prompt and self.gt_phys_sim is not None:
            raise NotImplementedError("Initial state is set by the physics simulator.")
        self.init_prompt = state


    @torch.inference_mode()
    def step(self, action: np.ndarray) -> Dict:
        # action: (action_stride, A) OR (A,)
        # return: (H, W, 3)
        assert self.cached_latent_frames is not None and self.cached_actions is not None, \
            "Model is not prompted yet. Please call `set_initial_state` first."
        
        if action.ndim == 1:
            action = np.tile(action, (self.action_stride, 1))

        # perturb action
        if self.gauss_act_perturb_scale is not None:
            action = np.random.normal(action, self.gauss_act_perturb_scale)

        # encoding
        input_latent_states = torch.cat([
            self.cached_latent_frames,
            torch.zeros_like(self.cached_latent_frames[-1:]),
        ]).unsqueeze(0).to(torch.float32)

        # dtype conversion and mask token
        if self.backbone_type == "stmaskgit":
            input_latent_states = input_latent_states.long()
            input_latent_states[:, self.prompt_horizon] = self.backbone.mask_token_id
        elif self.backbone_type == "stmar":
            input_latent_states[:, self.prompt_horizon] = self.backbone.mask_token

        # dynamics rollout
        action = torch.from_numpy(action).to(device=self.device)
        input_actions = torch.cat([     # (1, prompt_horizon + 1, action_stride * A)
            self.cached_actions,
            action.unsqueeze(0)
        ]).view(1, self.prompt_horizon + 1, -1).to(torch.float32)

        if self.measure_step_time:
            start_time = time.time()
        pred_next_latent_state = self.backbone.maskgit_generate(
            input_latent_states,
            out_t=self.prompt_horizon,
            maskgit_steps=self.inference_iterations,
            temperature=self.sampling_temperature,
            action_ids=input_actions,
            domain=[self.domain]
        )[0].squeeze(0)

        # decoding
        pred_next_frame = self._decode_image(pred_next_latent_state)

        # timing
        if self.measure_step_time:
            end_time = time.time()

        step_result = {'pred_next_frame': pred_next_frame,}
        if self.measure_step_time:
            step_result['step_time'] = end_time - start_time

        # physics simulation
        if self.gt_phys_sim is not None:
            for a in action.cpu().numpy():
                gt_result = self.gt_phys_sim.step(a)
            gt_next_frame = cv2.resize(gt_result['pred_next_frame'], pred_next_frame.shape[:2])
            step_result['gt_next_frame'] = gt_next_frame
            gt_result.pop('pred_next_frame')
            step_result.update(gt_result)

            # gt state observation
            try:
                raw_state = self.gt_phys_sim.get_raw_state()
                step_result.update(raw_state)
            except NotImplementedError:
                pass

            # compute PSNR against ground truth
            if self.compute_psnr:
                psnr = skimage.metrics.peak_signal_noise_ratio(
                    image_true=gt_next_frame / 255., 
                    image_test=pred_next_frame / 255.,
                    data_range=1.0
                )
                step_result['psnr'] = psnr

            # controlability metric
            if self.compute_delta_psnr:
                delta_psnr = 0.0
                for _ in range(self.average_delta_psnr_over):
                    # re-mask the input latent states for masked prediction
                    if self.backbone_type == "stmaskgit":
                        input_latent_states = input_latent_states.long()
                        input_latent_states[:, self.prompt_horizon] = self.backbone.mask_token_id
                    elif self.backbone_type == "stmar":
                        input_latent_states[:, self.prompt_horizon] = self.backbone.mask_token
                    # sample random action from N(0, 1)
                    random_input_actions = torch.randn_like(input_actions)
                    random_pred_next_latent_state = self.backbone.maskgit_generate(
                        input_latent_states,
                        out_t=self.prompt_horizon,
                        maskgit_steps=self.inference_iterations,
                        temperature=self.sampling_temperature,
                        action_ids=random_input_actions,
                        domain=[self.domain],
                        skip_normalization=True
                    )[0].squeeze(0)
                    random_pred_next_frame = self._decode_image(random_pred_next_latent_state)
                    this_delta_psnr = step_result['psnr'] - skimage.metrics.peak_signal_noise_ratio(
                        image_true=gt_next_frame / 255.,
                        image_test=random_pred_next_frame / 255.,
                        data_range=1.0
                    )
                    delta_psnr += this_delta_psnr / self.average_delta_psnr_over
                step_result['delta_psnr'] = delta_psnr

            if self.gt_teacher_force is not None and self.step_count % self.gt_teacher_force == 0:
                pred_next_latent_state = self._encode_image(gt_next_frame)
        
        # update history buffer
        self.cached_latent_frames = torch.cat([
            self.cached_latent_frames[1:], pred_next_latent_state.unsqueeze(0)
        ])
        self.cached_actions = torch.cat([
            self.cached_actions[1:], action.unsqueeze(0)
        ])
        
        # post processing
        if self.post_processor is not None:
            pred_next_frame = self.post_processor(pred_next_frame, action)

        self.step_count += 1

        return step_result
    

    @torch.inference_mode()
    def _encode_image(self, image: np.ndarray) -> torch.Tensor:
        # (H, W, 3)
        image = torch.from_numpy(
            self._normalize_image(image).transpose(2, 0, 1)
            ).to(device=self.device, dtype=self.image_encoding_dtype
            ).unsqueeze(0)
        H, W = image.shape[-2:]

        if self.quantize:
            H //= self.quant_slice_size
            W //= self.quant_slice_size
            _, _, indices, _ = self.image_encoder.encode(image, flip=True)
            indices = einops.rearrange(indices, "(h w) -> h w", h=H, w=W)
            indices = indices.to(torch.int32)
            return indices

        else:
            if self.image_encoder_type == "magvit":
                latent = self.image_encoder.encode_without_quantize(image)
            elif self.image_encoder_type == "temporalvae":
                latent_dist = self.image_encoder.encode(image).latent_dist
                latent = latent_dist.mean
                latent *= SVD_SCALE
                latent = einops.rearrange(latent, "b c h w -> b h w c")
            else:
                pass
            latent = latent.squeeze(0).to(torch.float32)
            return latent


    @torch.inference_mode()
    def _decode_image(self, latent: torch.Tensor) -> np.ndarray:
        # latent can be either quantized indices or raw latent
        # return (H, W, 3)

        latent = latent.to(device=self.device).unsqueeze(0)

        if self.quantize:
            latent = self.image_encoder.quantize.get_codebook_entry(
                einops.rearrange(latent, "b h w -> b (h w)"),
                bhwc=(*latent.shape, self.image_encoder.quantize.codebook_dim)
            ).flip(1)

        latent = latent.to(device=self.device, dtype=self.image_encoding_dtype)
        if self.image_encoder_type == "magvit":
            decoded_image = self.image_encoder.decode(latent)
        elif self.image_encoder_type == "temporalvae":
            latent = einops.rearrange(latent, "b h w c -> b c h w")
            latent /= SVD_SCALE
            # HACK: clip for less visual artifacts
            latent = torch.clamp(latent, -25, 25)
            decoded_image = self.image_encoder.decode(latent, num_frames=1).sample
        decoded_image = decoded_image.squeeze(0).to(torch.float32).detach().cpu().numpy()
        decoded_image = self._unnormalize_image(decoded_image).transpose(1, 2, 0)
        return decoded_image
    

    def _normalize_image(self, image: np.ndarray) -> np.ndarray:
        # (H, W, 3) normalized to [-1, 1]
        # if `resize`, resize the shorter side to `resized_res` 
        #   and then do a center crop

        image = np.asarray(image, dtype=np.float32)
        image /= 255.
        H, W = image.shape[:2]

        # resize if asked
        if self.resize_image:
            resized_res = self.resize_image_resolution  
            if H < W:
                Hnew, Wnew = resized_res, int(resized_res * W / H)
            else:
                Hnew, Wnew = int(resized_res * H / W), resized_res
            image = cv2.resize(image, (Wnew, Hnew))

            # center crop
            H, W = image.shape[:2]
            Hstart = (H - resized_res) // 2
            Wstart = (W - resized_res) // 2
            image = image[Hstart:Hstart + resized_res, Wstart:Wstart + resized_res]

        # resize if resolution is too large
        elif H > self.max_image_resolution or W > self.max_image_resolution:
            if H < W:
                Hnew, Wnew = int(self.max_image_resolution * H / W), self.max_image_resolution
            else:
                Hnew, Wnew = self.max_image_resolution, int(self.max_image_resolution * W / H)
            image = cv2.resize(image, (Wnew, Hnew))

        image = (image * 2 - 1.)
        return image


    def _unnormalize_image(self, image: np.ndarray) -> np.ndarray:
        # (H, W, 3) from [-1, 1] to [0, 255]
        # NOTE: clip happens here
        image = (image + 1.) * 127.5
        image = np.clip(image, 0, 255).astype(np.uint8)
        return image


    def reset(self) -> np.ndarray:
        # if ground truth physics simulator is provided, 
        # return the the side-by-side concatenated image

        # get the initial prompt from the physics simulator if not yet set
        if not self.allow_external_prompt and self.gt_phys_sim is not None:
            image_prompt = np.tile(
                self.gt_phys_sim.reset(), (self.prompt_horizon, 1, 1, 1)
            ).astype(np.uint8)
            action_prompt = np.zeros(
                (self.prompt_horizon, self.action_stride, self.gt_phys_sim.action_dimension)
            ).astype(np.float32)
        else: 
            assert self.init_prompt is not None, "Initial state is not set."
            image_prompt, action_prompt = self.init_prompt

        # standardize the image
        image_prompt = [self._unnormalize_image(self._normalize_image(frame)) for frame in image_prompt]

        current_image = image_prompt[-1]

        action_prompt = torch.from_numpy(action_prompt).to(device=self.device)
        self.cached_actions = action_prompt

        # convert to latent
        self.cached_latent_frames = torch.stack([
            self._encode_image(frame) for frame in image_prompt
        ], axis=0)

        if self.resize_image:
            current_image = cv2.resize(current_image, 
                (self.resize_image_resolution, self.resize_image_resolution))

        if self.gt_phys_sim is not None:
            current_image = np.concatenate([current_image, current_image], axis=1)

        self.step_count = 0

        return current_image


    def close(self):
        if self.gt_phys_sim is not None:
            try:
                self.gt_phys_sim.close()
            except NotImplementedError:
                pass


    @property
    def dt(self):
        return 1.0 / self.genie_frequency