from __future__ import annotations

from functools import partial
from math import ceil
import os

from accelerate.utils import DistributedDataParallelKwargs
from beartype.typing import Tuple, Callable, List
from einops import rearrange, repeat, reduce, pack
from gateloop_transformer import SimpleGateLoopLayer
from huggingface_hub import PyTorchModelHubMixin
import numpy as np
import trimesh
from tqdm import tqdm

import torch
from torch import nn, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from pytorch3d.loss import chamfer_distance
from pytorch3d.transforms import euler_angles_to_matrix

from x_transformers import Decoder
from x_transformers.x_transformers import LayerIntermediates
from x_transformers.autoregressive_wrapper import eval_decorator

from .michelangelo import ShapeConditioner as ShapeConditioner_miche
from .utils import (
    discretize,
    undiscretize,
    set_module_requires_grad_,
    default,
    exists,
    safe_cat,
    identity,
    is_tensor_empty,
)
from .utils.typing import Float, Int, Bool, typecheck

# constants

DEFAULT_DDP_KWARGS = DistributedDataParallelKwargs(
    find_unused_parameters = True
)

SHAPE_CODE = {
    'CubeBevel': 0,
    'SphereSharp': 1,
    'CylinderSharp': 2,
}

BS_NAME = {
    0: 'CubeBevel',
    1: 'SphereSharp',
    2: 'CylinderSharp',
}

# FiLM block

class FiLM(Module):
    def __init__(self, dim, dim_out = None):
        super().__init__()
        dim_out = default(dim_out, dim)

        self.to_gamma = nn.Linear(dim, dim_out, bias = False)
        self.to_beta = nn.Linear(dim, dim_out)

        self.gamma_mult = nn.Parameter(torch.zeros(1,))
        self.beta_mult = nn.Parameter(torch.zeros(1,))

    def forward(self, x, cond):
        gamma, beta = self.to_gamma(cond), self.to_beta(cond)
        gamma, beta = tuple(rearrange(t, 'b d -> b 1 d') for t in (gamma, beta))

        # for initializing to identity

        gamma = (1 + self.gamma_mult * gamma.tanh())
        beta = beta.tanh() * self.beta_mult

        # classic film

        return x * gamma + beta

# gateloop layers

class GateLoopBlock(Module):
    def __init__(
        self,
        dim,
        *,
        depth,
        use_heinsen = True
    ):
        super().__init__()
        self.gateloops = ModuleList([])

        for _ in range(depth):
            gateloop = SimpleGateLoopLayer(dim = dim, use_heinsen = use_heinsen)
            self.gateloops.append(gateloop)

    def forward(
        self,
        x,
        cache = None
    ):
        received_cache = exists(cache)

        if is_tensor_empty(x):
            return x, None

        if received_cache:
            prev, x = x[:, :-1], x[:, -1:]

        cache = default(cache, [])
        cache = iter(cache)

        new_caches = []
        for gateloop in self.gateloops:
            layer_cache = next(cache, None)
            out, new_cache = gateloop(x, cache = layer_cache, return_cache = True)
            new_caches.append(new_cache)
            x = x + out

        if received_cache:
            x = torch.cat((prev, x), dim = -2)

        return x, new_caches

def top_k_2(logits, frac_num_tokens=0.1, k=None):
    num_tokens = logits.shape[-1]
    k = default(k, ceil(frac_num_tokens * num_tokens))
    k = min(k, num_tokens)
    val, ind = torch.topk(logits, k)
    probs = torch.full_like(logits, float('-inf'))
    probs.scatter_(2, ind, val)
    return probs

def soft_argmax(labels):
    indices = torch.arange(labels.size(-1), dtype=labels.dtype, device=labels.device)
    soft_argmax = torch.sum(labels * indices, dim=-1)
    return soft_argmax

class PrimitiveTransformerDiscrete(Module, PyTorchModelHubMixin):
    @typecheck
    def __init__(
        self,
        *,
        num_discrete_scale = 128,
        continuous_range_scale: List[float] = [0, 1],
        dim_scale_embed = 64,
        num_discrete_rotation = 180,
        continuous_range_rotation: List[float] = [-180, 180],
        dim_rotation_embed = 64,
        num_discrete_translation = 128,
        continuous_range_translation: List[float] = [-1, 1],
        dim_translation_embed = 64,
        num_type = 3,
        dim_type_embed = 64,
        embed_order = 'ctrs',
        bin_smooth_blur_sigma = 0.4,
        dim: int | Tuple[int, int] = 512,
        flash_attn = True,
        attn_depth = 12,
        attn_dim_head = 64,
        attn_heads = 16,
        attn_kwargs: dict = dict(
            ff_glu = True,
            attn_num_mem_kv = 4
        ),
        max_primitive_len = 144,
        dropout = 0.,
        coarse_pre_gateloop_depth = 2,
        coarse_post_gateloop_depth = 0,
        coarse_adaptive_rmsnorm = False,
        gateloop_use_heinsen = False,
        pad_id = -1,
        num_sos_tokens = None,
        condition_on_shape = True,
        shape_cond_with_cross_attn = False,
        shape_cond_with_film = False,
        shape_cond_with_cat = False,
        shape_condition_model_type = 'michelangelo',
        shape_condition_len = 1,
        shape_condition_dim = None,
        cross_attn_num_mem_kv = 4,  # needed for preventing nan when dropping out shape condition
        loss_weight: dict = dict(
            eos = 1.0,
            type = 1.0,
            scale = 1.0,
            rotation = 1.0,
            translation = 1.0,
            reconstruction = 1.0,
            scale_huber = 1.0,
            rotation_huber = 1.0,
            translation_huber = 1.0,
        ),
        bs_pc_dir=None,
    ):
        super().__init__()

        # feature embedding

        self.num_discrete_scale = num_discrete_scale
        self.continuous_range_scale = continuous_range_scale
        self.discretize_scale = partial(discretize, num_discrete=num_discrete_scale, continuous_range=continuous_range_scale)
        self.undiscretize_scale = partial(undiscretize, num_discrete=num_discrete_scale, continuous_range=continuous_range_scale)
        self.scale_embed = nn.Embedding(num_discrete_scale, dim_scale_embed)

        self.num_discrete_rotation = num_discrete_rotation
        self.continuous_range_rotation = continuous_range_rotation
        self.discretize_rotation = partial(discretize, num_discrete=num_discrete_rotation, continuous_range=continuous_range_rotation)
        self.undiscretize_rotation = partial(undiscretize, num_discrete=num_discrete_rotation, continuous_range=continuous_range_rotation)
        self.rotation_embed = nn.Embedding(num_discrete_rotation, dim_rotation_embed)

        self.num_discrete_translation = num_discrete_translation
        self.continuous_range_translation = continuous_range_translation
        self.discretize_translation = partial(discretize, num_discrete=num_discrete_translation, continuous_range=continuous_range_translation)
        self.undiscretize_translation = partial(undiscretize, num_discrete=num_discrete_translation, continuous_range=continuous_range_translation)
        self.translation_embed = nn.Embedding(num_discrete_translation, dim_translation_embed)

        self.num_type = num_type
        self.type_embed = nn.Embedding(num_type, dim_type_embed)

        self.embed_order = embed_order
        self.bin_smooth_blur_sigma = bin_smooth_blur_sigma

        # initial dimension

        self.dim = dim
        init_dim = 3 * (dim_scale_embed + dim_rotation_embed + dim_translation_embed) + dim_type_embed

        # project into model dimension

        self.project_in = nn.Linear(init_dim, dim)

        num_sos_tokens = default(num_sos_tokens, 1 if not condition_on_shape or not shape_cond_with_film else 4)
        assert num_sos_tokens > 0

        self.num_sos_tokens = num_sos_tokens
        self.sos_token = nn.Parameter(torch.randn(num_sos_tokens, dim))

        # the transformer eos token

        self.eos_token = nn.Parameter(torch.randn(1, dim))

        self.emb_layernorm = nn.LayerNorm(dim)
        self.max_seq_len = max_primitive_len

        # shape condition

        self.condition_on_shape = condition_on_shape
        self.shape_cond_with_cross_attn = False
        self.shape_cond_with_cat = False
        self.shape_condition_model_type = ''
        self.conditioner = None
        dim_shape = None

        if condition_on_shape:
            assert shape_cond_with_cross_attn or shape_cond_with_film or shape_cond_with_cat
            self.shape_cond_with_cross_attn = shape_cond_with_cross_attn
            self.shape_cond_with_cat = shape_cond_with_cat
            self.shape_condition_model_type = shape_condition_model_type
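            # three (non-exclusive) ways of injecting the shape condition are supported:
            # cross attention over the conditioner tokens, FiLM modulation of the
            # primitive codes, and concatenating the condition tokens before the sos tokens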
            if 'michelangelo' in shape_condition_model_type:
                self.conditioner = ShapeConditioner_miche(dim_latent=shape_condition_dim)
                self.to_cond_dim = nn.Linear(self.conditioner.dim_model_out * 2, self.conditioner.dim_latent)
                self.to_cond_dim_head = nn.Linear(self.conditioner.dim_model_out, self.conditioner.dim_latent)
            else:
                raise ValueError(f'unknown shape_condition_model_type {self.shape_condition_model_type}')

            dim_shape = self.conditioner.dim_latent
            set_module_requires_grad_(self.conditioner, False)

        self.shape_coarse_film_cond = FiLM(dim_shape, dim) if shape_cond_with_film else identity

        self.coarse_gateloop_block = GateLoopBlock(dim, depth=coarse_pre_gateloop_depth, use_heinsen=gateloop_use_heinsen) if coarse_pre_gateloop_depth > 0 else None
        self.coarse_post_gateloop_block = GateLoopBlock(dim, depth=coarse_post_gateloop_depth, use_heinsen=gateloop_use_heinsen) if coarse_post_gateloop_depth > 0 else None

        self.coarse_adaptive_rmsnorm = coarse_adaptive_rmsnorm

        self.decoder = Decoder(
            dim=dim,
            depth=attn_depth,
            heads=attn_heads,
            attn_dim_head=attn_dim_head,
            attn_flash=flash_attn,
            attn_dropout=dropout,
            ff_dropout=dropout,
            use_adaptive_rmsnorm=coarse_adaptive_rmsnorm,
            dim_condition=dim_shape,
            cross_attend=self.shape_cond_with_cross_attn,
            cross_attn_dim_context=dim_shape,
            cross_attn_num_mem_kv=cross_attn_num_mem_kv,
            **attn_kwargs
        )

        # to logits

        self.to_eos_logits = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, 1)
        )

        self.to_type_logits = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, num_type)
        )

        self.to_translation_logits = nn.Sequential(
            nn.Linear(dim + dim_type_embed, dim),
            nn.ReLU(),
            nn.Linear(dim, 3 * num_discrete_translation)
        )

        self.to_rotation_logits = nn.Sequential(
            nn.Linear(dim + dim_type_embed + 3 * dim_translation_embed, dim),
            nn.ReLU(),
            nn.Linear(dim, 3 * num_discrete_rotation)
        )

        self.to_scale_logits = nn.Sequential(
            nn.Linear(dim + dim_type_embed + 3 * (dim_translation_embed + dim_rotation_embed), dim),
            nn.ReLU(),
            nn.Linear(dim, 3 * num_discrete_scale)
        )

        self.pad_id = pad_id

        # load the canonical point cloud for each basic shape type

        bs_pc_map = {}
        for bs_name, type_code in SHAPE_CODE.items():
            pc = trimesh.load(os.path.join(bs_pc_dir, f'SM_GR_BS_{bs_name}_001.ply'))
            bs_pc_map[type_code] = torch.from_numpy(np.asarray(pc.vertices)).float()

        bs_pc_list = []
        for i in range(len(bs_pc_map)):
            bs_pc_list.append(bs_pc_map[i])
        self.bs_pc = torch.stack(bs_pc_list, dim=0)

        self.rotation_matrix_align_coord = euler_angles_to_matrix(
            torch.Tensor([np.pi / 2, 0, 0]), 'XYZ').unsqueeze(0).unsqueeze(0)

    @property
    def device(self):
        return next(self.parameters()).device

    @typecheck
    @torch.no_grad()
    def embed_pc(self, pc: Tensor):
        if 'michelangelo' in self.shape_condition_model_type:
            pc_head, pc_embed = self.conditioner(shape=pc)
            pc_embed = torch.cat([self.to_cond_dim_head(pc_head), self.to_cond_dim(pc_embed)], dim=-2).detach()
        else:
            raise ValueError(f'unknown shape_condition_model_type {self.shape_condition_model_type}')
        return pc_embed

    @typecheck
    def recon_primitives(
        self,
        scale_logits: Float['b np 3 nd'],
        rotation_logits: Float['b np 3 nd'],
        translation_logits: Float['b np 3 nd'],
        type_logits: Int['b np nd'],
        primitive_mask: Bool['b np']
    ):
        recon_scale = self.undiscretize_scale(scale_logits.argmax(dim=-1))
        recon_scale = recon_scale.masked_fill(~primitive_mask.unsqueeze(-1), float('nan'))

        recon_rotation = self.undiscretize_rotation(rotation_logits.argmax(dim=-1))
        recon_rotation = recon_rotation.masked_fill(~primitive_mask.unsqueeze(-1), float('nan'))

        recon_translation = self.undiscretize_translation(translation_logits.argmax(dim=-1))
        recon_translation = recon_translation.masked_fill(~primitive_mask.unsqueeze(-1), float('nan'))

        recon_type_code = type_logits.argmax(dim=-1)
        recon_type_code = recon_type_code.masked_fill(~primitive_mask, -1)

        return {
            'scale': recon_scale,
            'rotation': recon_rotation,
            'translation': recon_translation,
            'type_code': recon_type_code
        }

    @typecheck
    def sample_primitives(
        self,
        scale: Float['b np 3 nd'],
        rotation: Float['b np 3 nd'],
        translation: Float['b np 3 nd'],
        type_code: Int['b np nd'],
        next_embed: Float['b 1 nd'],
        temperature: float = 1.,
        filter_logits_fn: Callable = top_k_2,
        filter_kwargs: dict = dict()
    ):
        def sample_func(logits):
            if logits.ndim == 4:
                enable_squeeze = True
                logits = logits.squeeze(1)
            else:
                enable_squeeze = False

            filtered_logits = filter_logits_fn(logits, **filter_kwargs)

            if temperature == 0.:
                sample = filtered_logits.argmax(dim=-1)
            else:
                probs = F.softmax(filtered_logits / temperature, dim=-1)
                sample = torch.zeros((probs.shape[0], probs.shape[1]), dtype=torch.long, device=probs.device)
                for b_i in range(probs.shape[0]):
                    sample[b_i] = torch.multinomial(probs[b_i], 1).squeeze()

            if enable_squeeze:
                sample = sample.unsqueeze(1)
            return sample

        # sample the attributes of the next primitive in order:
        # type -> translation -> rotation -> scale, feeding each sampled
        # attribute's embedding back in before predicting the next one

        next_type_logits = self.to_type_logits(next_embed)
        next_type_code = sample_func(next_type_logits)
        type_code_new, _ = pack([type_code, next_type_code], 'b *')

        type_embed = self.type_embed(next_type_code)
        next_embed_packed, _ = pack([next_embed, type_embed], 'b np *')

        next_translation_logits = rearrange(self.to_translation_logits(next_embed_packed), 'b np (c nd) -> b np c nd', nd=self.num_discrete_translation)
        next_discretize_translation = sample_func(next_translation_logits)
        next_translation = self.undiscretize_translation(next_discretize_translation)
        translation_new, _ = pack([translation, next_translation], 'b * nd')

        next_translation_embed = self.translation_embed(next_discretize_translation)
        next_embed_packed, _ = pack([next_embed_packed, next_translation_embed], 'b np *')

        next_rotation_logits = rearrange(self.to_rotation_logits(next_embed_packed), 'b np (c nd) -> b np c nd', nd=self.num_discrete_rotation)
        next_discretize_rotation = sample_func(next_rotation_logits)
        next_rotation = self.undiscretize_rotation(next_discretize_rotation)
        rotation_new, _ = pack([rotation, next_rotation], 'b * nd')

        next_rotation_embed = self.rotation_embed(next_discretize_rotation)
        next_embed_packed, _ = pack([next_embed_packed, next_rotation_embed], 'b np *')

        next_scale_logits = rearrange(self.to_scale_logits(next_embed_packed), 'b np (c nd) -> b np c nd', nd=self.num_discrete_scale)
        next_discretize_scale = sample_func(next_scale_logits)
        next_scale = self.undiscretize_scale(next_discretize_scale)
        scale_new, _ = pack([scale, next_scale], 'b * nd')

        return (
            scale_new,
            rotation_new,
            translation_new,
            type_code_new
        )

    @eval_decorator
    @torch.no_grad()
    @typecheck
    def generate(
        self,
        batch_size: int | None = None,
        filter_logits_fn: Callable = top_k_2,
        filter_kwargs: dict = dict(),
        temperature: float = 1.,
        scale: Float['b np 3'] | None = None,
        rotation: Float['b np 3'] | None = None,
        translation: Float['b np 3'] | None = None,
        type_code: Int['b np'] | None = None,
        pc: Tensor | None = None,
        pc_embed: Tensor | None = None,
        cache_kv = True,
        max_seq_len = None,
    ):
        max_seq_len = default(max_seq_len, self.max_seq_len)

        if exists(scale) and exists(rotation) and exists(translation) and exists(type_code):
            assert not exists(batch_size)
            assert scale.shape[1] == rotation.shape[1] == translation.shape[1] == type_code.shape[1]
            assert scale.shape[1] <= self.max_seq_len
            batch_size = scale.shape[0]

        if self.condition_on_shape:
            assert exists(pc) ^ exists(pc_embed), '`pc` or `pc_embed` must be passed in'
            if exists(pc):
                pc_embed = self.embed_pc(pc)
            batch_size = default(batch_size, pc_embed.shape[0])

        batch_size = default(batch_size, 1)

        scale = default(scale, torch.empty((batch_size, 0, 3), dtype=torch.float64, device=self.device))
        rotation = default(rotation, torch.empty((batch_size, 0, 3), dtype=torch.float64, device=self.device))
        translation = default(translation, torch.empty((batch_size, 0, 3), dtype=torch.float64, device=self.device))
        type_code = default(type_code, torch.empty((batch_size, 0), dtype=torch.int64, device=self.device))

        curr_length = scale.shape[1]

        cache = None
        eos_codes = None

        for i in tqdm(range(curr_length, max_seq_len)):
            can_eos = i != 0

            output = self.forward(
                scale=scale,
                rotation=rotation,
                translation=translation,
                type_code=type_code,
                pc_embed=pc_embed,
                return_loss=False,
                return_cache=cache_kv,
                append_eos=False,
                cache=cache
            )

            if cache_kv:
                next_embed, cache = output
            else:
                next_embed = output

            (
                scale,
                rotation,
                translation,
                type_code
            ) = self.sample_primitives(
                scale,
                rotation,
                translation,
                type_code,
                next_embed,
                temperature=temperature,
                filter_logits_fn=filter_logits_fn,
                filter_kwargs=filter_kwargs
            )

            next_eos_logits = self.to_eos_logits(next_embed).squeeze(-1)
            next_eos_code = (F.sigmoid(next_eos_logits) > 0.5)
            eos_codes = safe_cat([eos_codes, next_eos_code], 1)
            if can_eos and eos_codes.any(dim=-1).all():
                break

        # mask out to padding anything after the first eos

        mask = eos_codes.float().cumsum(dim=-1) >= 1
        # concat cur_length to mask
        mask = torch.cat((torch.zeros((batch_size, curr_length), dtype=torch.bool, device=self.device), mask), dim=-1)

        type_code = type_code.masked_fill(mask, self.pad_id)
        scale = scale.masked_fill(mask.unsqueeze(-1), self.pad_id)
        rotation = rotation.masked_fill(mask.unsqueeze(-1), self.pad_id)
        translation = translation.masked_fill(mask.unsqueeze(-1), self.pad_id)

        recon_primitives = {
            'scale': scale,
            'rotation': rotation,
            'translation': translation,
            'type_code': type_code
        }
        primitive_mask = ~eos_codes
        return recon_primitives, primitive_mask

    @eval_decorator
    @torch.no_grad()
    @typecheck
    def generate_w_recon_loss(
        self,
        batch_size: int | None = None,
        filter_logits_fn: Callable = top_k_2,
        filter_kwargs: dict = dict(),
        temperature: float = 1.,
        scale: Float['b np 3'] | None = None,
        rotation: Float['b np 3'] | None = None,
        translation: Float['b np 3'] | None = None,
        type_code: Int['b np'] | None = None,
        pc: Tensor | None = None,
        pc_embed: Tensor | None = None,
        cache_kv = True,
        max_seq_len = None,
        single_directional = True,
    ):
        max_seq_len = default(max_seq_len, self.max_seq_len)

        if exists(scale) and exists(rotation) and exists(translation) and exists(type_code):
            assert not exists(batch_size)
            assert scale.shape[1] == rotation.shape[1] == translation.shape[1] == type_code.shape[1]
            assert scale.shape[1] <= self.max_seq_len
            batch_size = scale.shape[0]

        if self.condition_on_shape:
            assert exists(pc) ^ exists(pc_embed), '`pc` or `pc_embed` must be passed in'
            if exists(pc):
                pc_embed = self.embed_pc(pc)
            batch_size = default(batch_size, pc_embed.shape[0])

        batch_size = default(batch_size, 1)
        assert batch_size == 1  # TODO: support any batch size

        scale = default(scale, torch.empty((batch_size, 0, 3), dtype=torch.float32, device=self.device))
        rotation = default(rotation, torch.empty((batch_size, 0, 3), dtype=torch.float32, device=self.device))
        translation = default(translation, torch.empty((batch_size, 0, 3), dtype=torch.float32, device=self.device))
        type_code = default(type_code, torch.empty((batch_size, 0), dtype=torch.int64, device=self.device))

        curr_length = scale.shape[1]

        cache = None
        eos_codes = None
        last_recon_loss = 1

        for i in tqdm(range(curr_length, max_seq_len)):
            can_eos = i != 0

            output = self.forward(
                scale=scale,
                rotation=rotation,
                translation=translation,
                type_code=type_code,
                pc_embed=pc_embed,
                return_loss=False,
                return_cache=cache_kv,
                append_eos=False,
                cache=cache
            )

            if cache_kv:
                next_embed, cache = output
            else:
                next_embed = output

            (
                scale_new,
                rotation_new,
                translation_new,
                type_code_new
            ) = self.sample_primitives(
                scale,
                rotation,
                translation,
                type_code,
                next_embed,
                temperature=temperature,
                filter_logits_fn=filter_logits_fn,
                filter_kwargs=filter_kwargs
            )

            next_eos_logits = self.to_eos_logits(next_embed).squeeze(-1)
            next_eos_code = (F.sigmoid(next_eos_logits) > 0.5)
            eos_codes = safe_cat([eos_codes, next_eos_code], 1)
            if can_eos and eos_codes.any(dim=-1).all():
                scale, rotation, translation, type_code = (
                    scale_new, rotation_new, translation_new, type_code_new)
                break

            # keep the newly sampled primitive only if it improves the chamfer distance;
            # otherwise resample a few times and fall back to the best candidate found

            recon_loss = self.compute_chamfer_distance(scale_new, rotation_new, translation_new, type_code_new, ~eos_codes, pc, single_directional)
            if recon_loss < last_recon_loss:
                last_recon_loss = recon_loss
                scale, rotation, translation, type_code = (
                    scale_new, rotation_new, translation_new, type_code_new)
            else:
                best_recon_loss = recon_loss
                best_primitives = dict(
                    scale=scale_new, rotation=rotation_new, translation=translation_new, type_code=type_code_new)
                success_flag = False
                print(f'last_recon_loss:{last_recon_loss}, recon_loss:{recon_loss} -> to find better primitive')
                for try_i in range(5):
                    (
                        scale_new,
                        rotation_new,
                        translation_new,
                        type_code_new
                    ) = self.sample_primitives(
                        scale,
                        rotation,
                        translation,
                        type_code,
                        next_embed,
                        temperature=1.0,
                        filter_logits_fn=filter_logits_fn,
                        filter_kwargs=filter_kwargs
                    )
                    recon_loss = self.compute_chamfer_distance(scale_new, rotation_new, translation_new, type_code_new, ~eos_codes, pc)
                    print(f'[try_{try_i}] last_recon_loss:{last_recon_loss}, best_recon_loss:{best_recon_loss}, cur_recon_loss:{recon_loss}')
                    if recon_loss < last_recon_loss:
                        last_recon_loss = recon_loss
                        scale, rotation, translation, type_code = (
                            scale_new, rotation_new, translation_new, type_code_new)
                        success_flag = True
                        break
                    else:
                        if recon_loss < best_recon_loss:
                            best_recon_loss = recon_loss
                            best_primitives = dict(
                                scale=scale_new, rotation=rotation_new, translation=translation_new, type_code=type_code_new)

                if not success_flag:
                    last_recon_loss = best_recon_loss
                    scale, rotation, translation, type_code = (
                        best_primitives['scale'], best_primitives['rotation'],
                        best_primitives['translation'], best_primitives['type_code'])
                    print(f'new_last_recon_loss:{last_recon_loss}')

        # mask out to padding anything after the first eos

        mask = eos_codes.float().cumsum(dim=-1) >= 1

        type_code = type_code.masked_fill(mask, self.pad_id)
        scale = scale.masked_fill(mask.unsqueeze(-1), self.pad_id)
        rotation = rotation.masked_fill(mask.unsqueeze(-1), self.pad_id)
        translation = translation.masked_fill(mask.unsqueeze(-1), self.pad_id)

        recon_primitives = {
            'scale': scale,
            'rotation': rotation,
            'translation': translation,
            'type_code': type_code
        }
        primitive_mask = ~eos_codes
        return recon_primitives, primitive_mask

    @typecheck
    def encode(
        self,
        *,
        scale: Float['b np 3'],
        rotation: Float['b np 3'],
        translation: Float['b np 3'],
        type_code: Int['b np'],
        primitive_mask: Bool['b np'],
        return_primitives = False
    ):
        """
        einops:
        b - batch
        np - number of primitives
        c - coordinates (3)
        d - embed dim
        """
        # compute feature embedding

        discretize_scale = self.discretize_scale(scale)
        scale_embed = self.scale_embed(discretize_scale)
        scale_embed = rearrange(scale_embed, 'b np c d -> b np (c d)')

        discretize_rotation = self.discretize_rotation(rotation)
        rotation_embed = self.rotation_embed(discretize_rotation)
        rotation_embed = rearrange(rotation_embed, 'b np c d -> b np (c d)')

        discretize_translation = self.discretize_translation(translation)
        translation_embed = self.translation_embed(discretize_translation)
        translation_embed = rearrange(translation_embed, 'b np c d -> b np (c d)')

        type_embed = self.type_embed(type_code.masked_fill(~primitive_mask, 0))

        # combine all features and project into model dimension

        if self.embed_order == 'srtc':
            primitive_embed, _ = pack([scale_embed, rotation_embed, translation_embed, type_embed], 'b np *')
        else:
            primitive_embed, _ = pack([type_embed, translation_embed, rotation_embed, scale_embed], 'b np *')

        primitive_embed = self.project_in(primitive_embed)
        primitive_embed = primitive_embed.masked_fill(~primitive_mask.unsqueeze(-1), 0.)

        if not return_primitives:
            return primitive_embed

        primitive_embed_unpacked = {
            'scale': scale_embed,
            'rotation': rotation_embed,
            'translation': translation_embed,
            'type_code': type_embed
        }
        primitives_gt = {
            'scale': discretize_scale,
            'rotation': discretize_rotation,
            'translation': discretize_translation,
            'type_code': type_code
        }
        return primitive_embed, primitive_embed_unpacked, primitives_gt

    @typecheck
    def compute_chamfer_distance(
        self,
        scale_pred: Float['b np 3'],
        rotation_pred: Float['b np 3'],
        translation_pred: Float['b np 3'],
        type_pred: Int['b np'],
        primitive_mask: Bool['b np'],
        pc: Tensor,  # b, num_points, c
        single_directional = True
    ):
        scale_pred = scale_pred.float()
        rotation_pred = rotation_pred.float()
        translation_pred = translation_pred.float()

        # transform the canonical point cloud of each predicted primitive type,
        # then compare against the conditioning point cloud

        pc_pred = apply_transformation(self.bs_pc.to(type_pred.device)[type_pred], scale_pred, torch.deg2rad(rotation_pred), translation_pred)
        pc_pred = torch.matmul(pc_pred, self.rotation_matrix_align_coord.to(type_pred.device))
        pc_pred_flat = rearrange(pc_pred, 'b np p c -> b (np p) c')
        pc_pred_sampled = random_sample_pc(pc_pred_flat, primitive_mask.sum(dim=-1, keepdim=True), n_points=self.bs_pc.shape[1])

        if single_directional:
            recon_loss, _ = chamfer_distance(pc[:, :, :3].float(), pc_pred_sampled.float(), single_directional=True)  # single directional
        else:
            recon_loss, _ = chamfer_distance(pc_pred_sampled.float(), pc[:, :, :3].float())
        return recon_loss

    def forward(
        self,
        *,
        scale: Float['b np 3'],
        rotation: Float['b np 3'],
        translation: Float['b np 3'],
        type_code: Int['b np'],
        loss_reduction: str = 'mean',
        return_cache = False,
        append_eos = True,
        cache: LayerIntermediates | None = None,
        pc: Tensor | None = None,
        pc_embed: Tensor | None = None,
        **kwargs
    ):
        primitive_mask = reduce(scale != self.pad_id, 'b np 3 -> b np', 'all')

        if scale.shape[1] > 0:
            codes, primitives_embeds, primitives_gt = self.encode(
                scale=scale,
                rotation=rotation,
                translation=translation,
                type_code=type_code,
                primitive_mask=primitive_mask,
                return_primitives=True
            )
        else:
            codes = torch.empty((scale.shape[0], 0, self.dim), dtype=torch.float32, device=self.device)

        # handle shape conditions

        attn_context_kwargs = dict()
        if self.condition_on_shape:
            assert exists(pc) ^ exists(pc_embed), '`pc` or `pc_embed` must be passed in'

            if exists(pc):
                if 'michelangelo' in self.shape_condition_model_type:
                    pc_head, pc_embed = self.conditioner(shape=pc)
                    pc_embed = torch.cat([self.to_cond_dim_head(pc_head), self.to_cond_dim(pc_embed)], dim=-2)
                else:
                    raise ValueError(f'unknown shape_condition_model_type {self.shape_condition_model_type}')

            assert pc_embed.shape[0] == codes.shape[0], 'batch size of point cloud is not equal to the batch size of the primitive codes'

            pooled_pc_embed = pc_embed.mean(dim=1)  # (b, shape_condition_dim)

            if self.shape_cond_with_cross_attn:
                attn_context_kwargs = dict(
                    context=pc_embed
                )

            if self.coarse_adaptive_rmsnorm:
                attn_context_kwargs.update(
                    condition=pooled_pc_embed
                )

        batch, seq_len, _ = codes.shape  # (b, np, dim)
        device = codes.device

        assert seq_len <= self.max_seq_len, f'received codes of length {seq_len} but needs to be less than or equal to set max_seq_len {self.max_seq_len}'

        if append_eos:
            assert exists(codes)
            code_lens = primitive_mask.sum(dim=-1)
            codes = pad_tensor(codes)

            batch_arange = torch.arange(batch, device=device)
            batch_arange = rearrange(batch_arange, '... -> ... 1')
            code_lens = rearrange(code_lens, '... -> ... 1')
            codes[batch_arange, code_lens] = self.eos_token  # (b, np+1, dim)

        primitive_codes = codes  # (b, np, dim)
        primitive_codes_len = primitive_codes.shape[-2]

        (
            coarse_cache,
            coarse_gateloop_cache,
            coarse_post_gateloop_cache,
        ) = cache if exists(cache) else ((None,) * 3)

        if not exists(cache):
            sos = repeat(self.sos_token, 'n d -> b n d', b=batch)
            if self.shape_cond_with_cat:
                sos, _ = pack([pc_embed, sos], 'b * d')
            primitive_codes, packed_sos_shape = pack([sos, primitive_codes], 'b * d')  # (b, n_sos+np, dim)

        # condition primitive codes with shape if needed

        if self.condition_on_shape:
            primitive_codes = self.shape_coarse_film_cond(primitive_codes, pooled_pc_embed)

        # attention on primitive codes (coarse)

        if exists(self.coarse_gateloop_block):
            primitive_codes, coarse_gateloop_cache = self.coarse_gateloop_block(primitive_codes, cache=coarse_gateloop_cache)

        attended_primitive_codes, coarse_cache = self.decoder(  # (b, n_sos+np, dim)
            primitive_codes,
            cache=coarse_cache,
            return_hiddens=True,
            **attn_context_kwargs
        )

        if exists(self.coarse_post_gateloop_block):
            primitive_codes, coarse_post_gateloop_cache = self.coarse_post_gateloop_block(primitive_codes, cache=coarse_post_gateloop_cache)

        embed = attended_primitive_codes[:, -(primitive_codes_len + 1):]  # (b, np+1, dim)

        if not return_cache:
            return embed[:, -1:]

        next_cache = (
            coarse_cache,
            coarse_gateloop_cache,
            coarse_post_gateloop_cache
        )
        return embed[:, -1:], next_cache

def pad_tensor(tensor):
    if tensor.dim() == 3:
        bs, seq_len, dim = tensor.shape
        padding = torch.zeros((bs, 1, dim), dtype=tensor.dtype, device=tensor.device)
    elif tensor.dim() == 2:
        bs, seq_len = tensor.shape
        padding = torch.zeros((bs, 1), dtype=tensor.dtype, device=tensor.device)
    else:
        raise ValueError('Unsupported tensor shape: {}'.format(tensor.shape))
    return torch.cat([tensor, padding], dim=1)

def apply_transformation(pc, scale, rotation_vector, translation):
    # pc: (b, num_prims, num_points, 3) canonical point clouds, one per primitive
    bs, num_prims, num_points, _ = pc.shape
    scaled_pc = pc * scale.unsqueeze(2)
    rotation_matrix = euler_angles_to_matrix(rotation_vector.view(-1, 3), 'XYZ').view(bs, num_prims, 3, 3)  # euler tmp
    rotated_pc = torch.einsum('bnij,bnpj->bnpi', rotation_matrix, scaled_pc)
    transformed_pc = rotated_pc + translation.unsqueeze(2)
    return transformed_pc

def random_sample_pc(pc, max_lens, n_points=10000):
    bs = max_lens.shape[0]
    max_len = max_lens.max().item() * n_points
    random_values = torch.rand(bs, max_len, device=max_lens.device)
    mask = torch.arange(max_len).expand(bs, max_len).to(max_lens.device) < (max_lens * n_points)
    masked_random_values = random_values * mask.float()
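    # topk over the masked random values selects `n_points` indices uniformly at
    # random (without replacement) from the valid, unmasked region of each
    # flattened predicted point cloud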
    _, indices = torch.topk(masked_random_values, n_points, dim=1)
    return pc[torch.arange(bs).unsqueeze(1), indices]
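
# Example usage (illustrative sketch only; the directory, point-cloud shape and
# settings below are placeholder assumptions, not values prescribed by this module):
#
#   model = PrimitiveTransformerDiscrete(
#       shape_cond_with_cross_attn = True,
#       bs_pc_dir = 'path/to/basic_shape_point_clouds',   # folder holding the SM_GR_BS_*.ply files
#   )
#
#   pc = torch.randn(1, 8192, 3)   # conditioning point cloud, (b, num_points, c); channel count depends on the conditioner
#   recon_primitives, primitive_mask = model.generate(pc = pc)
#   # recon_primitives is a dict with 'scale', 'rotation', 'translation', 'type_code'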