# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mostly copy-paste from the timm library.
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
from copy import deepcopy
import math
from functools import partial

from sympy import flatten
import torch
import torch.nn as nn
from torch import Tensor, pixel_shuffle
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch.nn.modules import GELU

# from vit.vision_transformer import Conv3DCrossAttentionBlock
from .utils import trunc_normal_

from pdb import set_trace as st

# import apex
try:
    from apex.normalization import FusedRMSNorm as RMSNorm
except ImportError:
    from dit.norm import RMSNorm
# from apex.normalization import FusedLayerNorm as LayerNorm

try:
    from xformers.ops import memory_efficient_attention, unbind, fmha
    from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
    # from xformers.ops import RMSNorm
    XFORMERS_AVAILABLE = True
except ImportError:
    # logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False
class Attention(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0., | |
proj_drop=0., | |
enable_rmsnorm=False, | |
qk_norm=False,): | |
super().__init__() | |
self.num_heads = num_heads | |
head_dim = dim // num_heads | |
self.scale = qk_scale or head_dim**-0.5 | |
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |
self.attn_drop = nn.Dropout(attn_drop) | |
self.proj = nn.Linear(dim, dim) | |
self.proj_drop = nn.Dropout(proj_drop) | |
# https://github.com/huggingface/pytorch-image-models/blob/5dce71010174ad6599653da4e8ba37fd5f9fa572/timm/models/vision_transformer.py#L79C1-L80C78 | |
self.q_norm = RMSNorm(head_dim, elementwise_affine=True) if qk_norm else nn.Identity() # sd-3 | |
self.k_norm = RMSNorm(head_dim, elementwise_affine=True) if qk_norm else nn.Identity() | |
# if qk_norm: | |
# self.q_norm = LayerNorm(dim, eps=1e-5) | |
# self.k_norm = LayerNorm(dim, eps=1e-5) | |
self.qk_norm = qk_norm | |
def forward(self, x): | |
B, N, C = x.shape | |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, | |
C // self.num_heads).permute(2, 0, 3, 1, 4) | |
        q, k, v = qkv[0], qkv[1], qkv[2]
        q, k = self.q_norm(q), self.k_norm(k)  # identity unless qk_norm=True

        attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1) | |
attn = self.attn_drop(attn) | |
x = (attn @ v).transpose(1, 2).reshape(B, N, C) | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
# return x, attn | |
return x | |
class MemEffAttention(Attention): | |
def forward(self, x: Tensor, attn_bias=None) -> Tensor: | |
if not XFORMERS_AVAILABLE: | |
assert attn_bias is None, "xFormers is required for nested tensors usage" | |
return super().forward(x) | |
B, N, C = x.shape | |
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) | |
q, k, v = unbind(qkv, 2) | |
q, k = self.q_norm(q), self.k_norm(k) | |
x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) # if not bf16, no flash-attn here. | |
# x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp) # force flash attention | |
x = x.reshape([B, N, C]) | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
return x | |
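# Hedged usage sketch (added note, not in the original file): both Attention and
# MemEffAttention keep the token shape unchanged; the dim/head values below are
# hypothetical placeholders.
#
#   attn = MemEffAttention(dim=768, num_heads=12, qkv_bias=True)
#   tokens = torch.randn(2, 196, 768)   # (B, N, C)
#   out = attn(tokens)                  # -> (2, 196, 768); falls back to the
#                                       #    vanilla Attention path without xFormers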
class MemEffCrossAttention(MemEffAttention): | |
# for cross attention, where context serves as k and v | |
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0, proj_drop=0): | |
super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop) | |
del self.qkv | |
self.q = nn.Linear(dim, dim * 1, bias=qkv_bias) | |
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) | |
    def forward(self, x: Tensor, context: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        M = context.shape[1]  # number of context tokens providing k/v

        # queries come from x; keys/values come from the context sequence
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads)
        kv = self.kv(context).reshape(B, M, 2, self.num_heads, C // self.num_heads)
        k, v = unbind(kv, 2)

        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp)

        x = x.reshape([B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
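# Hedged shape note (added, not in the original): in MemEffCrossAttention the
# queries come from `x` (B, N, C) while keys/values come from `context` (B, M, C);
# the output keeps the query shape (B, N, C), and M and N may differ.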
# https://github.com/IBM/CrossViT/blob/main/models/crossvit.py | |
class CrossAttention(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0., | |
proj_drop=0.): | |
super().__init__() | |
self.num_heads = num_heads | |
head_dim = dim // num_heads | |
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights | |
self.scale = qk_scale or head_dim**-0.5 | |
self.wq = nn.Linear(dim, dim, bias=qkv_bias) | |
self.wk = nn.Linear(dim, dim, bias=qkv_bias) | |
self.wv = nn.Linear(dim, dim, bias=qkv_bias) | |
self.attn_drop = nn.Dropout(attn_drop) | |
self.proj = nn.Linear(dim, dim) | |
self.proj_drop = nn.Dropout(proj_drop) | |
def forward(self, x): | |
B, N, C = x.shape | |
q = self.wq(x[:, | |
0:1, ...]).reshape(B, 1, self.num_heads, | |
C // self.num_heads).permute( | |
0, 2, 1, | |
3) # B1C -> B1H(C/H) -> BH1(C/H) | |
k = self.wk(x).reshape(B, N, | |
self.num_heads, C // self.num_heads).permute( | |
0, 2, 1, 3) # BNC -> BNH(C/H) -> BHN(C/H) | |
v = self.wv(x).reshape(B, N, | |
self.num_heads, C // self.num_heads).permute( | |
0, 2, 1, 3) # BNC -> BNH(C/H) -> BHN(C/H) | |
attn = (q @ k.transpose( | |
-2, -1)) * self.scale # BH1(C/H) @ BH(C/H)N -> BH1N | |
attn = attn.softmax(dim=-1) | |
attn = self.attn_drop(attn) | |
x = (attn @ v).transpose(1, 2).reshape( | |
B, 1, C) # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
return x | |
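# Hedged usage sketch (added, not in the original): CrossAttention (from CrossViT)
# lets only the first ([cls]) token attend to every token, so the output has a
# single query position. Sizes below are hypothetical.
#
#   ca = CrossAttention(dim=384, num_heads=6)
#   tokens = torch.randn(2, 197, 384)   # (B, 1 + H*W, C), [cls] token first
#   fused_cls = ca(tokens)              # -> (2, 1, 384)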
class Conv3D_Aware_CrossAttention(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0., | |
proj_drop=0.): | |
super().__init__() | |
self.num_heads = num_heads | |
head_dim = dim // num_heads | |
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights | |
self.scale = qk_scale or head_dim**-0.5 | |
self.wq = nn.Linear(dim, dim, bias=qkv_bias) | |
self.wk = nn.Linear(dim, dim, bias=qkv_bias) | |
self.wv = nn.Linear(dim, dim, bias=qkv_bias) | |
self.attn_drop = nn.Dropout(attn_drop) | |
self.proj = nn.Linear(dim, dim) | |
self.proj_drop = nn.Dropout(proj_drop) | |
def forward(self, x): | |
B, group_size, N, C = x.shape # B 3 N C | |
p = int(N**0.5) # patch size | |
assert p**2 == N, 'check input dim, no [cls] needed here' | |
assert group_size == 3, 'designed for triplane here' | |
x = x.reshape(B, group_size, p, p, C) # expand patch token dim | |
# * init qkv | |
# q = torch.empty(B * group_size * N, | |
# 1, | |
# self.num_heads, | |
# C // self.num_heads, | |
# device=x.device).permute(0, 2, 1, 3) | |
# k = torch.empty(B * group_size * N, | |
# 2 * p, | |
# self.num_heads, | |
# C // self.num_heads, | |
# device=x.device).permute(0, 2, 1, 3) | |
# v = torch.empty_like(k) | |
q_x = torch.empty( | |
B * group_size * N, | |
1, | |
# self.num_heads, | |
# C // self.num_heads, | |
C, | |
device=x.device) | |
k_x = torch.empty( | |
B * group_size * N, | |
2 * p, | |
# self.num_heads, | |
# C // self.num_heads, | |
C, | |
device=x.device) | |
v_x = torch.empty_like(k_x) | |
# ! refer to the following plane order | |
# N, M, _ = coordinates.shape | |
# xy_coords = coordinates[..., [0, 1]] | |
# yz_coords = coordinates[..., [1, 2]] | |
# zx_coords = coordinates[..., [2, 0]] | |
# return torch.stack([xy_coords, yz_coords, zx_coords], | |
# dim=1).reshape(N * 3, M, 2) | |
index_i, index_j = torch.meshgrid(torch.arange(0, p), | |
torch.arange(0, p), | |
indexing='ij') # 16*16 | |
index_mesh_grid = torch.stack([index_i, index_j], 0).to( | |
x.device).unsqueeze(0).repeat_interleave(B, | |
0).reshape(B, 2, p, | |
p) # B 2 p p. | |
for i in range(group_size): | |
q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute( | |
0, 2, 3, 1, 4).reshape(B * N, 1, C) # B 1 p p C -> B*N, 1, C | |
# TODO, how to batchify gather ops? | |
plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size + | |
1] # B 1 p p C | |
plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1] | |
assert plane_yz.shape == plane_zx.shape == ( | |
B, 1, p, p, C), 'check sub plane dimensions' | |
pooling_plane_yz = torch.gather( | |
plane_yz, | |
dim=2, | |
index=index_mesh_grid[:, 0:1].reshape(B, 1, N, 1, 1).expand( | |
-1, -1, -1, p, | |
C)).permute(0, 2, 1, 3, 4) # B 1 256 16 C => B 256 1 16 C | |
pooling_plane_zx = torch.gather( | |
plane_zx, | |
dim=3, | |
index=index_mesh_grid[:, 1:2].reshape(B, 1, 1, N, 1).expand( | |
-1, -1, p, -1, | |
C)).permute(0, 3, 1, 2, 4) # B 1 16 256 C => B 256 1 16 C | |
k_x[B * i * N:B * (i + 1) * | |
N] = v_x[B * i * N:B * (i + 1) * N] = torch.cat( | |
[pooling_plane_yz, pooling_plane_zx], | |
dim=2).reshape(B * N, 2 * p, | |
C) # B 256 2 16 C => (B*256) 2*16 C | |
# q[B * i * N: B * (i+1) * N] = self.wq(q_x).reshape(B*N, 1, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3) | |
# k[B * i * N: B * (i+1) * N] = self.wk(k_x).reshape(B*N, 2*p, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3) | |
# v[B * i * N: B * (i+1) * N] = self.wv(v_x).reshape(B*N, 2*p, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3) | |
q = self.wq(q_x).reshape(B * group_size * N, 1, | |
self.num_heads, C // self.num_heads).permute( | |
0, 2, 1, | |
3) # merge num_heads into Batch dimention | |
k = self.wk(k_x).reshape(B * group_size * N, 2 * p, self.num_heads, | |
C // self.num_heads).permute(0, 2, 1, 3) | |
v = self.wv(v_x).reshape(B * group_size * N, 2 * p, self.num_heads, | |
C // self.num_heads).permute(0, 2, 1, 3) | |
attn = (q @ k.transpose( | |
-2, -1)) * self.scale # BH1(C/H) @ BH(C/H)N -> BH1N, N=2p here | |
attn = attn.softmax(dim=-1) | |
attn = self.attn_drop(attn) | |
x = (attn @ v).transpose(1, 2).reshape( | |
B * 3 * N, 1, | |
C) # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
# reshape x back | |
x = x.reshape(B, 3, N, C) | |
return x | |
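# Hedged shape walkthrough (added, not in the original): for a triplane input
# (B, 3, N, C) with N = p*p tokens per plane, every token of one plane queries the
# row gathered from the next plane plus the column gathered from the previous
# plane, i.e. 2*p keys/values per query. Example with hypothetical sizes:
#
#   attn3d = Conv3D_Aware_CrossAttention(dim=384, num_heads=6)
#   planes = torch.randn(2, 3, 16 * 16, 384)   # p = 16, so 32 keys per query
#   out = attn3d(planes)                       # -> (2, 3, 256, 384)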
class xformer_Conv3D_Aware_CrossAttention(nn.Module): | |
# https://github.dev/facebookresearch/dinov2 | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0., | |
proj_drop=0.): | |
super().__init__() | |
# https://pytorch.org/blog/accelerated-generative-diffusion-models/ | |
self.num_heads = num_heads | |
self.wq = nn.Linear(dim, dim * 1, bias=qkv_bias) | |
self.w_kv = nn.Linear(dim, dim * 2, bias=qkv_bias) | |
self.attn_drop = nn.Dropout(attn_drop) | |
self.proj = nn.Linear(dim, dim) | |
self.proj_drop = nn.Dropout(proj_drop) | |
self.index_mesh_grid = None | |
def forward(self, x, attn_bias=None): | |
B, group_size, N, C = x.shape # B 3 N C | |
p = int(N**0.5) # patch size | |
assert p**2 == N, 'check input dim, no [cls] needed here' | |
assert group_size == 3, 'designed for triplane here' | |
x = x.reshape(B, group_size, p, p, C) # expand patch token dim | |
q_x = torch.empty(B * group_size * N, 1, C, device=x.device) | |
context = torch.empty(B * group_size * N, 2 * p, C, | |
device=x.device) # k_x=v_x | |
if self.index_mesh_grid is None: # further accelerate | |
index_i, index_j = torch.meshgrid(torch.arange(0, p), | |
torch.arange(0, p), | |
indexing='ij') # 16*16 | |
index_mesh_grid = torch.stack([index_i, index_j], 0).to( | |
x.device).unsqueeze(0).repeat_interleave(B, 0).reshape( | |
B, 2, p, p) # B 2 p p. | |
self.index_mesh_grid = index_mesh_grid[0:1] | |
else: | |
index_mesh_grid = self.index_mesh_grid.clone().repeat_interleave( | |
B, 0) | |
assert index_mesh_grid.shape == ( | |
B, 2, p, p), 'check index_mesh_grid dimension' | |
for i in range(group_size): | |
q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute( | |
0, 2, 3, 1, 4).reshape(B * N, 1, C) # B 1 p p C -> B*N, 1, C | |
# TODO, how to batchify gather ops? | |
plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size + | |
1] # B 1 p p C | |
plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1] | |
assert plane_yz.shape == plane_zx.shape == ( | |
B, 1, p, p, C), 'check sub plane dimensions' | |
pooling_plane_yz = torch.gather( | |
plane_yz, | |
dim=2, | |
index=index_mesh_grid[:, 0:1].reshape(B, 1, N, 1, 1).expand( | |
-1, -1, -1, p, | |
C)).permute(0, 2, 1, 3, 4) # B 1 256 16 C => B 256 1 16 C | |
pooling_plane_zx = torch.gather( | |
plane_zx, | |
dim=3, | |
index=index_mesh_grid[:, 1:2].reshape(B, 1, 1, N, 1).expand( | |
-1, -1, p, -1, | |
C)).permute(0, 3, 1, 2, 4) # B 1 16 256 C => B 256 1 16 C | |
context[B * i * N:B * (i + 1) * N] = torch.cat( | |
[pooling_plane_yz, pooling_plane_zx], | |
dim=2).reshape(B * N, 2 * p, | |
C) # B 256 2 16 C => (B*256) 2*16 C | |
# B, N, C = x.shape | |
q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads, | |
C // self.num_heads) | |
kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2, | |
self.num_heads, C // self.num_heads) | |
k, v = unbind(kv, 2) | |
x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) | |
# x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp) | |
x = x.transpose(1, 2).reshape([B * 3 * N, 1, C]).reshape(B, 3, N, C) | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
return x | |
class xformer_Conv3D_Aware_CrossAttention_xygrid( | |
xformer_Conv3D_Aware_CrossAttention): | |
"""implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention | |
""" | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0.0, | |
proj_drop=0.0): | |
super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop, | |
proj_drop) | |
def forward(self, x, attn_bias=None): | |
B, group_size, N, C = x.shape # B 3 N C | |
p = int(N**0.5) # patch size | |
assert p**2 == N, 'check input dim, no [cls] needed here' | |
assert group_size == 3, 'designed for triplane here' | |
x = x.reshape(B, group_size, p, p, C) # expand patch token dim | |
q_x = torch.empty(B * group_size * N, 1, C, device=x.device) | |
context = torch.empty(B * group_size * N, 2 * p, C, | |
device=x.device) # k_x=v_x | |
if self.index_mesh_grid is None: # further accelerate | |
index_u, index_v = torch.meshgrid( | |
torch.arange(0, p), torch.arange(0, p), | |
indexing='xy') # ! switch to 'xy' here to match uv coordinate | |
index_mesh_grid = torch.stack([index_u, index_v], 0).to( | |
x.device).unsqueeze(0).repeat_interleave(B, 0).reshape( | |
B, 2, p, p) # B 2 p p. | |
self.index_mesh_grid = index_mesh_grid[0:1] | |
else: | |
index_mesh_grid = self.index_mesh_grid.clone().repeat_interleave( | |
B, 0) | |
assert index_mesh_grid.shape == ( | |
B, 2, p, p), 'check index_mesh_grid dimension' | |
for i in range(group_size): | |
q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute( | |
0, 2, 3, 1, 4).reshape(B * N, 1, C) # B 1 p p C -> B*N, 1, C | |
# TODO, how to batchify gather ops? | |
plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size + | |
1] # B 1 p p C | |
plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1] | |
assert plane_yz.shape == plane_zx.shape == ( | |
B, 1, p, p, C), 'check sub plane dimensions' | |
pooling_plane_yz = torch.gather( | |
plane_yz, | |
dim=2, | |
index=index_mesh_grid[:, 1:2].reshape(B, 1, N, 1, 1).expand( | |
-1, -1, -1, p, | |
C)).permute(0, 2, 1, 3, 4) # B 1 256 16 C => B 256 1 16 C | |
pooling_plane_zx = torch.gather( | |
plane_zx, | |
dim=3, | |
index=index_mesh_grid[:, 0:1].reshape(B, 1, 1, N, 1).expand( | |
-1, -1, p, -1, | |
C)).permute(0, 3, 1, 2, 4) # B 1 16 256 C => B 256 1 16 C | |
context[B * i * N:B * (i + 1) * N] = torch.cat( | |
[pooling_plane_yz, pooling_plane_zx], | |
dim=2).reshape(B * N, 2 * p, | |
C) # B 256 2 16 C => (B*256) 2*16 C | |
# B, N, C = x.shape | |
q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads, | |
C // self.num_heads) | |
kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2, | |
self.num_heads, C // self.num_heads) | |
k, v = unbind(kv, 2) | |
x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) | |
# x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp) | |
x = x.transpose(1, 2).reshape([B * 3 * N, 1, C]).reshape(B, 3, N, C) | |
x = self.proj(x) | |
x = self.proj_drop(x) | |
return x | |
class xformer_Conv3D_Aware_CrossAttention_xygrid_withinC( | |
xformer_Conv3D_Aware_CrossAttention_xygrid): | |
def __init__(self, | |
dim, | |
num_heads=8, | |
qkv_bias=False, | |
qk_scale=None, | |
attn_drop=0, | |
proj_drop=0): | |
super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop, | |
proj_drop) | |
def forward(self, x, attn_bias=None): | |
# ! split x: B N C into B 3 N C//3 | |
B, N, C = x.shape | |
x = x.reshape(B, N, C // 3, 3).permute(0, 3, 1, | |
2) # B N C 3 -> B 3 N C | |
x_out = super().forward(x, attn_bias) # B 3 N C | |
x_out = x_out.permute(0, 2, 3, 1)# B 3 N C -> B N C 3 | |
x_out = x_out.reshape(*x_out.shape[:2], -1) # B N C 3 -> B N C3 | |
return x_out.contiguous() | |
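# Hedged usage sketch (added, not in the original; requires xFormers): the
# `_withinC` variant keeps tokens in a single (B, N, C) sequence and splits the
# channel dimension into the three planes internally, so `dim` here is assumed to
# be the per-plane channel count and the input carries 3 * dim channels.
#
#   attn = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(dim=384, num_heads=8)
#   x = torch.randn(2, 256, 3 * 384)   # (B, N, 3 * dim), hypothetical sizes
#   y = attn(x)                        # -> (2, 256, 3 * 384)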
class self_cross_attn(nn.Module): | |
def __init__(self, dino_attn, cross_attn, *args, **kwargs) -> None: | |
super().__init__(*args, **kwargs) | |
self.dino_attn = dino_attn | |
self.cross_attn = cross_attn | |
def forward(self, x_norm): | |
y = self.dino_attn(x_norm) + x_norm | |
return self.cross_attn(y) # will add x in the original code | |
# class RodinRollOutConv(nn.Module): | |
# """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention | |
# Use Group Conv | |
# """ | |
# def __init__(self, in_chans, out_chans=None): | |
# super().__init__() | |
# # input: B 3C H W | |
# if out_chans is None: | |
# out_chans = in_chans | |
# self.roll_out_convs = nn.Conv2d(in_chans, | |
# out_chans, | |
# kernel_size=3, | |
# groups=3, | |
# padding=1) | |
# def forward(self, x): | |
# return self.roll_out_convs(x) | |
class RodinRollOutConv3D(nn.Module):
    """Rodin-style roll-out 3D-aware conv: each plane is concatenated along the
    width axis with axis-pooled features from the other two planes, and a shared
    2D conv mixes them on the rolled-out (B, 3C, H, 3W) map.
    """
def __init__(self, in_chans, out_chans=None): | |
super().__init__() | |
if out_chans is None: | |
out_chans = in_chans | |
self.out_chans = out_chans // 3 | |
self.roll_out_convs = nn.Conv2d(in_chans, | |
self.out_chans, | |
kernel_size=3, | |
padding=1) | |
def forward(self, x): | |
# todo, reshape before input? | |
B, C3, p, p = x.shape # B 3C H W | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3 | |
x = x.reshape(B, 3, C, p, p) | |
roll_out_x = torch.empty(B, group_size * C, p, 3 * p, | |
device=x.device) # B, 3C, H, 3W | |
for i in range(group_size): | |
plane_xy = x[:, i] # B C H W | |
# TODO, simply do the average pooling? | |
plane_yz_pooling = x[:, (i + 1) % group_size].mean( | |
dim=-1, keepdim=True).repeat_interleave( | |
p, dim=-1) # B C H W -> B C H 1 -> B C H W, reduce z dim | |
plane_zx_pooling = x[:, (i + 2) % group_size].mean( | |
dim=-2, keepdim=True).repeat_interleave( | |
p, dim=-2) # B C H W -> B C 1 W -> B C H W, reduce z dim | |
roll_out_x[..., i * p:(i + 1) * p] = torch.cat( | |
[plane_xy, plane_yz_pooling, plane_zx_pooling], | |
1) # fill in the 3W dim | |
x = self.roll_out_convs(roll_out_x) # B C H 3W | |
x = x.reshape(B, self.out_chans, p, 3, p) | |
x = x.permute(0, 3, 1, 2, 4).reshape(B, 3 * self.out_chans, p, | |
p) # B 3C H W | |
return x | |
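# Hedged note (added, not in the original): RodinRollOutConv3D lays the three planes
# side by side along the width (B, 3C, H, 3W); each width slot holds one plane
# stacked channel-wise with the axis-pooled other two planes, and the shared Conv2d
# output is folded back into the (B, out_chans, H, W) triplane layout.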
class RodinRollOutConv3D_GroupConv(nn.Module):
    """Group-conv variant of the Rodin roll-out conv: each plane is stacked along
    the channel axis with axis-pooled features from the other two planes, and a
    grouped 2D conv (groups=3) processes the three stacks independently.
    """
def __init__(self, | |
in_chans, | |
out_chans=None, | |
kernel_size=3, | |
stride=1, | |
padding=1): | |
super().__init__() | |
if out_chans is None: | |
out_chans = in_chans | |
self.roll_out_convs = nn.Conv2d( | |
in_chans * 3, | |
out_chans, | |
kernel_size=kernel_size, | |
groups=3, # B 9C H W | |
stride=stride, | |
padding=padding) | |
# @torch.autocast(device_type='cuda') | |
def forward(self, x): | |
# todo, reshape before input? | |
B, C3, p, p = x.shape # B 3C H W | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3 | |
x = x.reshape(B, 3, C, p, p) | |
roll_out_x = torch.empty(B, group_size * C * 3, p, p, | |
device=x.device) # B, 3C, H, 3W | |
for i in range(group_size): | |
plane_xy = x[:, i] # B C H W | |
# # TODO, simply do the average pooling? | |
plane_yz_pooling = x[:, (i + 1) % group_size].mean( | |
dim=-1, keepdim=True).repeat_interleave( | |
p, dim=-1) # B C H W -> B C H 1 -> B C H W, reduce z dim | |
plane_zx_pooling = x[:, (i + 2) % group_size].mean( | |
dim=-2, keepdim=True).repeat_interleave( | |
p, dim=-2) # B C H W -> B C 1 W -> B C H W, reduce z dim | |
roll_out_x[:, i * 3 * C:(i + 1) * 3 * C] = torch.cat( | |
[plane_xy, plane_yz_pooling, plane_zx_pooling], | |
1) # fill in the 3W dim | |
# ! directly cat, avoid intermediate vars | |
# ? why OOM | |
# roll_out_x[:, i * 3 * C:(i + 1) * 3 * C] = torch.cat( | |
# [ | |
# x[:, i], | |
# x[:, (i + 1) % group_size].mean( | |
# dim=-1, keepdim=True).repeat_interleave(p, dim=-1), | |
# x[:, (i + 2) % group_size].mean( | |
# dim=-2, keepdim=True).repeat_interleave( | |
# p, dim=-2 | |
# ) # B C H W -> B C 1 W -> B C H W, reduce z dim | |
# ], | |
# 1) # fill in the 3C dim | |
x = self.roll_out_convs(roll_out_x) # B 3C H W | |
return x | |
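# Hedged usage sketch (added, not in the original): the group-conv variant expands
# the channels to 3 * (3C) by stacking, per plane, [plane, pooled-next, pooled-prev],
# and the grouped Conv2d (groups=3) then maps each plane's stack to its output slice.
#
#   conv3d = RodinRollOutConv3D_GroupConv(in_chans=3 * 32, out_chans=3 * 32)
#   feats = torch.randn(2, 3 * 32, 64, 64)   # (B, 3C, H, W), hypothetical sizes
#   out = conv3d(feats)                      # -> (2, 3 * 32, 64, 64)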
class RodinRollOut_GroupConv_noConv3D(nn.Module): | |
"""only roll out and do Conv on individual planes | |
""" | |
def __init__(self, | |
in_chans, | |
out_chans=None, | |
kernel_size=3, | |
stride=1, | |
padding=1): | |
super().__init__() | |
if out_chans is None: | |
out_chans = in_chans | |
self.roll_out_inplane_conv = nn.Conv2d( | |
in_chans, | |
out_chans, | |
kernel_size=kernel_size, | |
groups=3, # B 3C H W | |
stride=stride, | |
padding=padding) | |
def forward(self, x): | |
x = self.roll_out_inplane_conv(x) # B 3C H W | |
return x | |
# class RodinConv3D_SynthesisLayer_withact(nn.Module): | |
# def __init__(self, in_chans, out_chans) -> None: | |
# super().__init__() | |
# self.act = nn.LeakyReLU(inplace=True) | |
# self.conv = nn.Sequential( | |
# RodinRollOutConv3D_GroupConv(in_chans, out_chans), | |
# nn.LeakyReLU(inplace=True), | |
# ) | |
# if in_chans != out_chans: | |
# self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration. | |
# else: | |
# self.short_cut = None | |
# def forward(self, feats): | |
# if self.short_cut is not None: | |
# res_feats = self.short_cut(feats) | |
# else: | |
# res_feats = feats | |
# # return res_feats + self.conv(feats) | |
# feats = res_feats + self.conv(feats) | |
# return self.act(feats) # as in resnet, add an act before return | |
class RodinConv3D_SynthesisLayer_mlp_unshuffle_as_residual(nn.Module): | |
def __init__(self, in_chans, out_chans) -> None: | |
super().__init__() | |
self.act = nn.LeakyReLU(inplace=True) | |
self.conv = nn.Sequential( | |
RodinRollOutConv3D_GroupConv(in_chans, out_chans), | |
nn.LeakyReLU(inplace=True), | |
) | |
self.out_chans = out_chans | |
if in_chans != out_chans: | |
# self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration. | |
self.short_cut = nn.Linear( # B 3C H W -> B 3C 4H 4W | |
in_chans // 3, # 144 / 3 = 48 | |
out_chans // 3 * 4 * 4, # 32 * 16 | |
bias=True) # decoder to pat | |
# RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration. | |
else: | |
self.short_cut = None | |
    def shortcut_unpatchify_triplane(self,
                                     x,
                                     p=None,
                                     unpatchify_out_chans=None):
        """separate triplane version; x shape: B 3C H W
        """
        assert self.short_cut is not None
        if unpatchify_out_chans is None:
            unpatchify_out_chans = self.out_chans // 3
# B, L, C = x.shape | |
B, C3, h, w = x.shape | |
assert h == w | |
L = h * w | |
x = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3, | |
1) # (B, 3, L // 3, C) | |
x = self.short_cut(x) | |
        p = 4  # per-token upscale factor implied by the shortcut linear (out_chans // 3 * 4 * 4)
        x = x.reshape(shape=(B, 3, h, w, p, p, unpatchify_out_chans))
        x = torch.einsum('ndhwpqc->ndchpwq',
                         x)  # nplanes, C order in the renderer.py
        x = x.reshape(shape=(B, 3 * unpatchify_out_chans, h * p, w * p))
        return x
def forward(self, feats): | |
if self.short_cut is not None: | |
res_feats = self.shortcut_unpatchify_triplane(feats) | |
else: | |
res_feats = feats | |
# return res_feats + self.conv(feats) | |
feats = res_feats + self.conv(feats) | |
return self.act(feats) # as in resnet, add an act before return | |
# class RodinConv3D_SynthesisLayer(nn.Module): | |
# def __init__(self, in_chans, out_chans) -> None: | |
# super().__init__() | |
# self.act = nn.LeakyReLU(inplace=True) | |
# self.conv = nn.Sequential( | |
# RodinRollOutConv3D_GroupConv(in_chans, out_chans), | |
# nn.LeakyReLU(inplace=True), | |
# ) | |
# if in_chans != out_chans: | |
# self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration. | |
# else: | |
# self.short_cut = None | |
# def forward(self, feats): | |
# if self.short_cut is not None: | |
# res_feats = self.short_cut(feats) | |
# else: | |
# res_feats = feats | |
# # return res_feats + self.conv(feats) | |
# feats = res_feats + self.conv(feats) | |
# # return self.act(feats) # as in resnet, add an act before return | |
# return feats # ! old behaviour, no act | |
# previous worked version | |
class RodinConv3D_SynthesisLayer(nn.Module): | |
def __init__(self, in_chans, out_chans) -> None: | |
super().__init__() | |
# x2 SR + 1x1 Conv Residual BLK | |
# self.conv3D = RodinRollOutConv3D(in_chans, out_chans) | |
self.act = nn.LeakyReLU(inplace=True) | |
self.conv = nn.Sequential( | |
RodinRollOutConv3D_GroupConv(in_chans, out_chans), | |
nn.LeakyReLU(inplace=True), | |
) | |
if in_chans != out_chans: | |
self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) | |
else: | |
self.short_cut = None | |
def forward(self, feats): | |
feats_out = self.conv(feats) | |
if self.short_cut is not None: | |
# ! failed below | |
feats_out = self.short_cut( | |
feats | |
) + feats_out # ! only difference here, no act() compared with baseline | |
# feats_out = self.act(self.short_cut(feats)) + feats_out # ! only difference here, no act() compared with baseline | |
else: | |
feats_out = feats_out + feats | |
return feats_out | |
class RodinRollOutConv3DSR2X(nn.Module): | |
def __init__(self, in_chans, **kwargs) -> None: | |
super().__init__() | |
self.conv3D = RodinRollOutConv3D_GroupConv(in_chans) | |
# self.conv3D = RodinRollOutConv3D(in_chans) | |
self.act = nn.LeakyReLU(inplace=True) | |
self.input_resolution = 224 | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3 | |
# p = int(N**0.5) # patch size | |
# assert p**2 == N, 'check input dim, no [cls] needed here' | |
assert group_size == 3, 'designed for triplane here' | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if x.shape[-1] != self.input_resolution: | |
x = torch.nn.functional.interpolate(x, | |
size=(self.input_resolution, | |
self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
x = x + self.conv3D(x) | |
return x | |
class RodinRollOutConv3DSR4X_lite(nn.Module):

    def __init__(self, in_chans, input_resolution=256, **kwargs) -> None:
        super().__init__()
        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans)
        self.conv3D_1 = RodinRollOutConv3D_GroupConv(in_chans)
        self.act = nn.LeakyReLU(inplace=True)

        self.input_resolution = input_resolution
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3 | |
# p = int(N**0.5) # patch size | |
# assert p**2 == N, 'check input dim, no [cls] needed here' | |
assert group_size == 3, 'designed for triplane here' | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if x.shape[-1] != self.input_resolution: | |
x = torch.nn.functional.interpolate(x, | |
size=(self.input_resolution, | |
self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
        # ! still not converging; no bug here?
        # x = x + self.conv3D_0(x)
        # x = x + self.conv3D_1(x)
        x = x + self.act(self.conv3D_0(x))
        x = x + self.act(self.conv3D_1(x))
# TODO: which is better, bilinear + conv or PixelUnshuffle? | |
return x | |
# class RodinConv3D2X_lite_mlp_as_residual(nn.Module): | |
# """lite 4X version, with MLP unshuffle to change the dimention | |
# """ | |
# def __init__(self, in_chans, out_chans, input_resolution=256) -> None: | |
# super().__init__() | |
# self.act = nn.LeakyReLU(inplace=True) | |
# self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans) | |
# self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans) | |
# self.act = nn.LeakyReLU(inplace=True) | |
# self.input_resolution = input_resolution | |
# self.out_chans = out_chans | |
# if in_chans != out_chans: # ! only change the dimension | |
# self.short_cut = nn.Linear( # B 3C H W -> B 3C 4H 4W | |
# in_chans//3, # 144 / 3 = 48 | |
# out_chans//3, # 32 * 16 | |
# bias=True) # decoder to pat | |
# else: | |
# self.short_cut = None | |
# def shortcut_unpatchify_triplane(self, x, p=None): | |
# """separate triplane version; x shape: B (3*257) 768 | |
# """ | |
# assert self.short_cut is not None | |
# # B, L, C = x.shape | |
# B, C3, h, w = x.shape | |
# assert h == w | |
# L = h*w | |
# x = x.reshape(B, C3//3, 3, L).permute(0,2,3,1) # (B, 3, L // 3, C_in) | |
# x = self.short_cut(x) # B 3 L//3 C_out | |
# x = x.permute(0,1,3,2) # B 3 C_out L//3 | |
# x = x.reshape(shape=(B, self.out_chans, h, w)) | |
# # directly resize to the target, no unpatchify here since no 3D ViT is included here | |
# if w != self.input_resolution: | |
# x = torch.nn.functional.interpolate(x, # 4X SR | |
# size=(self.input_resolution, | |
# self.input_resolution), | |
# mode='bilinear', | |
# align_corners=False, | |
# antialias=True) | |
# return x | |
# def forward(self, x): | |
# # x: B 3 112*112 C | |
# B, C3, p, p = x.shape # after unpachify triplane | |
# C = C3 // 3 | |
# if self.short_cut is not None: | |
# res_feats = self.shortcut_unpatchify_triplane(x) | |
# else: | |
# res_feats = x | |
# """following forward code copied from lite4x version | |
# """ | |
# x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
# p) # B 3 C N -> B 3C h W | |
# if x.shape[-1] != self.input_resolution: | |
# x = torch.nn.functional.interpolate(x, # 4X SR | |
# size=(self.input_resolution, | |
# self.input_resolution), | |
# mode='bilinear', | |
# align_corners=False, | |
# antialias=True) | |
# x = res_feats + self.act(self.conv3D_0(x)) | |
# x = x + self.act(self.conv3D_1(x)) | |
# return x | |
class RodinConv3D4X_lite_mlp_as_residual(nn.Module): | |
"""lite 4X version, with MLP unshuffle to change the dimention | |
""" | |
def __init__(self, | |
in_chans, | |
out_chans, | |
input_resolution=256, | |
interp_mode='bilinear', | |
bcg_triplane=False) -> None: | |
super().__init__() | |
self.interp_mode = interp_mode | |
self.act = nn.LeakyReLU(inplace=True) | |
self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans) | |
self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans) | |
self.bcg_triplane = bcg_triplane | |
if bcg_triplane: | |
self.conv3D_1_bg = RodinRollOutConv3D_GroupConv( | |
out_chans, out_chans) | |
self.act = nn.LeakyReLU(inplace=True) | |
self.input_resolution = input_resolution | |
self.out_chans = out_chans | |
if in_chans != out_chans: # ! only change the dimension | |
self.short_cut = nn.Linear( # B 3C H W -> B 3C 4H 4W | |
in_chans // 3, # 144 / 3 = 48 | |
out_chans // 3, # 32 * 16 | |
bias=True) # decoder to pat | |
else: | |
self.short_cut = None | |
def shortcut_unpatchify_triplane(self, x, p=None): | |
"""separate triplane version; x shape: B (3*257) 768 | |
""" | |
assert self.short_cut is not None | |
B, C3, h, w = x.shape | |
assert h == w | |
L = h * w | |
x = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3, | |
1) # (B, 3, L // 3, C_in) | |
x = self.short_cut(x) # B 3 L//3 C_out | |
x = x.permute(0, 1, 3, 2) # B 3 C_out L//3 | |
x = x.reshape(shape=(B, self.out_chans, h, w)) | |
# directly resize to the target, no unpatchify here since no 3D ViT is included here | |
if w != self.input_resolution: | |
x = torch.nn.functional.interpolate( | |
x, # 4X SR | |
size=(self.input_resolution, self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
return x | |
def interpolate(self, feats): | |
if self.interp_mode == 'bilinear': | |
return torch.nn.functional.interpolate( | |
feats, # 4X SR | |
size=(self.input_resolution, self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
else: | |
return torch.nn.functional.interpolate( | |
feats, # 4X SR | |
size=(self.input_resolution, self.input_resolution), | |
mode='nearest', | |
) | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
if self.short_cut is not None: | |
res_feats = self.shortcut_unpatchify_triplane(x) | |
else: | |
res_feats = x | |
if res_feats.shape[-1] != self.input_resolution: | |
res_feats = self.interpolate(res_feats) | |
"""following forward code copied from lite4x version | |
""" | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if x.shape[-1] != self.input_resolution: | |
x = self.interpolate(x) | |
x0 = res_feats + self.act(self.conv3D_0(x)) # the base feature | |
x = x0 + self.act(self.conv3D_1(x0)) | |
if self.bcg_triplane: | |
x_bcg = x0 + self.act(self.conv3D_1_bg(x0)) | |
return torch.cat([x, x_bcg], 1) | |
else: | |
return x | |
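# Hedged usage sketch (added, not in the original): this block takes unpatchified
# triplane features (B, 3C_in, p, p), upsamples them to `input_resolution` and maps
# the channels to out_chans, with an MLP shortcut when the channel count changes.
# Sizes below are hypothetical.
#
#   sr = RodinConv3D4X_lite_mlp_as_residual(in_chans=144, out_chans=96,
#                                           input_resolution=256)
#   planes = torch.randn(2, 144, 64, 64)   # (B, 3C_in, p, p)
#   out = sr(planes)                       # -> (2, 96, 256, 256)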
class RodinConv3D4X_lite_mlp_as_residual_litev2( | |
RodinConv3D4X_lite_mlp_as_residual): | |
def __init__(self, | |
in_chans, | |
out_chans, | |
num_feat=128, | |
input_resolution=256, | |
interp_mode='bilinear', | |
bcg_triplane=False) -> None: | |
super().__init__(in_chans, out_chans, input_resolution, interp_mode, | |
bcg_triplane) | |
self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, in_chans) | |
self.conv_before_upsample = RodinRollOut_GroupConv_noConv3D( | |
in_chans, num_feat * 3) | |
self.conv3D_1 = RodinRollOut_GroupConv_noConv3D( | |
num_feat * 3, num_feat * 3) | |
self.conv_last = RodinRollOut_GroupConv_noConv3D( | |
num_feat * 3, out_chans) | |
self.short_cut = None | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
# if self.short_cut is not None: | |
# res_feats = self.shortcut_unpatchify_triplane(x) | |
# else: | |
# res_feats = x | |
# if res_feats.shape[-1] != self.input_resolution: | |
# res_feats = self.interpolate(res_feats) | |
"""following forward code copied from lite4x version | |
""" | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
x = x + self.conv3D_0(x) # the base feature | |
x = self.act(self.conv_before_upsample(x)) | |
# if x.shape[-1] != self.input_resolution: | |
x = self.conv_last(self.act(self.conv3D_1(self.interpolate(x)))) | |
return x | |
class RodinConv3D4X_lite_mlp_as_residual_lite( | |
RodinConv3D4X_lite_mlp_as_residual): | |
def __init__(self, | |
in_chans, | |
out_chans, | |
input_resolution=256, | |
interp_mode='bilinear') -> None: | |
super().__init__(in_chans, out_chans, input_resolution, interp_mode) | |
"""replace the first Rodin Conv 3D with ordinary rollout conv to save memory | |
""" | |
self.conv3D_0 = RodinRollOut_GroupConv_noConv3D(in_chans, out_chans) | |
class SR3D(nn.Module): | |
# https://github.com/SeanChenxy/Mimic3D/blob/77d313656df3cd5536d2c4c5766db3a56208eea6/training/networks_stylegan2.py#L629 | |
# roll-out and apply two deconv/pixelUnshuffle layer | |
def __init__(self, *args, **kwargs) -> None: | |
super().__init__(*args, **kwargs) | |
class RodinConv3D4X_lite_mlp_as_residual_improved(nn.Module): | |
def __init__(self, | |
in_chans, | |
num_feat, | |
out_chans, | |
input_resolution=256) -> None: | |
super().__init__() | |
assert in_chans == 4 * out_chans | |
assert num_feat == 2 * out_chans | |
self.input_resolution = input_resolution | |
# refer to https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750 | |
self.upscale = 4 | |
self.conv_after_body = RodinRollOutConv3D_GroupConv( | |
in_chans, in_chans, 3, 1, 1) | |
self.conv_before_upsample = nn.Sequential( | |
RodinRollOutConv3D_GroupConv(in_chans, num_feat, 3, 1, 1), | |
nn.LeakyReLU(inplace=True)) | |
self.conv_up1 = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1, | |
1) | |
if self.upscale == 4: | |
self.conv_up2 = RodinRollOutConv3D_GroupConv( | |
num_feat, num_feat, 3, 1, 1) | |
self.conv_hr = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1, | |
1) | |
self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, out_chans, 3, | |
1, 1) | |
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
"""following forward code copied from lite4x version | |
""" | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
# ? nearest or bilinear | |
x = self.conv_after_body(x) + x | |
x = self.conv_before_upsample(x) | |
x = self.lrelu( | |
self.conv_up1( | |
torch.nn.functional.interpolate( | |
x, | |
scale_factor=2, | |
mode='nearest', | |
# align_corners=False, | |
# antialias=True | |
))) | |
if self.upscale == 4: | |
x = self.lrelu( | |
self.conv_up2( | |
torch.nn.functional.interpolate( | |
x, | |
scale_factor=2, | |
mode='nearest', | |
# align_corners=False, | |
# antialias=True | |
))) | |
x = self.conv_last(self.lrelu(self.conv_hr(x))) | |
assert x.shape[-1] == self.input_resolution | |
return x | |
class RodinConv3D4X_lite_improved_lint_withresidual(nn.Module): | |
def __init__(self, | |
in_chans, | |
num_feat, | |
out_chans, | |
input_resolution=256) -> None: | |
super().__init__() | |
assert in_chans == 4 * out_chans | |
assert num_feat == 2 * out_chans | |
self.input_resolution = input_resolution | |
# refer to https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750 | |
self.upscale = 4 | |
self.conv_after_body = RodinRollOutConv3D_GroupConv( | |
in_chans, in_chans, 3, 1, 1) | |
self.conv_before_upsample = nn.Sequential( | |
RodinRollOutConv3D_GroupConv(in_chans, num_feat, 3, 1, 1), | |
nn.LeakyReLU(inplace=True)) | |
self.conv_up1 = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1, | |
1) | |
if self.upscale == 4: | |
self.conv_up2 = RodinRollOutConv3D_GroupConv( | |
num_feat, num_feat, 3, 1, 1) | |
self.conv_hr = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1, | |
1) | |
self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, out_chans, 3, | |
1, 1) | |
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
"""following forward code copied from lite4x version | |
""" | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
# ? nearest or bilinear | |
x = self.conv_after_body(x) + x | |
x = self.conv_before_upsample(x) | |
x = self.lrelu( | |
self.conv_up1( | |
torch.nn.functional.interpolate( | |
x, | |
scale_factor=2, | |
mode='nearest', | |
# align_corners=False, | |
# antialias=True | |
))) | |
if self.upscale == 4: | |
x = self.lrelu( | |
self.conv_up2( | |
torch.nn.functional.interpolate( | |
x, | |
scale_factor=2, | |
mode='nearest', | |
# align_corners=False, | |
# antialias=True | |
))) | |
x = self.conv_last(self.lrelu(self.conv_hr(x) + x)) | |
assert x.shape[-1] == self.input_resolution | |
return x | |
class RodinRollOutConv3DSR_FlexibleChannels(nn.Module): | |
def __init__(self, | |
in_chans, | |
num_out_ch=96, | |
input_resolution=256, | |
**kwargs) -> None: | |
super().__init__() | |
self.block0 = RodinConv3D_SynthesisLayer(in_chans, | |
num_out_ch) # in_chans=48 | |
self.block1 = RodinConv3D_SynthesisLayer(num_out_ch, num_out_ch) | |
self.input_resolution = input_resolution # 64 -> 256 SR | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
# group_size = C3 // C | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if x.shape[-1] != self.input_resolution: | |
x = torch.nn.functional.interpolate(x, | |
size=(self.input_resolution, | |
self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
x = self.block0(x) | |
x = self.block1(x) | |
return x | |
# previous worked version | |
class RodinRollOutConv3DSR4X(nn.Module): | |
# follow PixelUnshuffleUpsample | |
def __init__(self, in_chans, **kwargs) -> None: | |
super().__init__() | |
# self.block0 = RodinConv3D_SynthesisLayer(in_chans, 96 * 2) # TODO, match the old behaviour now. | |
# self.block1 = RodinConv3D_SynthesisLayer(96 * 2, 96) | |
self.block0 = RodinConv3D_SynthesisLayer(in_chans, 96) | |
self.block1 = RodinConv3D_SynthesisLayer( | |
96, 96) # baseline choice, validate with no LPIPS loss here | |
self.input_resolution = 64 # 64 -> 256 | |
def forward(self, x): | |
# x: B 3 112*112 C | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
# group_size = C3 // C | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if x.shape[-1] != self.input_resolution: | |
x = torch.nn.functional.interpolate(x, | |
size=(self.input_resolution, | |
self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
x = self.block0(x) | |
x = self.block1(x) | |
return x | |
class Upsample3D(nn.Module): | |
"""Upsample module. | |
Args: | |
scale (int): Scale factor. Supported scales: 2^n and 3. | |
num_feat (int): Channel number of intermediate features. | |
""" | |
def __init__(self, scale, num_feat): | |
super().__init__() | |
m_convs = [] | |
m_pixelshuffle = [] | |
assert (scale & (scale - 1)) == 0, 'scale = 2^n' | |
self.scale = scale | |
for _ in range(int(math.log(scale, 2))): | |
m_convs.append( | |
RodinRollOutConv3D_GroupConv(num_feat, 4 * num_feat, 3, 1, 1)) | |
m_pixelshuffle.append(nn.PixelShuffle(2)) | |
self.m_convs = nn.ModuleList(m_convs) | |
self.m_pixelshuffle = nn.ModuleList(m_pixelshuffle) | |
# @torch.autocast(device_type='cuda') | |
def forward(self, x): | |
for scale_idx in range(int(math.log(self.scale, 2))): | |
x = self.m_convs[scale_idx](x) # B 3C H W | |
# x = | |
# B, C3, H, W = x.shape | |
x = x.reshape(x.shape[0] * 3, x.shape[1] // 3, *x.shape[2:]) | |
x = self.m_pixelshuffle[scale_idx](x) | |
x = x.reshape(x.shape[0] // 3, x.shape[1] * 3, *x.shape[2:]) | |
return x | |
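# Hedged note (added, not in the original): Upsample3D applies PixelShuffle per
# plane. Each grouped conv maps the 3C triplane channels to 4 * 3C; the tensor is
# temporarily reshaped from (B, 3 * 4C, H, W) to (B*3, 4C, H, W), pixel-shuffled to
# (B*3, C, 2H, 2W), and folded back to (B, 3C, 2H, 2W); repeating this log2(scale)
# times yields the requested power-of-two upsampling.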
class RodinConv3DPixelUnshuffleUpsample(nn.Module): | |
def __init__(self, | |
output_dim, | |
num_feat=32 * 6, | |
num_out_ch=32 * 3, | |
sr_ratio=4, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.conv_after_body = RodinRollOutConv3D_GroupConv( | |
output_dim, output_dim, 3, 1, 1) | |
self.conv_before_upsample = nn.Sequential( | |
RodinRollOutConv3D_GroupConv(output_dim, num_feat, 3, 1, 1), | |
nn.LeakyReLU(inplace=True)) | |
self.upsample = Upsample3D(sr_ratio, num_feat) # 4 time SR | |
self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, num_out_ch, 3, | |
1, 1) | |
# @torch.autocast(device_type='cuda') | |
def forward(self, x, input_skip_connection=True, *args, **kwargs): | |
# x = self.conv_first(x) | |
if input_skip_connection: | |
x = self.conv_after_body(x) + x | |
else: | |
x = self.conv_after_body(x) | |
x = self.conv_before_upsample(x) | |
x = self.upsample(x) | |
x = self.conv_last(x) | |
return x | |
class RodinConv3DPixelUnshuffleUpsample_improvedVersion(nn.Module): | |
def __init__( | |
self, | |
output_dim, | |
num_out_ch=32 * 3, | |
sr_ratio=4, | |
input_resolution=256, | |
) -> None: | |
super().__init__() | |
self.input_resolution = input_resolution | |
# self.conv_first = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch, | |
# 3, 1, 1) | |
self.upsample = Upsample3D(sr_ratio, output_dim) # 4 time SR | |
self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch, | |
3, 1, 1) | |
def forward(self, x, bilinear_upsample=True): | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3, 'designed for triplane here' | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if bilinear_upsample and x.shape[-1] != self.input_resolution: | |
x_bilinear_upsample = torch.nn.functional.interpolate( | |
x, | |
size=(self.input_resolution, self.input_resolution), | |
mode='bilinear', | |
align_corners=False, | |
antialias=True) | |
x = self.upsample(x) + x_bilinear_upsample | |
else: | |
# x_bilinear_upsample = x | |
x = self.upsample(x) | |
x = self.conv_last(x) | |
return x | |
class RodinConv3DPixelUnshuffleUpsample_improvedVersion2(nn.Module): | |
"""removed nearest neighbour residual conenctions, add a conv layer residual conenction | |
""" | |
def __init__( | |
self, | |
output_dim, | |
num_out_ch=32 * 3, | |
sr_ratio=4, | |
input_resolution=256, | |
) -> None: | |
super().__init__() | |
self.input_resolution = input_resolution | |
self.conv_after_body = RodinRollOutConv3D_GroupConv( | |
output_dim, num_out_ch, 3, 1, 1) | |
self.upsample = Upsample3D(sr_ratio, output_dim) # 4 time SR | |
self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch, | |
3, 1, 1) | |
def forward(self, x, input_skip_connection=True): | |
B, C3, p, p = x.shape # after unpachify triplane | |
C = C3 // 3 | |
group_size = C3 // C | |
assert group_size == 3, 'designed for triplane here' | |
x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, | |
p) # B 3 C N -> B 3C h W | |
if input_skip_connection: | |
x = self.conv_after_body(x) + x | |
else: | |
x = self.conv_after_body(x) | |
x = self.upsample(x) | |
x = self.conv_last(x) | |
return x | |
class CLSCrossAttentionBlock(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4., | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0., | |
attn_drop=0., | |
drop_path=0., | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm, | |
has_mlp=False): | |
super().__init__() | |
self.norm1 = norm_layer(dim) | |
self.attn = CrossAttention(dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=drop) | |
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |
self.drop_path = DropPath( | |
drop_path) if drop_path > 0. else nn.Identity() | |
self.has_mlp = has_mlp | |
if has_mlp: | |
self.norm2 = norm_layer(dim) | |
mlp_hidden_dim = int(dim * mlp_ratio) | |
self.mlp = Mlp(in_features=dim, | |
hidden_features=mlp_hidden_dim, | |
act_layer=act_layer, | |
drop=drop) | |
def forward(self, x): | |
x = x[:, 0:1, ...] + self.drop_path(self.attn(self.norm1(x))) | |
if self.has_mlp: | |
x = x + self.drop_path(self.mlp(self.norm2(x))) | |
return x | |
class Conv3DCrossAttentionBlock(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4., | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0., | |
attn_drop=0., | |
drop_path=0., | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm, | |
has_mlp=False): | |
super().__init__() | |
self.norm1 = norm_layer(dim) | |
self.attn = Conv3D_Aware_CrossAttention(dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=drop) | |
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |
self.drop_path = DropPath( | |
drop_path) if drop_path > 0. else nn.Identity() | |
self.has_mlp = has_mlp | |
if has_mlp: | |
self.norm2 = norm_layer(dim) | |
mlp_hidden_dim = int(dim * mlp_ratio) | |
self.mlp = Mlp(in_features=dim, | |
hidden_features=mlp_hidden_dim, | |
act_layer=act_layer, | |
drop=drop) | |
def forward(self, x): | |
x = x + self.drop_path(self.attn(self.norm1(x))) | |
if self.has_mlp: | |
x = x + self.drop_path(self.mlp(self.norm2(x))) | |
return x | |
class Conv3DCrossAttentionBlockXformerMHA(Conv3DCrossAttentionBlock): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4, | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0, | |
attn_drop=0, | |
drop_path=0, | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm, | |
has_mlp=False): | |
super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, | |
attn_drop, drop_path, act_layer, norm_layer, has_mlp) | |
# self.attn = xformer_Conv3D_Aware_CrossAttention(dim, | |
self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid( | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=drop) | |
class Conv3DCrossAttentionBlockXformerMHANested( | |
Conv3DCrossAttentionBlockXformerMHA): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4, | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0., | |
attn_drop=0., | |
drop_path=0., | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm, | |
has_mlp=False): | |
super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, | |
attn_drop, drop_path, act_layer, norm_layer, has_mlp) | |
"""for in-place replaing the internal attn in Dino ViT. | |
""" | |
def forward(self, x): | |
Bx3, N, C = x.shape | |
B, group_size = Bx3 // 3, 3 | |
x = x.reshape(B, group_size, N, C) # in plane vit | |
x = super().forward(x) | |
return x.reshape(B * group_size, N, | |
C) # to match the original attn size | |
class Conv3DCrossAttentionBlockXformerMHANested_withinC( | |
Conv3DCrossAttentionBlockXformerMHANested): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4, | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0, | |
attn_drop=0, | |
drop_path=0, | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm, | |
has_mlp=False): | |
super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, | |
attn_drop, drop_path, act_layer, norm_layer, has_mlp) | |
self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC( | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=drop) | |
def forward(self, x): | |
# basic TX attention forward function | |
x = x + self.drop_path(self.attn(self.norm1(x))) | |
if self.has_mlp: | |
x = x + self.drop_path(self.mlp(self.norm2(x))) | |
return x | |
class TriplaneFusionBlock(nn.Module): | |
"""4 ViT blocks + 1 CrossAttentionBlock | |
""" | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
cross_attention_blk=CLSCrossAttentionBlock, | |
*args, | |
**kwargs) -> None: | |
super().__init__(*args, **kwargs) | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
if use_fusion_blk: | |
self.fusion = nn.ModuleList() | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
for d in range(self.num_branches): | |
self.fusion.append( | |
cross_attention_blk( | |
dim=dim, | |
num_heads=nh, | |
mlp_ratio=mlp_ratio, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
# drop=drop, | |
drop=proj_drop, | |
attn_drop=attn_drop, | |
drop_path=drop_path_rate, | |
norm_layer=norm_layer, # type: ignore | |
has_mlp=False)) | |
else: | |
self.fusion = None | |
def forward(self, x): | |
# modified from https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132 | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
x = x.view(B * group_size, N, C) | |
for blk in self.vit_blks: | |
x = blk(x) # B 3 N C | |
if self.fusion is None: | |
return x.view(B, group_size, N, C) | |
        # split back into per-plane tensors; the flattened dim=0 layout is
        # batch-major, so reshape to (B, 3, N, C) first and unbind the plane dim
        outs_b = x.view(B, group_size, N,
                        C).unbind(dim=1)  # 3 * [B, N, C] tensors, for fusion
# only take the cls token out | |
proj_cls_token = [x[:, 0:1] for x in outs_b] | |
# cross attention | |
outs = [] | |
for i in range(self.num_branches): | |
tmp = torch.cat( | |
(proj_cls_token[i], outs_b[(i + 1) % self.num_branches][:, 1:, | |
...]), | |
dim=1) | |
tmp = self.fusion[i](tmp) | |
# reverted_proj_cls_token = self.revert_projs[i](tmp[:, 0:1, ...]) | |
reverted_proj_cls_token = tmp[:, 0:1, ...] | |
tmp = torch.cat((reverted_proj_cls_token, outs_b[i][:, 1:, ...]), | |
dim=1) | |
outs.append(tmp) | |
# outs = ? needs to merge back? | |
outs = torch.stack(outs, 1) # B 3 N C | |
return outs | |
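# Hedged summary (added, not in the original): TriplaneFusionBlock runs the wrapped
# ViT blocks on all three planes in parallel (planes folded into the batch dim),
# then, CrossViT-style, lets each plane's [cls] token cross-attend to the patch
# tokens of the next plane before being re-attached to its own patch tokens.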
class TriplaneFusionBlockv2(nn.Module): | |
"""4 ViT blocks + 1 CrossAttentionBlock | |
""" | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlock, | |
*args, | |
**kwargs) -> None: | |
super().__init__(*args, **kwargs) | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
if use_fusion_blk: | |
# self.fusion = nn.ModuleList() | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
# for d in range(self.num_branches): | |
self.fusion = fusion_ca_blk( # one fusion is enough | |
dim=dim, | |
num_heads=nh, | |
mlp_ratio=mlp_ratio, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
# drop=drop, | |
drop=proj_drop, | |
attn_drop=attn_drop, | |
drop_path=drop_path_rate, | |
norm_layer=norm_layer, # type: ignore | |
has_mlp=False) | |
else: | |
self.fusion = None | |
def forward(self, x): | |
# modified from https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132 | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
x = x.reshape(B * group_size, N, C) | |
for blk in self.vit_blks: | |
x = blk(x) # B 3 N C | |
if self.fusion is None: | |
return x.reshape(B, group_size, N, C) | |
x = x.reshape(B, group_size, N, C) # .chunk(chunks=3, | |
# dim=1) # 3 * [B, N//3, C] Tensors, for fusion | |
return self.fusion(x) | |
class TriplaneFusionBlockv3(TriplaneFusionBlockv2): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA, | |
*args, | |
**kwargs) -> None: | |
super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, | |
fusion_ca_blk, *args, **kwargs) | |
class TriplaneFusionBlockv4(TriplaneFusionBlockv3): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA, | |
*args, | |
**kwargs) -> None: | |
super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, | |
fusion_ca_blk, *args, **kwargs) | |
"""OOM? directly replace the atten here | |
""" | |
assert len(vit_blks) == 2 | |
# del self.vit_blks[1].attn | |
del self.vit_blks[1].attn, self.vit_blks[1].ls1, self.vit_blks[1].norm1 | |
def ffn_residual_func(self, tx_blk, x: Tensor) -> Tensor: | |
return tx_blk.ls2( | |
tx_blk.mlp(tx_blk.norm2(x)) | |
) # https://github.com/facebookresearch/dinov2/blob/c3c2683a13cde94d4d99f523cf4170384b00c34c/dinov2/layers/block.py#L86C1-L87C53 | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
assert self.fusion is not None | |
B, group_size, N, C = x.shape # has [cls] token in N | |
x = x.reshape(B * group_size, N, C) # in plane vit | |
# in plane self attention | |
x = self.vit_blks[0](x) | |
# 3D cross attention blk + ffn | |
x = x + self.fusion(x.reshape(B, group_size, N, C)).reshape( | |
B * group_size, N, C) | |
x = x + self.ffn_residual_func(self.vit_blks[1], x) | |
return x.reshape(B, group_size, N, C) | |
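# TriplaneFusionBlockv4 therefore computes, in pre-norm residual form:
#   x = vit_blks[0](x)                         # in-plane self-attention + MLP, on (B*3, N, C)
#   x = x + fusion(x grouped as (B, 3, N, C))  # 3D-aware cross-plane attention (replaces vit_blks[1].attn)
#   x = x + ls2(mlp(norm2(x)))                 # the surviving FFN of vit_blks[1]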
class TriplaneFusionBlockv4_nested(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# ! replace vit_blks[1] attn layer with 3D aware attention | |
del self.vit_blks[ | |
1].attn # , self.vit_blks[1].ls1, self.vit_blks[1].norm1 | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
self.vit_blks[1].attn = fusion_ca_blk( # one fusion is enough | |
dim=dim, | |
num_heads=nh, | |
mlp_ratio=mlp_ratio, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
# drop=drop, | |
drop=proj_drop, | |
attn_drop=attn_drop, | |
drop_path=drop_path_rate, | |
norm_layer=norm_layer, # type: ignore | |
has_mlp=False) | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
x = x.reshape(B * group_size, N, C) | |
for blk in self.vit_blks: | |
x = blk(x) # B 3 N C | |
# TODO, avoid the reshape overhead? | |
return x.reshape(B, group_size, N, C) | |
class TriplaneFusionBlockv4_nested_init_from_dino(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, | |
init_from_dino=True, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
attn_3d = fusion_ca_blk( # one fusion is enough | |
dim=dim, | |
num_heads=nh, | |
mlp_ratio=mlp_ratio, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
# drop=drop, | |
drop=proj_drop, | |
attn_drop=attn_drop, | |
drop_path=drop_path_rate, | |
norm_layer=norm_layer, # type: ignore | |
has_mlp=False) | |
# ! initialize 3dattn from dino attn | |
if init_from_dino: | |
merged_qkv_linear = self.vit_blks[1].attn.qkv | |
attn_3d.attn.proj.load_state_dict( | |
self.vit_blks[1].attn.proj.state_dict()) | |
# Initialize the Q, K, and V linear layers using the weights of the merged QKV linear layer | |
attn_3d.attn.wq.weight.data = merged_qkv_linear.weight.data[: | |
dim, :] | |
attn_3d.attn.w_kv.weight.data = merged_qkv_linear.weight.data[ | |
dim:, :] | |
# Optionally, you can initialize the biases as well (if your QKV linear layer has biases) | |
if qkv_bias: | |
attn_3d.attn.wq.bias.data = merged_qkv_linear.bias.data[:dim] | |
attn_3d.attn.w_kv.bias.data = merged_qkv_linear.bias.data[dim:] | |
del self.vit_blks[1].attn | |
# ! assign | |
self.vit_blks[1].attn = attn_3d | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
x = x.reshape(B * group_size, N, C) | |
for blk in self.vit_blks: | |
x = blk(x) # B 3 N C | |
# TODO, avoid the reshape overhead? | |
return x.reshape(B, group_size, N, C) | |
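def _demo_split_fused_qkv(dim: int = 8):
    # Standalone sketch (not part of the original code) of the weight-splitting used in
    # TriplaneFusionBlockv4_nested_init_from_dino above: a fused qkv Linear stacks its rows
    # as [Wq; Wk; Wv], so the first `dim` rows initialise wq and the remaining 2*dim rows w_kv.
    fused_qkv = nn.Linear(dim, dim * 3, bias=True)
    wq = nn.Linear(dim, dim, bias=True)
    w_kv = nn.Linear(dim, dim * 2, bias=True)
    wq.weight.data.copy_(fused_qkv.weight.data[:dim, :])
    w_kv.weight.data.copy_(fused_qkv.weight.data[dim:, :])
    wq.bias.data.copy_(fused_qkv.bias.data[:dim])
    w_kv.bias.data.copy_(fused_qkv.bias.data[dim:])
    x = torch.randn(2, dim)
    q, kv = wq(x), w_kv(x)
    # the split layers reproduce the fused projection exactly
    assert torch.allclose(torch.cat([q, kv], dim=-1), fused_qkv(x), atol=1e-6)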
class TriplaneFusionBlockv4_nested_init_from_dino_lite(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=None, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC( # ! raw 3D attn layer | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=proj_drop) | |
del self.vit_blks[1].attn | |
# ! assign | |
self.vit_blks[1].attn = attn_3d | |
def forward(self, x): | |
"""x: B N C, where N = H*W tokens. Just raw ViT forward pass | |
""" | |
# ! move the below to the front of the first call | |
B, N, C = x.shape # has [cls] token in N | |
for blk in self.vit_blks: | |
x = blk(x) # B N C | |
return x | |
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=None, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
qkv_bias = True | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
        if False:  # disabled ablation: wrap each block's attention with a joint self/cross 3D attention
for blk in self.vit_blks: | |
attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC( # ! raw 3D attn layer | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=proj_drop) | |
blk.attn = self_cross_attn(blk.attn, attn_3d) | |
def forward(self, x): | |
"""x: B N C, where N = H*W tokens. Just raw ViT forward pass | |
""" | |
# ! move the below to the front of the first call | |
B, N, C = x.shape # has [cls] token in N | |
for blk in self.vit_blks: | |
x = blk(x) # B N C | |
return x | |
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C(
        TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    # no roll out + B 3L C
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# ! move the below to the front of the first call | |
# B, N, C = x.shape # has [cls] token in N | |
B, group_size, N, C = x.shape # has [cls] token in N | |
x = x.reshape(B, group_size*N, C) | |
for blk in self.vit_blks: | |
x = blk(x) # B N C | |
x = x.reshape(B, group_size, N, C) # outer loop tradition | |
return x | |
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C_withrollout(
        TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    # roll out + B 3L C
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# ! move the below to the front of the first call | |
# B, N, C = x.shape # has [cls] token in N | |
B, group_size, N, C = x.shape # has [cls] token in N | |
x = x.reshape(B*group_size, N, C) | |
x = self.vit_blks[0](x) | |
x = x.reshape(B,group_size*N, C) | |
x = self.vit_blks[1](x) | |
x = x.reshape(B, group_size, N, C) # outer loop tradition | |
return x | |
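# Token-layout summary for the two variants above (N includes the [cls] token):
#   ..._B_3L_C            : both blocks see the three planes concatenated along the token axis, (B, 3*N, C)
#   ..._B_3L_C_withrollout: vit_blks[0] runs per plane on (B*3, N, C), then vit_blks[1] runs on the
#                           concatenated (B, 3*N, C) layout before regrouping to (B, 3, N, C)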
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_add3DAttn(
        TriplaneFusionBlockv4_nested_init_from_dino):
    # no roll out + 3D Attention
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
B, group_size, N, C = x.shape # has [cls] token in N | |
x = x.reshape(B, group_size*N, C) | |
x = self.vit_blks[0](x) # B 3 L C | |
# ! move the below to the front of the first call | |
x = x.reshape(B, group_size, N, C).reshape(B*group_size, N, C) | |
x = self.vit_blks[1](x) # has 3D attention | |
        return x.reshape(B, group_size, N, C)
class TriplaneFusionBlockv5_ldm_addCA(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# ! rather than replacing, add a 3D attention block after. | |
# del self.vit_blks[ | |
# 1].attn # , self.vit_blks[1].ls1, self.vit_blks[1].norm1 | |
self.norm_for_atten_3d = deepcopy(self.vit_blks[1].norm1) | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
self.attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid( | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=proj_drop) | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
flatten_token = lambda x: x.reshape(B * group_size, N, C) | |
unflatten_token = lambda x: x.reshape(B, group_size, N, C) | |
x = flatten_token(x) | |
x = self.vit_blks[0](x) | |
x = unflatten_token(x) | |
x = self.attn_3d(self.norm_for_atten_3d(x)) + x | |
x = flatten_token(x) | |
x = self.vit_blks[1](x) | |
return unflatten_token(x) | |
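# TriplaneFusionBlockv5_ldm_addCA keeps both DINO blocks intact and inserts an extra
# pre-norm residual 3D attention between them:
#   x = vit_blks[0](x)                        # per-plane, on (B*3, N, C)
#   x = x + attn_3d(norm_for_atten_3d(x))     # cross-plane, on (B, 3, N, C)
#   x = vit_blks[1](x)                        # per-plane again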
class TriplaneFusionBlockv6_ldm_addCA_Init3DAttnfrom2D( | |
TriplaneFusionBlockv5_ldm_addCA): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, | |
*args, | |
**kwargs) -> None: | |
super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, | |
fusion_ca_blk, *args, **kwargs) | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
flatten_token = lambda x: x.reshape(B * group_size, N, C) | |
unflatten_token = lambda x: x.reshape(B, group_size, N, C) | |
x = flatten_token(x) | |
x = self.vit_blks[0](x) | |
x = unflatten_token(x) | |
x = self.attn_3d(self.norm_for_atten_3d(x)) + x | |
x = flatten_token(x) | |
x = self.vit_blks[1](x) | |
return unflatten_token(x) | |
class TriplaneFusionBlockv5_ldm_add_dualCA(nn.Module): | |
def __init__(self, | |
vit_blks, | |
num_heads, | |
embed_dim, | |
use_fusion_blk=True, | |
fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, | |
*args, | |
**kwargs) -> None: | |
super().__init__() | |
self.num_branches = 3 # triplane | |
self.vit_blks = vit_blks | |
assert use_fusion_blk | |
assert len(vit_blks) == 2 | |
# ! rather than replacing, add a 3D attention block after. | |
# del self.vit_blks[ | |
# 1].attn # , self.vit_blks[1].ls1, self.vit_blks[1].norm1 | |
self.norm_for_atten_3d_0 = deepcopy(self.vit_blks[0].norm1) | |
self.norm_for_atten_3d_1 = deepcopy(self.vit_blks[1].norm1) | |
# copied vit settings from https://github.dev/facebookresearch/dinov2 | |
nh = num_heads | |
dim = embed_dim | |
mlp_ratio = 4 # defined for all dino2 model | |
qkv_bias = True | |
norm_layer = partial(nn.LayerNorm, eps=1e-6) | |
drop_path_rate = 0.3 # default setting | |
attn_drop = proj_drop = 0.0 | |
qk_scale = None # TODO, double check | |
self.attn_3d_0 = xformer_Conv3D_Aware_CrossAttention_xygrid( | |
dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=proj_drop) | |
self.attn_3d_1 = deepcopy(self.attn_3d_0) | |
def forward(self, x): | |
"""x: B 3 N C, where N = H*W tokens | |
""" | |
# self attention, by merging the triplane channel into B for parallel computation | |
# ! move the below to the front of the first call | |
B, group_size, N, C = x.shape # has [cls] token in N | |
assert group_size == 3, 'triplane' | |
flatten_token = lambda x: x.reshape(B * group_size, N, C) | |
unflatten_token = lambda x: x.reshape(B, group_size, N, C) | |
x = flatten_token(x) | |
x = self.vit_blks[0](x) | |
x = unflatten_token(x) | |
x = self.attn_3d_0(self.norm_for_atten_3d_0(x)) + x | |
x = flatten_token(x) | |
x = self.vit_blks[1](x) | |
x = unflatten_token(x) | |
x = self.attn_3d_1(self.norm_for_atten_3d_1(x)) + x | |
return unflatten_token(x) | |
def drop_path(x, drop_prob: float = 0., training: bool = False): | |
if drop_prob == 0. or not training: | |
return x | |
keep_prob = 1 - drop_prob | |
shape = (x.shape[0], ) + (1, ) * ( | |
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets | |
random_tensor = keep_prob + torch.rand( | |
shape, dtype=x.dtype, device=x.device) | |
random_tensor.floor_() # binarize | |
output = x.div(keep_prob) * random_tensor | |
return output | |
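def _demo_drop_path():
    # Sanity sketch (not part of the original code): scaling the surviving samples by
    # 1/keep_prob keeps the expected activation unchanged, so no rescaling is needed at eval time.
    x = torch.ones(10_000, 4)
    y = drop_path(x, drop_prob=0.3, training=True)
    assert abs(y.mean().item() - 1.0) < 0.05  # E[y] == x up to sampling noise
    assert torch.equal(drop_path(x, drop_prob=0.3, training=False), x)  # identity at eval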
class DropPath(nn.Module): | |
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). | |
""" | |
def __init__(self, drop_prob=None): | |
super(DropPath, self).__init__() | |
self.drop_prob = drop_prob | |
def forward(self, x): | |
return drop_path(x, self.drop_prob, self.training) | |
class Mlp(nn.Module): | |
def __init__(self, | |
in_features, | |
hidden_features=None, | |
out_features=None, | |
act_layer=nn.GELU, | |
drop=0.): | |
super().__init__() | |
out_features = out_features or in_features | |
hidden_features = hidden_features or in_features | |
self.fc1 = nn.Linear(in_features, hidden_features) | |
self.act = act_layer() | |
self.fc2 = nn.Linear(hidden_features, out_features) | |
self.drop = nn.Dropout(drop) | |
def forward(self, x): | |
x = self.fc1(x) | |
x = self.act(x) | |
x = self.drop(x) | |
x = self.fc2(x) | |
x = self.drop(x) | |
return x | |
class Block(nn.Module): | |
def __init__(self, | |
dim, | |
num_heads, | |
mlp_ratio=4., | |
qkv_bias=False, | |
qk_scale=None, | |
drop=0., | |
attn_drop=0., | |
drop_path=0., | |
act_layer=nn.GELU, | |
norm_layer=nn.LayerNorm): | |
super().__init__() | |
self.norm1 = norm_layer(dim) | |
# self.attn = Attention(dim, | |
self.attn = MemEffAttention(dim, | |
num_heads=num_heads, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
attn_drop=attn_drop, | |
proj_drop=drop) | |
self.drop_path = DropPath( | |
drop_path) if drop_path > 0. else nn.Identity() | |
self.norm2 = norm_layer(dim) | |
mlp_hidden_dim = int(dim * mlp_ratio) | |
self.mlp = Mlp(in_features=dim, | |
hidden_features=mlp_hidden_dim, | |
act_layer=act_layer, | |
drop=drop) | |
    def forward(self, x, return_attention=False):
        # MemEffAttention returns only the output tensor (no attention map),
        # so the return_attention path cannot be served by this attention class.
        y = self.attn(self.norm1(x))
        if return_attention:
            raise NotImplementedError(
                'self.attn does not expose attention maps; swap in the plain '
                'Attention class (see the commented line in __init__) to inspect them.')
x = x + self.drop_path(y) | |
x = x + self.drop_path(self.mlp(self.norm2(x))) | |
return x | |
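# Block applies the standard pre-norm transformer update:
#   x = x + drop_path(attn(norm1(x)));  x = x + drop_path(mlp(norm2(x)))
# e.g. an input of shape (2, 197, 384) with dim=384, num_heads=6 keeps its shape.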
class PatchEmbed(nn.Module): | |
""" Image to Patch Embedding | |
""" | |
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): | |
super().__init__() | |
num_patches = (img_size // patch_size) * (img_size // patch_size) | |
self.img_size = img_size | |
self.patch_size = patch_size | |
self.num_patches = num_patches | |
self.proj = nn.Conv2d(in_chans, | |
embed_dim, | |
kernel_size=patch_size, | |
stride=patch_size) | |
def forward(self, x): | |
B, C, H, W = x.shape | |
x = self.proj(x).flatten(2).transpose(1, 2) # B, C, L -> B, L, C | |
return x | |
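# Example (illustrative): img_size=224, patch_size=16 gives (224 // 16) ** 2 = 196 patches,
# so PatchEmbed maps (B, 3, 224, 224) -> (B, 196, embed_dim).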
class VisionTransformer(nn.Module): | |
""" Vision Transformer """ | |
def __init__(self, | |
img_size=[224], | |
patch_size=16, | |
in_chans=3, | |
num_classes=0, | |
embed_dim=768, | |
depth=12, | |
num_heads=12, | |
mlp_ratio=4., | |
qkv_bias=False, | |
qk_scale=None, | |
drop_rate=0., | |
attn_drop_rate=0., | |
drop_path_rate=0., | |
norm_layer='nn.LayerNorm', | |
patch_embedding=True, | |
cls_token=True, | |
pixel_unshuffle=False, | |
**kwargs): | |
super().__init__() | |
self.num_features = self.embed_dim = embed_dim | |
self.patch_size = patch_size | |
        # norm_layer is currently always resolved to LayerNorm(eps=1e-6),
        # regardless of the string passed in
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
if patch_embedding: | |
self.patch_embed = PatchEmbed(img_size=img_size[0], | |
patch_size=patch_size, | |
in_chans=in_chans, | |
embed_dim=embed_dim) | |
num_patches = self.patch_embed.num_patches | |
self.img_size = self.patch_embed.img_size | |
else: | |
self.patch_embed = None | |
self.img_size = img_size[0] | |
num_patches = (img_size[0] // patch_size) * (img_size[0] // | |
patch_size) | |
if cls_token: | |
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |
self.pos_embed = nn.Parameter( | |
torch.zeros(1, num_patches + 1, embed_dim)) | |
else: | |
self.cls_token = None | |
self.pos_embed = nn.Parameter( | |
torch.zeros(1, num_patches, embed_dim)) | |
self.pos_drop = nn.Dropout(p=drop_rate) | |
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) | |
] # stochastic depth decay rule | |
self.blocks = nn.ModuleList([ | |
Block(dim=embed_dim, | |
num_heads=num_heads, | |
mlp_ratio=mlp_ratio, | |
qkv_bias=qkv_bias, | |
qk_scale=qk_scale, | |
drop=drop_rate, | |
attn_drop=attn_drop_rate, | |
drop_path=dpr[i], | |
norm_layer=norm_layer) for i in range(depth) | |
]) | |
self.norm = norm_layer(embed_dim) | |
# Classifier head | |
self.head = nn.Linear( | |
embed_dim, num_classes) if num_classes > 0 else nn.Identity() | |
trunc_normal_(self.pos_embed, std=.02) | |
if cls_token: | |
trunc_normal_(self.cls_token, std=.02) | |
self.apply(self._init_weights) | |
# if pixel_unshuffle: | |
# self.decoder_pred = nn.Linear(embed_dim, | |
# patch_size**2 * out_chans, | |
# bias=True) # decoder to patch | |
def _init_weights(self, m): | |
if isinstance(m, nn.Linear): | |
trunc_normal_(m.weight, std=.02) | |
if isinstance(m, nn.Linear) and m.bias is not None: | |
nn.init.constant_(m.bias, 0) | |
elif isinstance(m, nn.LayerNorm): | |
nn.init.constant_(m.bias, 0) | |
nn.init.constant_(m.weight, 1.0) | |
def interpolate_pos_encoding(self, x, w, h): | |
npatch = x.shape[1] - 1 | |
N = self.pos_embed.shape[1] - 1 | |
if npatch == N and w == h: | |
return self.pos_embed | |
patch_pos_embed = self.pos_embed[:, 1:] | |
dim = x.shape[-1] | |
w0 = w // self.patch_size | |
h0 = h // self.patch_size | |
# we add a small number to avoid floating point error in the interpolation | |
# see discussion at https://github.com/facebookresearch/dino/issues/8 | |
w0, h0 = w0 + 0.1, h0 + 0.1 | |
patch_pos_embed = nn.functional.interpolate( | |
patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), | |
dim).permute(0, 3, 1, 2), | |
scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), | |
mode='bicubic', | |
) | |
assert int(w0) == patch_pos_embed.shape[-2] and int( | |
h0) == patch_pos_embed.shape[-1] | |
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
if self.cls_token is not None: | |
class_pos_embed = self.pos_embed[:, 0] | |
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), | |
dim=1) | |
return patch_pos_embed | |
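    # Example (illustrative): a model pre-trained at 224x224 with patch_size=16 stores a
    # 14x14 grid of positional embeddings (N = 196, plus one [cls] slot at index 0).
    # Feeding a 320x320 image yields 20x20 = 400 patch tokens, so the grid is bicubically
    # resized from 14x14 to 20x20 before the [cls] embedding is re-attached.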
def prepare_tokens(self, x): | |
B, nc, w, h = x.shape | |
x = self.patch_embed(x) # patch linear embedding | |
# add the [CLS] token to the embed patch tokens | |
cls_tokens = self.cls_token.expand(B, -1, -1) | |
x = torch.cat((cls_tokens, x), dim=1) | |
# add positional encoding to each token | |
x = x + self.interpolate_pos_encoding(x, w, h) | |
return self.pos_drop(x) | |
def forward(self, x): | |
x = self.prepare_tokens(x) | |
for blk in self.blocks: | |
x = blk(x) | |
x = self.norm(x) | |
return x[:, 1:] # return spatial feature maps, not the [CLS] token | |
# return x[:, 0] | |
def get_last_selfattention(self, x): | |
x = self.prepare_tokens(x) | |
for i, blk in enumerate(self.blocks): | |
if i < len(self.blocks) - 1: | |
x = blk(x) | |
            else:
                # return attention of the last block; note this requires an
                # attention class that exposes attention maps (see Block.forward)
                return blk(x, return_attention=True)
def get_intermediate_layers(self, x, n=1): | |
x = self.prepare_tokens(x) | |
# we return the output tokens from the `n` last blocks | |
output = [] | |
for i, blk in enumerate(self.blocks): | |
x = blk(x) | |
if len(self.blocks) - i <= n: | |
output.append(self.norm(x)) | |
return output | |
def vit_tiny(patch_size=16, **kwargs): | |
model = VisionTransformer(patch_size=patch_size, | |
embed_dim=192, | |
depth=12, | |
num_heads=3, | |
mlp_ratio=4, | |
qkv_bias=True, | |
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |
**kwargs) | |
return model | |
def vit_small(patch_size=16, **kwargs): | |
model = VisionTransformer( | |
patch_size=patch_size, | |
embed_dim=384, | |
depth=12, | |
num_heads=6, | |
mlp_ratio=4, | |
qkv_bias=True, | |
norm_layer=partial(nn.LayerNorm, eps=1e-6), # type: ignore | |
**kwargs) | |
return model | |
def vit_base(patch_size=16, **kwargs): | |
model = VisionTransformer(patch_size=patch_size, | |
embed_dim=768, | |
depth=12, | |
num_heads=12, | |
mlp_ratio=4, | |
qkv_bias=True, | |
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |
**kwargs) | |
return model | |
vits = vit_small | |
vitb = vit_base | |
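def _demo_vit_small():
    # Minimal forward-pass sketch (not part of the original code), assuming the default
    # img_size=[224] and [cls] token; the forward pass returns patch features only.
    # If xFormers is installed, MemEffAttention may require running this on a CUDA device.
    model = vit_small(patch_size=16)
    feats = model(torch.randn(1, 3, 224, 224))
    assert feats.shape == (1, 196, 384)  # (B, num_patches, embed_dim), [cls] token stripped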