InPeerReview committed
Commit 7ad9dfd · verified · 1 parent: 61e36aa

Upload 6 files

Files changed (6)
  1. models/Blocks.py +476 -0
  2. models/STNR.py +327 -0
  3. models/__init__.py +13 -0
  4. models/loss.py +155 -0
  5. models/mamba_customer.py +569 -0
  6. models/resnet.py +358 -0
models/Blocks.py ADDED
@@ -0,0 +1,476 @@
+from __future__ import annotations
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monai.networks.layers.utils import get_act_layer
+import warnings
+warnings.filterwarnings("ignore")
+import math
+from functools import partial
+from typing import Callable
+from timm.models.layers import DropPath, to_2tuple
+from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+from einops import rearrange, repeat
+
+
+class CAB(nn.Module):
+    def __init__(self, in_channels, out_channels=None, ratio=16, activation='relu'):
+        super(CAB, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if self.in_channels < ratio:
+            ratio = self.in_channels
+        self.reduced_channels = self.in_channels // ratio
+        if self.out_channels is None:
+            self.out_channels = in_channels
+
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.max_pool = nn.AdaptiveMaxPool2d(1)
+        self.activation = get_act_layer(activation)
+        self.fc1 = nn.Conv2d(self.in_channels, self.reduced_channels, 1, bias=False)
+        self.fc2 = nn.Conv2d(self.reduced_channels, self.out_channels, 1, bias=False)
+
+        self.sigmoid = nn.Sigmoid()
+
+        nn.init.kaiming_normal_(self.fc1.weight, mode='fan_out', nonlinearity='relu')
+        nn.init.kaiming_normal_(self.fc2.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        avg_out = self.fc2(self.activation(self.fc1(self.avg_pool(x))))
+        max_out = self.fc2(self.activation(self.fc1(self.max_pool(x))))
+        attention = self.sigmoid(avg_out + max_out)
+        return attention
+
+
+class SAB(nn.Module):
+    def __init__(self, kernel_size=7):
+        super(SAB, self).__init__()
+        assert kernel_size in (3, 7, 11), "kernel_size must be 3, 7 or 11"
+        padding = kernel_size // 2
+
+        self.conv = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
+        self.sigmoid = nn.Sigmoid()
+        nn.init.kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        avg_out = torch.mean(x, dim=1, keepdim=True)
+        max_out, _ = torch.max(x, dim=1, keepdim=True)
+
+        x_cat = torch.cat([avg_out, max_out], dim=1)  # shape: [B, 2, H, W]
+        attention = self.sigmoid(self.conv(x_cat))
+
+        return attention
+
+# --------------------------------
+
+
+class ChannelAttention(nn.Module):
+    """Channel attention used in RCAN.
+    Args:
+        num_feat (int): Channel number of intermediate features.
+        squeeze_factor (int): Channel squeeze factor. Default: 16.
+    """
+
+    def __init__(self, num_feat, squeeze_factor=16):
+        super(ChannelAttention, self).__init__()
+        squeeze_channels = max(num_feat // squeeze_factor, 4)  # keep at least 4 channels
+        self.attention = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(num_feat, squeeze_channels, 1, padding=0),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(squeeze_channels, num_feat, 1, padding=0),
+            nn.Sigmoid())
+
+    def forward(self, x):
+        y = self.attention(x)
+        return x * y
+
+
+# NOTE: this second `CAB` definition shadows the attention-map `CAB` above, so
+# `from models.Blocks import CAB` resolves to this feature-extraction block.
+class CAB(nn.Module):
+    def __init__(self, num_feat, is_light_sr=False, compress_ratio=3, squeeze_factor=30):
+        super(CAB, self).__init__()
+        mid_channels = max(num_feat // compress_ratio, 4)  # keep at least 4 channels
+        if is_light_sr:  # use a depth-wise conv for light SR, which is more efficient
+            self.cab = nn.Sequential(
+                nn.Conv2d(num_feat, num_feat, 3, 1, 1, groups=num_feat),
+                ChannelAttention(num_feat, squeeze_factor)
+            )
+        else:  # for classic SR
+            self.cab = nn.Sequential(
+                nn.Conv2d(num_feat, mid_channels, 3, 1, 1),
+                nn.GELU(),
+                nn.Conv2d(mid_channels, num_feat, 3, 1, 1),
+                ChannelAttention(num_feat, squeeze_factor)
+            )
+
+    def forward(self, x):
+        return self.cab(x)
+
+
+class SS2D(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        d_state=16,
+        d_conv=3,
+        expand=2.,
+        dt_rank="auto",
+        dt_min=0.001,
+        dt_max=0.1,
+        dt_init="random",
+        dt_scale=1.0,
+        dt_init_floor=1e-4,
+        dropout=0.,
+        conv_bias=True,
+        bias=False,
+        device=None,
+        dtype=None,
+        **kwargs,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.d_model = d_model
+        self.d_state = d_state
+        self.d_conv = d_conv
+        self.expand = expand
+        self.d_inner = int(self.expand * self.d_model)
+        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
+
+        self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
+        self.conv2d = nn.Conv2d(
+            in_channels=self.d_inner,
+            out_channels=self.d_inner,
+            groups=self.d_inner,
+            bias=conv_bias,
+            kernel_size=d_conv,
+            padding=(d_conv - 1) // 2,
+            **factory_kwargs,
+        )
+        self.act = nn.SiLU()
+
+        self.x_proj = (
+            nn.Linear(self.d_inner, (self.dt_rank + self.d_state * 2), bias=False, **factory_kwargs),
+            nn.Linear(self.d_inner, (self.dt_rank + self.d_state * 2), bias=False, **factory_kwargs),
+            nn.Linear(self.d_inner, (self.dt_rank + self.d_state * 2), bias=False, **factory_kwargs),
+            nn.Linear(self.d_inner, (self.dt_rank + self.d_state * 2), bias=False, **factory_kwargs),
+        )
+        self.x_proj_weight = nn.Parameter(torch.stack([t.weight for t in self.x_proj], dim=0))  # (K=4, N, inner)
+        del self.x_proj
+
+        self.dt_projs = (
+            self.dt_init(self.dt_rank, self.d_inner, dt_scale, dt_init, dt_min, dt_max, dt_init_floor,
+                         **factory_kwargs),
+            self.dt_init(self.dt_rank, self.d_inner, dt_scale, dt_init, dt_min, dt_max, dt_init_floor,
+                         **factory_kwargs),
+            self.dt_init(self.dt_rank, self.d_inner, dt_scale, dt_init, dt_min, dt_max, dt_init_floor,
+                         **factory_kwargs),
+            self.dt_init(self.dt_rank, self.d_inner, dt_scale, dt_init, dt_min, dt_max, dt_init_floor,
+                         **factory_kwargs),
+        )
+        self.dt_projs_weight = nn.Parameter(torch.stack([t.weight for t in self.dt_projs], dim=0))  # (K=4, inner, rank)
+        self.dt_projs_bias = nn.Parameter(torch.stack([t.bias for t in self.dt_projs], dim=0))  # (K=4, inner)
+        del self.dt_projs
+
+        self.A_logs = self.A_log_init(self.d_state, self.d_inner, copies=4, merge=True)  # (K=4 * D, N)
+        self.Ds = self.D_init(self.d_inner, copies=4, merge=True)  # (K=4 * D)
+
+        self.selective_scan = selective_scan_fn
+
+        self.out_norm = nn.LayerNorm(self.d_inner)
+        self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
+        self.dropout = nn.Dropout(dropout) if dropout > 0. else None
+
+    @staticmethod
+    def dt_init(dt_rank, d_inner, dt_scale=1.0, dt_init="random", dt_min=0.001, dt_max=0.1, dt_init_floor=1e-4,
+                **factory_kwargs):
+        dt_proj = nn.Linear(dt_rank, d_inner, bias=True, **factory_kwargs)
+
+        # Initialize special dt projection to preserve variance at initialization
+        dt_init_std = dt_rank ** -0.5 * dt_scale
+        if dt_init == "constant":
+            nn.init.constant_(dt_proj.weight, dt_init_std)
+        elif dt_init == "random":
+            nn.init.uniform_(dt_proj.weight, -dt_init_std, dt_init_std)
+        else:
+            raise NotImplementedError
+
+        # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
+        dt = torch.exp(
+            torch.rand(d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
+            + math.log(dt_min)
+        ).clamp(min=dt_init_floor)
+        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+        inv_dt = dt + torch.log(-torch.expm1(-dt))
+        with torch.no_grad():
+            dt_proj.bias.copy_(inv_dt)
+        # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
+        dt_proj.bias._no_reinit = True
+
+        return dt_proj
+
+    @staticmethod
+    def A_log_init(d_state, d_inner, copies=1, device=None, merge=True):
+        # S4D real initialization
+        A = repeat(
+            torch.arange(1, d_state + 1, dtype=torch.float32, device=device),
+            "n -> d n",
+            d=d_inner,
+        ).contiguous()
+        A_log = torch.log(A)  # Keep A_log in fp32
+        if copies > 1:
+            A_log = repeat(A_log, "d n -> r d n", r=copies)
+            if merge:
+                A_log = A_log.flatten(0, 1)
+        A_log = nn.Parameter(A_log)
+        A_log._no_weight_decay = True
+        return A_log
+
+    @staticmethod
+    def D_init(d_inner, copies=1, device=None, merge=True):
+        # D "skip" parameter
+        D = torch.ones(d_inner, device=device)
+        if copies > 1:
+            D = repeat(D, "n1 -> r n1", r=copies)
+            if merge:
+                D = D.flatten(0, 1)
+        D = nn.Parameter(D)  # Keep in fp32
+        D._no_weight_decay = True
+        return D
+
+    def forward_core(self, x: torch.Tensor):
+        B, C, H, W = x.shape
+        L = H * W
+        K = 4
+        x_hwwh = torch.stack([x.view(B, -1, L), torch.transpose(x, dim0=2, dim1=3).contiguous().view(B, -1, L)], dim=1).view(B, 2, -1, L)
+        xs = torch.cat([x_hwwh, torch.flip(x_hwwh, dims=[-1])], dim=1)  # e.g. (1, 4, 192, 3136)
+
+        x_dbl = torch.einsum("b k d l, k c d -> b k c l", xs.view(B, K, -1, L), self.x_proj_weight)
+        dts, Bs, Cs = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=2)
+        dts = torch.einsum("b k r l, k d r -> b k d l", dts.view(B, K, -1, L), self.dt_projs_weight)
+        xs = xs.float().view(B, -1, L)
+        dts = dts.contiguous().float().view(B, -1, L)  # (b, k * d, l)
+        Bs = Bs.float().view(B, K, -1, L)
+        Cs = Cs.float().view(B, K, -1, L)  # (b, k, d_state, l)
+        Ds = self.Ds.float().view(-1)
+        As = -torch.exp(self.A_logs.float()).view(-1, self.d_state)
+        dt_projs_bias = self.dt_projs_bias.float().view(-1)  # (k * d)
+        out_y = self.selective_scan(
+            xs, dts,
+            As, Bs, Cs, Ds, z=None,
+            delta_bias=dt_projs_bias,
+            delta_softplus=True,
+            return_last_state=False,
+        ).view(B, K, -1, L)
+        assert out_y.dtype == torch.float
+
+        inv_y = torch.flip(out_y[:, 2:4], dims=[-1]).view(B, 2, -1, L)
+        wh_y = torch.transpose(out_y[:, 1].view(B, -1, W, H), dim0=2, dim1=3).contiguous().view(B, -1, L)
+        invwh_y = torch.transpose(inv_y[:, 1].view(B, -1, W, H), dim0=2, dim1=3).contiguous().view(B, -1, L)
+
+        return out_y[:, 0], inv_y[:, 0], wh_y, invwh_y
+
+    def forward(self, x: torch.Tensor, **kwargs):
+        B, H, W, C = x.shape
+
+        xz = self.in_proj(x)
+        x, z = xz.chunk(2, dim=-1)
+
+        x = x.permute(0, 3, 1, 2).contiguous()
+        x = self.act(self.conv2d(x))
+        y1, y2, y3, y4 = self.forward_core(x)
+        assert y1.dtype == torch.float32
+        y = y1 + y2 + y3 + y4
+        y = torch.transpose(y, dim0=1, dim1=2).contiguous().view(B, H, W, -1)
+        y = self.out_norm(y)
+        y = y * F.silu(z)
+        out = self.out_proj(y)
+        if self.dropout is not None:
+            out = self.dropout(out)
+        return out
+
+
+class VSSBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_dim: int = 0,
+        drop_path: float = 0,
+        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
+        attn_drop_rate: float = 0,
+        d_state: int = 16,
+        expand: float = 2.,
+        is_light_sr: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.ln_1 = norm_layer(hidden_dim)
+        self.self_attention = SS2D(d_model=hidden_dim, d_state=d_state, expand=expand, dropout=attn_drop_rate, **kwargs)
+        self.drop_path = DropPath(drop_path)
+        self.skip_scale = nn.Parameter(torch.ones(hidden_dim))
+        self.conv_blk = CAB(hidden_dim, is_light_sr)
+        self.ln_2 = nn.LayerNorm(hidden_dim)
+        self.skip_scale2 = nn.Parameter(torch.ones(hidden_dim))
+
+    def forward(self, input, x_size):
+        # input: [B, HW, C]
+        B, L, C = input.shape
+        input = input.view(B, *x_size, C).contiguous()  # [B, H, W, C]
+        x = self.ln_1(input)
+        x = input * self.skip_scale + self.drop_path(self.self_attention(x))
+        x = x * self.skip_scale2 + self.conv_blk(self.ln_2(x).permute(0, 3, 1, 2).contiguous()).permute(0, 2, 3, 1).contiguous()
+        x = x.view(B, -1, C).contiguous()
+        return x
+
+
+class RoPE(nn.Module):
+    def __init__(self, embed_dim, num_heads):
+        super().__init__()
+        self.head_dim = embed_dim // num_heads
+        self.num_heads = num_heads
+
+    def forward(self, x_size):
+        H, W = x_size
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        pos_h = torch.arange(H, dtype=torch.float32, device=device)
+        pos_w = torch.arange(W, dtype=torch.float32, device=device)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, self.head_dim, 2, device=device).float() / self.head_dim))
+        sin_h = torch.sin(torch.einsum("i,j->ij", pos_h, inv_freq))
+        cos_h = torch.cos(torch.einsum("i,j->ij", pos_h, inv_freq))
+        sin_w = torch.sin(torch.einsum("i,j->ij", pos_w, inv_freq))
+        cos_w = torch.cos(torch.einsum("i,j->ij", pos_w, inv_freq))
+        sin = torch.einsum("i,j->ij", sin_h[:, 0], sin_w[:, 0]).unsqueeze(0).unsqueeze(0)
+        cos = torch.einsum("i,j->ij", cos_h[:, 0], cos_w[:, 0]).unsqueeze(0).unsqueeze(0)
+        sin = sin.expand(self.num_heads, -1, -1, -1).contiguous()
+        cos = cos.expand(self.num_heads, -1, -1, -1).contiguous()
+        return sin, cos
+
+
+def rotate_every_two(x):
+    if x.shape[-1] % 2 != 0:
+        x = F.pad(x, (0, 1), mode='constant', value=0)
+        pad = True
+    else:
+        pad = False
+
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    out = torch.stack((-x2, x1), -1).reshape(*x.shape[:-1], -1)
+
+    return out[..., :x.shape[-1] - 1] if pad else out
+
+
+def theta_shift(x, sin, cos):
+    if sin.shape[-1] < x.shape[-1]:
+        pad = x.shape[-1] - sin.shape[-1]
+        sin = F.pad(sin, (0, pad), mode='constant', value=0)
+        cos = F.pad(cos, (0, pad), mode='constant', value=1)
+    elif sin.shape[-1] > x.shape[-1]:
+        sin = sin[..., :x.shape[-1]]
+        cos = cos[..., :x.shape[-1]]
+
+    return (x * cos) + (rotate_every_two(x) * sin)
+
+
+class OverlapWindowAttention(nn.Module):
+    def __init__(self, dim, num_heads=4, window_size=7, shift_size=3):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1)
+        self.proj = nn.Conv2d(dim, dim, kernel_size=1)
+
+    def forward(self, x, sin, cos):
+        B, C, H, W = x.shape
+        ws = self.window_size
+        pad_h = (ws - H % ws) % ws
+        pad_w = (ws - W % ws) % ws
+        x = F.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
+        H_pad, W_pad = x.shape[2], x.shape[3]
+
+        if self.shift_size > 0:
+            x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(2, 3))
+
+        qkv = self.qkv(x)
+        qkv = rearrange(qkv, 'b (m c) h w -> m b c h w', m=3)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        q = q.view(B, self.num_heads, self.head_dim, H_pad, W_pad)
+        k = k.view(B, self.num_heads, self.head_dim, H_pad, W_pad)
+        v = v.view(B, self.num_heads, self.head_dim, H_pad, W_pad)
+        q = theta_shift(q, sin, cos) * self.scale
+        k = theta_shift(k, sin, cos)
+
+        q = q.view(B, C, H_pad, W_pad)
+        k = k.view(B, C, H_pad, W_pad)
+        v = v.view(B, C, H_pad, W_pad)
+
+        q = rearrange(q, 'b c (h ws1) (w ws2) -> b (h w) (ws1 ws2) c', ws1=ws, ws2=ws)
+        k = rearrange(k, 'b c (h ws1) (w ws2) -> b (h w) (ws1 ws2) c', ws1=ws, ws2=ws)
+        v = rearrange(v, 'b c (h ws1) (w ws2) -> b (h w) (ws1 ws2) c', ws1=ws, ws2=ws)
+
+        B, num_windows, window_len, C_new = q.shape
+        assert C_new % self.num_heads == 0, f"C_new={C_new} is not divisible by num_heads={self.num_heads}"
+        head_dim_new = C_new // self.num_heads
+
+        q = q.view(B, num_windows, window_len, self.num_heads, head_dim_new).transpose(2, 3)
+        k = k.view(B, num_windows, window_len, self.num_heads, head_dim_new).transpose(2, 3)
+        v = v.view(B, num_windows, window_len, self.num_heads, head_dim_new).transpose(2, 3)
+
+        attn = torch.softmax(q @ k.transpose(-2, -1), dim=-1)
+        out = (attn @ v).transpose(2, 3).reshape(B, num_windows, window_len, self.num_heads * head_dim_new)
+        out = rearrange(out, 'b (h w) (ws1 ws2) c -> b c (h ws1) (w ws2)', h=H_pad // ws, ws1=ws, ws2=ws, w=W_pad // ws)
+
+        if self.shift_size > 0:
+            out = torch.roll(out, shifts=(self.shift_size, self.shift_size), dims=(2, 3))
+
+        out = out[:, :, :H, :W]
+        out = self.proj(out)
+        return out
+
+
+class ShallowFusionAttnBlock(nn.Module):
+    def __init__(self, dim, num_heads=4, window_size=7, shift_size=3):
+        super().__init__()
+        self.dim = dim
+        self.attn = OverlapWindowAttention(dim, num_heads=num_heads, window_size=window_size, shift_size=shift_size)
+        self.rope = RoPE(embed_dim=dim, num_heads=num_heads)
+        self.conv1 = nn.Conv2d(dim * 2, dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv2d(dim * 2, dim, kernel_size=3, padding=1)
+        self.vss = VSSBlock(dim)
+
+    def patch_unembed(self, x, h, w):
+        return x.transpose(1, 2).reshape(x.size(0), -1, h, w)
+
+    def patch_embed(self, x):
+        return x.flatten(2).transpose(1, 2)
+
+    def forward(self, I1, I2, h, w):
+        B, C, H, W = I1.shape
+
+        diff = torch.abs(I1 - I2)
+        H_pad = (self.attn.window_size - h % self.attn.window_size) % self.attn.window_size + h
+        W_pad = (self.attn.window_size - w % self.attn.window_size) % self.attn.window_size + w
+        sin, cos = self.rope((H_pad, W_pad))
+        diff_attn = self.attn(diff, sin, cos)
+        token_attn = self.patch_embed(diff_attn)  # [B, N, C]
+        I1_token = self.patch_embed(I1)
+        I2_token = self.patch_embed(I2)
+        I1 = I1_token + token_attn
+        I2 = I2_token + token_attn
+
+        I1_un = self.patch_unembed(I1, h, w)
+        I2_un = self.patch_unembed(I2, h, w)
+
+        I1_local = self.conv1(torch.cat([I1_un, I2_un], dim=1)) + I1_un
+        I2_local = self.conv2(torch.cat([I2_un, I1_un], dim=1)) + I2_un
+
+        I1_token = self.patch_embed(I1_local)
+        I2_token = self.patch_embed(I2_local)
+        vss_feat_1 = self.vss(I1_token, (h, w)).transpose(1, 2).view(B, C, h, w)
+        vss_feat_2 = self.vss(I2_token, (h, w)).transpose(1, 2).view(B, C, h, w)
+
+        I1_fuse = I1_local + vss_feat_1
+        I2_fuse = I2_local + vss_feat_2
+        return I1_fuse, I2_fuse
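
A quick shape sanity check for these blocks (not part of the commit; a minimal sketch assuming `mamba_ssm` is installed and a CUDA device is available, since `selective_scan_fn` runs on GPU only):

    import torch
    from models.Blocks import SAB, VSSBlock

    x = torch.randn(1, 64, 32, 32, device="cuda")   # [B, C, H, W]
    sab = SAB(kernel_size=7).cuda()
    print(sab(x).shape)                             # [1, 1, 32, 32] spatial attention map

    tokens = x.flatten(2).transpose(1, 2)           # [B, HW, C]
    vss = VSSBlock(hidden_dim=64).cuda()
    print(vss(tokens, (32, 32)).shape)              # [1, 1024, 64]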
models/STNR.py ADDED
@@ -0,0 +1,327 @@
+from __future__ import annotations
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monai.networks.blocks.convolutions import Convolution
+from monai.networks.blocks.segresnet_block import get_conv_layer, get_upsample_layer
+from monai.networks.layers.factories import Dropout
+from monai.networks.layers.utils import get_act_layer, get_norm_layer
+from monai.utils import UpsampleMode
+from einops import rearrange
+from models.mamba_customer import ConvMamba, M3, PatchEmbed, PatchUnEmbed
+from models.Blocks import CAB, SAB, VSSBlock, ShallowFusionAttnBlock
+import warnings
+warnings.filterwarnings("ignore")
+
+
+def get_dwconv_layer(
+    spatial_dims: int, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1,
+    bias: bool = False
+):
+    depth_conv = Convolution(spatial_dims=spatial_dims, in_channels=in_channels, out_channels=in_channels,
+                             strides=stride, kernel_size=kernel_size, bias=bias, conv_only=True, groups=in_channels)
+    point_conv = Convolution(spatial_dims=spatial_dims, in_channels=in_channels, out_channels=out_channels,
+                             strides=stride, kernel_size=1, bias=bias, conv_only=True, groups=1)
+    return torch.nn.Sequential(depth_conv, point_conv)
+
+
+class SRCMLayer(nn.Module):
+    def __init__(self, input_dim, output_dim, d_state=16, d_conv=4, expand=2, conv_mode='deepwise'):
+        super().__init__()
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        # NOTE: the same LayerNorm instance is reused before and after the Mamba branch
+        self.norm = nn.LayerNorm(input_dim)
+        self.convmamba = ConvMamba(
+            d_model=input_dim,
+            d_state=d_state,
+            d_conv=d_conv,
+            expand=expand,
+            bimamba_type="v2",
+            conv_mode=conv_mode
+        )
+        self.proj = nn.Linear(input_dim, output_dim)
+        self.skip_scale = nn.Parameter(torch.ones(1))
+
+    def forward(self, x):
+        if x.dtype == torch.float16:
+            x = x.type(torch.float32)
+        B, C = x.shape[:2]
+        assert C == self.input_dim
+        n_tokens = x.shape[2:].numel()
+        img_dims = x.shape[2:]
+        x_flat = x.reshape(B, C, n_tokens).transpose(-1, -2)
+        x_norm = self.norm(x_flat)
+        x_mamba = self.convmamba(x_norm) + self.skip_scale * x_flat
+        x_mamba = self.norm(x_mamba)
+        x_mamba = self.proj(x_mamba)
+        out = x_mamba.transpose(-1, -2).reshape(B, self.output_dim, *img_dims)
+        return out
+
+
+def get_srcm_layer(
+    spatial_dims: int, in_channels: int, out_channels: int, stride: int = 1, conv_mode: str = "deepwise"
+):
+    srcm_layer = SRCMLayer(input_dim=in_channels, output_dim=out_channels, conv_mode=conv_mode)
+    if stride != 1:
+        if spatial_dims == 2:
+            return nn.Sequential(srcm_layer, nn.MaxPool2d(kernel_size=stride, stride=stride))
+    return srcm_layer
+
+
+class SRCMBlock(nn.Module):
+
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        norm: tuple | str,
+        kernel_size: int = 3,
+        conv_mode: str = "deepwise",
+        act: tuple | str = ("RELU", {"inplace": True}),
+    ) -> None:
+        """
+        Args:
+            spatial_dims: number of spatial dimensions, could be 1, 2 or 3.
+            in_channels: number of input channels.
+            norm: feature normalization type and arguments.
+            kernel_size: convolution kernel size, the value should be an odd number. Defaults to 3.
+            act: activation type and arguments. Defaults to ``RELU``.
+        """
+
+        super().__init__()
+
+        if kernel_size % 2 != 1:
+            raise AssertionError("kernel_size should be an odd number.")
+        self.norm1 = get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=in_channels)
+        self.norm2 = get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=in_channels)
+        self.act = get_act_layer(act)
+        self.conv1 = get_srcm_layer(
+            spatial_dims, in_channels=in_channels, out_channels=in_channels, conv_mode=conv_mode
+        )
+        self.conv2 = get_srcm_layer(
+            spatial_dims, in_channels=in_channels, out_channels=in_channels, conv_mode=conv_mode
+        )
+
+    def forward(self, x):
+        identity = x
+
+        x = self.norm1(x)
+        x = self.act(x)
+        x = self.conv1(x)
+
+        x = self.norm2(x)
+        x = self.act(x)
+        x = self.conv2(x)
+
+        x += identity
+
+        return x
+
+
+class CSI(nn.Module):
+    def __init__(self, dim):
+        super(CSI, self).__init__()
+        self.shallow_fusion_attn = ShallowFusionAttnBlock(dim)
+        self.m3 = M3(dim)
+        self.vss = VSSBlock(hidden_dim=dim)
+        self.patch_embed = PatchEmbed(in_chans=dim, embed_dim=dim)
+        self.patch_unembed = PatchUnEmbed(in_chans=dim, embed_dim=dim)
+
+    def forward(self, I1, I2, h, w):
+        I1_fuse, I2_fuse = self.shallow_fusion_attn(I1, I2, h, w)
+        fusion = torch.abs(I1_fuse - I2_fuse)
+        I1_token = self.patch_embed(I1_fuse)
+        I2_token = self.patch_embed(I2_fuse)
+        fusion_token = self.patch_embed(fusion)
+        test_h, test_w = fusion.shape[2], fusion.shape[3]
+        fusion_token, _ = self.m3(I1_token, I2_token, fusion_token, test_h, test_w)
+        fusion_out = self.patch_unembed(fusion_token, (h, w))
+        return fusion_out
+
+
+class STNR(nn.Module):
+    def __init__(
+        self,
+        spatial_dims: int = 2,
+        init_filters: int = 16,
+        in_channels: int = 1,
+        out_channels: int = 2,
+        conv_mode: str = "deepwise",
+        local_query_model="orignal_dinner",
+        dropout_prob: float | None = None,
+        act: tuple | str = ("RELU", {"inplace": True}),
+        norm: tuple | str = ("GROUP", {"num_groups": 8}),
+        norm_name: str = "",
+        num_groups: int = 8,
+        use_conv_final: bool = True,
+        blocks_down: tuple = (1, 2, 2, 4),
+        blocks_up: tuple = (1, 1, 1),
+        mode: str = "",
+        up_mode="ResMamba",
+        up_conv_mode="deepwise",
+        resdiual=False,
+        stage=4,
+        diff_abs="later",
+        mamba_act="silu",
+        upsample_mode: UpsampleMode | str = UpsampleMode.NONTRAINABLE,
+    ):
+        super().__init__()
+
+        if spatial_dims not in (2, 3):
+            raise ValueError("`spatial_dims` can only be 2 or 3.")
+        self.mode = mode
+        self.stage = stage
+        self.up_conv_mode = up_conv_mode
+        self.mamba_act = mamba_act
+        self.resdiual = resdiual
+        self.up_mode = up_mode
+        self.diff_abs = diff_abs
+        self.conv_mode = conv_mode
+        self.local_query_model = local_query_model
+        self.spatial_dims = spatial_dims
+        self.init_filters = init_filters
+        self.channels_list = [self.init_filters, self.init_filters * 2, self.init_filters * 4, self.init_filters * 8]
+        self.in_channels = in_channels
+        self.blocks_down = blocks_down
+        self.blocks_up = blocks_up
+        self.dropout_prob = dropout_prob
+        self.act = act  # input options
+        self.act_mod = get_act_layer(act)
+        if norm_name:
+            if norm_name.lower() != "group":
+                raise ValueError(f"Deprecating option 'norm_name={norm_name}', please use 'norm' instead.")
+            norm = ("group", {"num_groups": num_groups})
+        self.norm = norm
+        self.upsample_mode = UpsampleMode(upsample_mode)
+        self.use_conv_final = use_conv_final
+        self.convInit = get_conv_layer(spatial_dims, in_channels, init_filters)
+        self.srcm_encoder_layers = self._make_srcm_encoder_layers()
+        self.srcm_decoder_layers, self.up_samples = self._make_srcm_decoder_layers(up_mode=self.up_mode)
+        self.conv_final = self._make_final_conv(out_channels)
+        self.fusion_blocks = nn.ModuleList(
+            [CSI(self.channels_list[i]) for i in range(self.stage)]
+        )
+        self.cab_layers = nn.ModuleList([
+            CAB(ch) for ch in self.channels_list[::-1][1:]
+        ])
+        self.sab_layers = nn.ModuleList([
+            SAB(kernel_size=7) for _ in range(len(self.blocks_up))
+        ])
+        self.conv_down_layers = nn.ModuleList([
+            nn.Conv2d(ch * 2, ch, kernel_size=1, stride=1, padding=0) for ch in self.channels_list[::-1][1:]
+        ])
+        if dropout_prob is not None:
+            self.dropout = Dropout[Dropout.DROPOUT, spatial_dims](dropout_prob)
+
+    def _make_srcm_encoder_layers(self):
+        srcm_encoder_layers = nn.ModuleList()
+        blocks_down, spatial_dims, filters, norm, conv_mode = (
+            self.blocks_down, self.spatial_dims, self.init_filters, self.norm, self.conv_mode)
+        for i, item in enumerate(blocks_down):
+            layer_in_channels = filters * 2 ** i
+            downsample_mamba = (
+                get_srcm_layer(spatial_dims, layer_in_channels // 2, layer_in_channels, stride=2, conv_mode=conv_mode)
+                if i > 0
+                else nn.Identity()
+            )
+            down_layer = nn.Sequential(
+                downsample_mamba,
+                *[SRCMBlock(spatial_dims, layer_in_channels, norm=norm, act=self.act, conv_mode=conv_mode) for _ in range(item)]
+            )
+            srcm_encoder_layers.append(down_layer)
+        return srcm_encoder_layers
+
+    def _make_srcm_decoder_layers(self, up_mode):
+        srcm_decoder_layers, up_samples = nn.ModuleList(), nn.ModuleList()
+        upsample_mode, blocks_up, spatial_dims, filters, norm = (
+            self.upsample_mode,
+            self.blocks_up,
+            self.spatial_dims,
+            self.init_filters,
+            self.norm,
+        )
+        if up_mode == 'SRCM':
+            Block_up = SRCMBlock
+        else:
+            raise ValueError(f"Unsupported up_mode '{up_mode}'; only 'SRCM' is implemented.")
+        n_up = len(blocks_up)
+        for i in range(n_up):
+            sample_in_channels = filters * 2 ** (n_up - i)
+            srcm_decoder_layers.append(
+                nn.Sequential(
+                    *[
+                        Block_up(spatial_dims, sample_in_channels // 2, norm=norm, act=self.act, conv_mode=self.up_conv_mode)
+                        for _ in range(blocks_up[i])
+                    ]
+                )
+            )
+            up_samples.append(
+                nn.Sequential(
+                    *[
+                        get_conv_layer(spatial_dims, sample_in_channels, sample_in_channels // 2, kernel_size=1),
+                        get_upsample_layer(spatial_dims, sample_in_channels // 2, upsample_mode=upsample_mode),
+                    ]
+                )
+            )
+        return srcm_decoder_layers, up_samples
+
+    def _make_final_conv(self, out_channels: int):
+        return nn.Sequential(
+            get_norm_layer(name=self.norm, spatial_dims=self.spatial_dims, channels=self.init_filters),
+            self.act_mod,
+            get_conv_layer(self.spatial_dims, self.init_filters, out_channels, kernel_size=1, bias=True),
+        )
+
+    def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        x = self.convInit(x)
+        if self.dropout_prob is not None:
+            x = self.dropout(x)
+        down_x = []
+
+        for down in self.srcm_encoder_layers:
+            x = down(x)
+            down_x.append(x)
+
+        return x, down_x
+
+    def decode(self, x: torch.Tensor, down_x: list[torch.Tensor]) -> torch.Tensor:
+        for i, (up, upl) in enumerate(zip(self.up_samples, self.srcm_decoder_layers)):
+            skip = down_x[i + 1]
+            x_up = up(x) + skip
+            x_cab = self.cab_layers[i](x_up) * x_up
+            x_sab = self.sab_layers[i](x_cab) * x_cab
+            x_srcm = upl(x_up)
+            combined_out = torch.cat([x_sab, x_srcm], dim=1)
+            final_out = self.conv_down_layers[i](combined_out)
+            x = final_out
+        if self.use_conv_final:
+            x = self.conv_final(x)
+        return x
+
+    def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+        b, c, h, w = x1.shape
+        x1, down_x1 = self.encode(x1)
+        x2, down_x2 = self.encode(x2)
+        down_x = []
+        for i in range(len(down_x1)):
+            x1_level, x2_level = down_x1[i], down_x2[i]
+            H_i, W_i = x1_level.shape[2], x1_level.shape[3]
+            if self.diff_abs == "later" and self.mode == "FUSION" and i < self.stage:
+                fusion = self.fusion_blocks[i](x1_level, x2_level, H_i, W_i)
+            else:
+                # fall back to a plain absolute difference of the two temporal features
+                fusion = torch.abs(x1_level - x2_level)
+            down_x.append(fusion)
+        down_x.reverse()
+        x = self.decode(down_x[0], down_x)
+        return x
+
+
+if __name__ == "__main__":
+    device = "cuda:0"
+    CDMamba = STNR(spatial_dims=2, in_channels=3, out_channels=2, init_filters=16, norm=("GROUP", {"num_groups": 8}),
+                   mode="FUSION", conv_mode='orignal', local_query_model="orignal_dinner",
+                   stage=4, mamba_act="silu", up_mode="SRCM", up_conv_mode='deepwise', blocks_down=(1, 2, 2, 4), blocks_up=(1, 1, 1),
+                   resdiual=False, diff_abs="later").to(device)
+    x = torch.randn(1, 3, 256, 256).to(device)
+    y = CDMamba(x, x)
+    print(y.shape)
models/__init__.py ADDED
@@ -0,0 +1,13 @@
+from .resnet import *
+import logging
+logger = logging.getLogger('base')
+
+
+def create_CD_model(opt):
+    # Our CDMamba model
+    from models.STNR import STNR as stnr
+
+    if opt['model']['name'] == 'STNR':
+        cd_model = stnr(spatial_dims=opt['model']['spatial_dims'], in_channels=opt['model']['in_channels'],
+                        init_filters=opt['model']['init_filters'], out_channels=opt['model']['n_classes'],
+                        mode=opt['model']['mode'], conv_mode=opt['model']['conv_mode'], up_mode=opt['model']['up_mode'],
+                        up_conv_mode=opt['model']['up_conv_mode'], norm=opt['model']['norm'],
+                        blocks_down=opt['model']['blocks_down'], blocks_up=opt['model']['blocks_up'],
+                        resdiual=opt['model']['resdiual'], diff_abs=opt['model']['diff_abs'], stage=opt['model']['stage'],
+                        mamba_act=opt['model']['mamba_act'], local_query_model=opt['model']['local_query_model'])
+        return cd_model
+    raise NotImplementedError(f"CD model [{opt['model']['name']}] is not implemented.")
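
`create_CD_model` reads every hyperparameter from a nested option dict. A hypothetical minimal `opt` for illustration (values mirror the `__main__` demo in models/STNR.py; the dict itself is not a shipped config):

    opt = {"model": {
        "name": "STNR", "spatial_dims": 2, "in_channels": 3, "n_classes": 2,
        "init_filters": 16, "mode": "FUSION", "conv_mode": "orignal",
        "up_mode": "SRCM", "up_conv_mode": "deepwise",
        "norm": ("GROUP", {"num_groups": 8}),
        "blocks_down": (1, 2, 2, 4), "blocks_up": (1, 1, 1),
        "resdiual": False, "diff_abs": "later", "stage": 4,
        "mamba_act": "silu", "local_query_model": "orignal_dinner",
    }}

    from models import create_CD_model
    model = create_CD_model(opt)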
models/loss.py ADDED
@@ -0,0 +1,155 @@
+import torch
+import torch.nn as nn
+from torch import Tensor, einsum
+import torch.nn.functional as F
+from misc.torchutils import class2one_hot, simplex
+from models.darnet_help.loss_help import FocalLoss, dernet_dice_loss
+
+
+def cross_entropy(input, target, weight=None, reduction='mean', ignore_index=255):
+    """
+    logSoftmax with loss
+    :param input: torch.Tensor, N*C*H*W
+    :param target: torch.Tensor, N*1*H*W or N*H*W
+    :param weight: torch.Tensor, C
+    :return: torch.Tensor [0]
+    """
+    target = target.long()
+    if target.dim() == 4:
+        target = torch.squeeze(target, dim=1)
+    if input.shape[-1] != target.shape[-1]:
+        input = F.interpolate(input, size=target.shape[1:], mode='bilinear', align_corners=True)
+
+    return F.cross_entropy(input=input, target=target, weight=weight,
+                           ignore_index=ignore_index, reduction=reduction)
+
+
+def dice_loss(predicts, target, weight=None):
+    idc = [0, 1]
+    probs = torch.softmax(predicts, dim=1)
+    target = class2one_hot(target, 7)
+    assert simplex(probs) and simplex(target)
+
+    pc = probs[:, idc, ...].type(torch.float32)
+    tc = target[:, idc, ...].type(torch.float32)
+    intersection: Tensor = einsum("bcwh,bcwh->bc", pc, tc)
+    union: Tensor = (einsum("bkwh->bk", pc) + einsum("bkwh->bk", tc))
+
+    divided: Tensor = torch.ones_like(intersection) - (2 * intersection + 1e-10) / (union + 1e-10)
+
+    loss = divided.mean()
+    return loss
+
+
+def ce_dice(input, target, weight=None):
+    ce_loss = cross_entropy(input, target)
+    dice_loss_ = dice_loss(input, target)
+    loss = 0.5 * ce_loss + 0.5 * dice_loss_
+    return loss
+
+
+def dice(input, target, weight=None):
+    dice_loss_ = dice_loss(input, target)
+    return dice_loss_
+
+
+def ce2_dice1(input, target, weight=None):
+    ce_loss = cross_entropy(input, target)
+    dice_loss_ = dice_loss(input, target)
+    loss = ce_loss + 0.5 * dice_loss_
+    return loss
+
+
+def ce1_dice2(input, target, weight=None):
+    ce_loss = cross_entropy(input, target)
+    dice_loss_ = dice_loss(input, target)
+    loss = 0.5 * ce_loss + dice_loss_
+    return loss
+
+
+def ce_scl(input, target, weight=None):
+    ce_loss = cross_entropy(input, target)
+    dice_loss_ = dice_loss(input, target)
+    loss = 0.5 * ce_loss + 0.5 * dice_loss_
+    return loss
+
+
+def weighted_BCE_logits(logit_pixel, truth_pixel, weight_pos=0.25, weight_neg=0.75):
+    logit = logit_pixel.view(-1)
+    truth = truth_pixel.view(-1)
+    assert logit.shape == truth.shape
+
+    loss = F.binary_cross_entropy_with_logits(logit.float(), truth.float(), reduction='none')
+
+    pos = (truth > 0.5).float()
+    neg = (truth < 0.5).float()
+    pos_num = pos.sum().item() + 1e-12
+    neg_num = neg.sum().item() + 1e-12
+    loss = (weight_pos * pos * loss / pos_num + weight_neg * neg * loss / neg_num).sum()
+
+    return loss
+
+
+class ChangeSimilarity(nn.Module):
+    """input: x1, x2 multi-class predictions, c = class_num
+    label_change: changed part
+    """
+
+    def __init__(self, reduction='mean'):
+        super(ChangeSimilarity, self).__init__()
+        self.loss_f = nn.CosineEmbeddingLoss(margin=0., reduction=reduction)
+
+    def forward(self, x1, x2, label_change):
+        b, c, h, w = x1.size()
+        x1 = F.softmax(x1, dim=1)
+        x2 = F.softmax(x2, dim=1)
+        x1 = x1.permute(0, 2, 3, 1)
+        x2 = x2.permute(0, 2, 3, 1)
+        x1 = torch.reshape(x1, [b * h * w, c])
+        x2 = torch.reshape(x2, [b * h * w, c])
+
+        label_unchange = ~label_change.bool()
+        target = label_unchange.float()
+        target = target - label_change.float()  # unchanged -> 1, changed -> -1
+        target = torch.reshape(target, [b * h * w])
+
+        loss = self.loss_f(x1, x2, target)
+        return loss
+
+
+def hybrid_loss(predictions, target, weight=(0.2, 0.2, 0.2, 0.2, 0.2, 0.2)):
+    """Weighted sum of CE + Dice over all prediction heads."""
+    loss = 0
+
+    # gamma=0, alpha=None --> CE
+    # focal = FocalLoss(gamma=0, alpha=None)
+    # ssim = SSIM()
+
+    for i, prediction in enumerate(predictions):
+        bce = cross_entropy(prediction, target)
+        dice = dice_loss(prediction, target)
+        # ssimloss = ssim(prediction, target)
+        loss += weight[i] * (bce + dice)  # - ssimloss
+
+    return loss
+
+
+class BCL(nn.Module):
+    """
+    Batch-balanced contrastive loss.
+    no-change: 1
+    change: -1
+    """
+
+    def __init__(self, margin=2.0):
+        super(BCL, self).__init__()
+        self.margin = margin
+
+    def forward(self, distance, label):
+        label[label == 1] = -1
+        label[label == 0] = 1
+
+        mask = (label != 255).float()
+        distance = distance * mask
+
+        pos_num = torch.sum((label == 1).float()) + 0.0001
+        neg_num = torch.sum((label == -1).float()) + 0.0001
+
+        loss_1 = torch.sum((1 + label) / 2 * torch.pow(distance, 2)) / pos_num
+        loss_2 = torch.sum((1 - label) / 2 *
+                           torch.pow(torch.clamp(self.margin - distance, min=0.0), 2)
+                           ) / neg_num
+        loss = loss_1 + loss_2
+        return loss
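
A small smoke test for the self-contained losses (a sketch, not part of the commit; it assumes the repo's `misc.torchutils` and `models.darnet_help` helpers are importable, since models/loss.py imports them at module load, and `dice_loss` hard-codes 7 classes via `class2one_hot`, so only `cross_entropy` and `BCL` are exercised here):

    import torch
    from models.loss import cross_entropy, BCL

    logits = torch.randn(2, 2, 64, 64)                # [B, C, H, W] class scores
    target = torch.randint(0, 2, (2, 64, 64))         # [B, H, W] class indices
    print(cross_entropy(logits, target).item())

    distance = torch.rand(2, 64, 64)                  # e.g. a feature-distance map
    label = torch.randint(0, 2, (2, 64, 64)).float()  # 0 = no change, 1 = change
    print(BCL(margin=2.0)(distance, label).item())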
models/mamba_customer.py ADDED
@@ -0,0 +1,569 @@
1
+ # Copyright (c) 2023, Tri Dao, Albert Gu.
2
+ import numbers
3
+ from mamba_ssm.modules.mamba_simple import Mamba
4
+ import warnings
5
+ warnings.filterwarnings("ignore")
6
+
7
+ from timm.models.layers import DropPath, to_2tuple
8
+
9
+ import math
10
+ from typing import Optional
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from torch import Tensor
16
+
17
+ from einops import rearrange, repeat
18
+
19
+ try:
20
+ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
21
+ except ImportError:
22
+ causal_conv1d_fn, causal_conv1d_update = None
23
+
24
+ try:
25
+ from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj
26
+ except ImportError:
27
+ selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj = None, None, None, None, None
28
+
29
+ try:
30
+ from mamba_ssm.ops.triton.selective_state_update import selective_state_update
31
+ except ImportError:
32
+ selective_state_update = None
33
+
34
+ try:
35
+ from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
36
+ except ImportError:
37
+ RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
38
+
39
+ class LightweightModel(nn.Module):
40
+ def __init__(self, in_channels, out_channels):
41
+ super(LightweightModel, self).__init__()
42
+ self.depthwise_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, groups=in_channels)
43
+ self.pointwise_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
44
+
45
+ def forward(self, x):
46
+ x = self.depthwise_conv(x)
47
+ x = self.pointwise_conv(x)
48
+ return x
49
+
50
+
51
+ class ConvMamba(nn.Module):
52
+ def __init__(
53
+ self,
54
+ d_model,
55
+ d_state=16,
56
+ d_conv=4,
57
+ expand=2,
58
+ dt_rank="auto",
59
+ dt_min=0.001,
60
+ dt_max=0.1,
61
+ dt_init="random",
62
+ dt_scale=1.0,
63
+ dt_init_floor=1e-4,
64
+ conv_bias=True,
65
+ bias=False,
66
+ use_fast_path=True,
67
+ layer_idx=None,
68
+ device=None,
69
+ dtype=None,
70
+ bimamba_type="none",
71
+ conv_mode = "deepwise"
72
+ ):
73
+ factory_kwargs = {"device": device, "dtype": dtype}
74
+ super().__init__()
75
+ self.conv_mode = conv_mode
76
+ self.d_model = d_model
77
+ self.d_state = d_state
78
+ self.d_conv = d_conv
79
+ self.expand = expand
80
+ self.d_inner = int(self.expand * self.d_model)
81
+ self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
82
+ self.use_fast_path = use_fast_path
83
+ self.layer_idx = layer_idx
84
+ self.bimamba_type = bimamba_type
85
+
86
+ if self.conv_mode == "orignal":
87
+ self.local_relation = nn.Sequential(
88
+ nn.Conv2d(in_channels=self.d_model, out_channels=self.d_model, kernel_size=3, stride=1, padding=1),
89
+ nn.SiLU(),
90
+ nn.Conv2d(in_channels=self.d_model, out_channels=self.d_inner, kernel_size=3, stride=1, padding=1),
91
+ )
92
+ elif self.conv_mode == "orignal_1_5_dmodel":
93
+ self.local_relation = nn.Sequential(
94
+ nn.Conv2d(in_channels=self.d_model, out_channels=int(1.5*self.d_model), kernel_size=3, stride=1, padding=1),
95
+ nn.SiLU(),
96
+ nn.Conv2d(in_channels=int(1.5*self.d_model), out_channels=self.d_inner, kernel_size=3, stride=1, padding=1),
97
+ )
98
+ elif self.conv_mode == "orignal_dinner":
99
+ self.local_relation = nn.Sequential(
100
+ nn.Conv2d(in_channels=self.d_model, out_channels=self.d_inner, kernel_size=3, stride=1, padding=1),
101
+ nn.SiLU(),
102
+ nn.Conv2d(in_channels=self.d_inner, out_channels=self.d_inner, kernel_size=3, stride=1, padding=1),
103
+ )
104
+ elif self.conv_mode == "deepwise":
105
+ self.local_relation = nn.Sequential(
106
+ LightweightModel(in_channels=self.d_model, out_channels=self.d_model),
107
+ nn.SiLU(),
108
+ LightweightModel(in_channels=self.d_model, out_channels=self.d_inner),
109
+ )
110
+ elif self.conv_mode == "deepwise_dinner":
111
+ self.local_relation = nn.Sequential(
112
+ LightweightModel(in_channels=self.d_model, out_channels=self.d_inner),
113
+ nn.SiLU(),
114
+ LightweightModel(in_channels=self.d_inner, out_channels=self.d_inner),
115
+ )
116
+
117
+ self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
118
+
119
+ self.conv1d = nn.Conv1d(
120
+ in_channels=self.d_inner,
121
+ out_channels=self.d_inner,
122
+ bias=conv_bias,
123
+ kernel_size=d_conv,
124
+ groups=self.d_inner,
125
+ padding=d_conv - 1,
126
+ **factory_kwargs,
127
+ )
128
+
129
+ self.activation = "silu"
130
+ self.act = nn.SiLU()
131
+
132
+ self.x_proj = nn.Linear(
133
+ self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
134
+ )
135
+ self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
136
+
137
+ # Initialize special dt projection to preserve variance at initialization
138
+ dt_init_std = self.dt_rank**-0.5 * dt_scale
139
+ if dt_init == "constant":
140
+ nn.init.constant_(self.dt_proj.weight, dt_init_std)
141
+ elif dt_init == "random":
142
+ nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
143
+ else:
144
+ raise NotImplementedError
145
+
146
+ # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
147
+ dt = torch.exp(
148
+ torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
149
+ + math.log(dt_min)
150
+ ).clamp(min=dt_init_floor)
151
+ # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
152
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
153
+ with torch.no_grad():
154
+ self.dt_proj.bias.copy_(inv_dt)
155
+ # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
156
+ self.dt_proj.bias._no_reinit = True
157
+
158
+ # S4D real initialization
159
+ A = repeat(
160
+ torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
161
+ "n -> d n",
162
+ d=self.d_inner,
163
+ ).contiguous()
164
+ A_log = torch.log(A) # Keep A_log in fp32
165
+ self.A_log = nn.Parameter(A_log)
166
+ self.A_log._no_weight_decay = True
167
+
168
+ # D "skip" parameter
169
+ self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
170
+ self.D._no_weight_decay = True
171
+
172
+ # bidirectional
173
+ assert bimamba_type == "v2"
174
+
175
+ A_b = repeat(
176
+ torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
177
+ "n -> d n",
178
+ d=self.d_inner,
179
+ ).contiguous()
180
+ A_b_log = torch.log(A_b) # Keep A_b_log in fp32
181
+ self.A_b_log = nn.Parameter(A_b_log)
182
+ self.A_b_log._no_weight_decay = True
183
+
184
+ self.conv1d_b = nn.Conv1d(
185
+ in_channels=self.d_inner,
186
+ out_channels=self.d_inner,
187
+ bias=conv_bias,
188
+ kernel_size=d_conv,
189
+ groups=self.d_inner,
190
+ padding=d_conv - 1,
191
+ **factory_kwargs,
192
+ )
193
+
194
+ self.x_proj_b = nn.Linear(
195
+ self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
196
+ )
197
+ self.dt_proj_b = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
198
+
199
+ self.D_b = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
200
+ self.D_b._no_weight_decay = True
201
+
202
+ self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
203
+
204
+ def forward(self, hidden_states, inference_params=None):
205
+ """
206
+ hidden_states: (B, L, D)
207
+ Returns: same shape as hidden_states
208
+ """
209
+ batch, seqlen, dim = hidden_states.shape
210
+ h = int(math.sqrt(seqlen))
211
+
212
+ local_relation = self.local_relation(rearrange(hidden_states, "b (h w) d -> b d h w", h=h))
213
+ local_relation = rearrange(local_relation, "b d h w -> b d (h w)")
214
+
215
+ conv_state, ssm_state = None, None
216
+ if inference_params is not None:
217
+ conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
218
+ if inference_params.seqlen_offset > 0:
219
+ # The states are updated inplace
220
+ out, _, _ = self.step(hidden_states, conv_state, ssm_state)
221
+ return out
222
+
223
+ # We do matmul and transpose BLH -> HBL at the same time
224
+ xz = rearrange(
225
+ self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
226
+ "d (b l) -> b d l",
227
+ l=seqlen,
228
+ )
229
+ if self.in_proj.bias is not None:
230
+ xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
231
+
232
+ A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
233
+ # In the backward pass we write dx and dz next to each other to avoid torch.cat
234
+ if self.use_fast_path and inference_params is None: # Doesn't support outputting the states
235
+ if self.bimamba_type == "v2":
236
+ A_b = -torch.exp(self.A_b_log.float())
237
+ out = mamba_inner_fn_no_out_proj(
238
+ xz,
239
+ self.conv1d.weight,
240
+ self.conv1d.bias,
241
+ self.x_proj.weight,
242
+ self.dt_proj.weight,
243
+ A,
244
+ None, # input-dependent B
245
+ None, # input-dependent C
246
+ self.D.float(),
247
+ delta_bias=self.dt_proj.bias.float(),
248
+ delta_softplus=True,
249
+ )
250
+ out_b = mamba_inner_fn_no_out_proj(
251
+ xz.flip([-1]),
252
+ self.conv1d_b.weight,
253
+ self.conv1d_b.bias,
254
+ self.x_proj_b.weight,
255
+ self.dt_proj_b.weight,
256
+ A_b,
257
+ None,
258
+ None,
259
+ self.D_b.float(),
260
+ delta_bias=self.dt_proj_b.bias.float(),
261
+ delta_softplus=True,
262
+ )
263
+ # F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
264
+ out = F.linear(rearrange(out + out_b.flip([-1]) + local_relation, "b d l -> b l d"), self.out_proj.weight, self.out_proj.bias)
265
+ else:
266
+ out = mamba_inner_fn(
267
+ xz,
268
+ self.conv1d.weight,
269
+ self.conv1d.bias,
270
+ self.x_proj.weight,
271
+ self.dt_proj.weight,
272
+ self.out_proj.weight,
273
+ self.out_proj.bias,
274
+ A,
275
+ None, # input-dependent B
276
+ None, # input-dependent C
277
+ self.D.float(),
278
+ delta_bias=self.dt_proj.bias.float(),
279
+ delta_softplus=True,
280
+ )
281
+ else:
282
+ x, z = xz.chunk(2, dim=1)
283
+ # Compute short convolution
284
+ if conv_state is not None:
285
+ conv_state.copy_(x[:, :, -self.d_conv :]) # Update state (B D W)
286
+ if causal_conv1d_fn is None:
287
+ x = self.act(self.conv1d(x)[..., :seqlen])
288
+ else:
289
+ assert self.activation in ["silu", "swish"]
290
+ x = causal_conv1d_fn(
291
+ x,
292
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
293
+ self.conv1d.bias,
294
+ self.activation,
295
+ )
296
+
297
+ # We're careful here about the layout, to avoid extra transposes.
298
+ # We want dt to have d as the slowest moving dimension
299
+ # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
300
+ x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
301
+ dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
302
+ dt = self.dt_proj.weight @ dt.t()
303
+ dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
304
+ B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
305
+ C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
306
+ assert self.activation in ["silu", "swish"]
307
+ y = selective_scan_fn(
308
+ x,
309
+ dt,
310
+ A,
311
+ B,
312
+ C,
313
+ self.D.float(),
314
+ z=z,
315
+ delta_bias=self.dt_proj.bias.float(),
316
+ delta_softplus=True,
317
+ return_last_state=ssm_state is not None,
318
+ )
319
+ if ssm_state is not None:
320
+ y, last_state = y
321
+ ssm_state.copy_(last_state)
322
+ y = rearrange(y, "b d l -> b l d")
323
+ out = self.out_proj(y)
324
+ return out
325
+
326
+ def step(self, hidden_states, conv_state, ssm_state):
327
+ dtype = hidden_states.dtype
328
+ assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
329
+ xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
330
+ x, z = xz.chunk(2, dim=-1) # (B D)
331
+
332
+ # Conv step
333
+ if causal_conv1d_update is None:
334
+ conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
335
+ conv_state[:, :, -1] = x
336
+ x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
337
+ if self.conv1d.bias is not None:
338
+ x = x + self.conv1d.bias
339
+ x = self.act(x).to(dtype=dtype)
340
+ else:
341
+ x = causal_conv1d_update(
342
+ x,
343
+ conv_state,
344
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
345
+ self.conv1d.bias,
346
+ self.activation,
347
+ )
348
+
349
+ x_db = self.x_proj(x) # (B dt_rank+2*d_state)
350
+ dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
351
+ # Don't add dt_bias here
352
+ dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
353
+ A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
354
+
355
+ # SSM step
356
+ if selective_state_update is None:
357
+ # Discretize A and B
358
+ dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
359
+ dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
360
+ dB = torch.einsum("bd,bn->bdn", dt, B)
361
+ ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
362
+ y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
363
+ y = y + self.D.to(dtype) * x
364
+ y = y * self.act(z) # (B D)
365
+ else:
366
+ y = selective_state_update(
367
+ ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
368
+ )
369
+
370
+ out = self.out_proj(y)
371
+ return out.unsqueeze(1), conv_state, ssm_state
372
+
373
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
374
+ device = self.out_proj.weight.device
375
+ conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
376
+ conv_state = torch.zeros(
377
+ batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
378
+ )
379
+ ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
380
+ # ssm_dtype = torch.float32
381
+ ssm_state = torch.zeros(
382
+ batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
383
+ )
384
+ return conv_state, ssm_state
385
+
386
+ def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
387
+ assert self.layer_idx is not None
388
+ if self.layer_idx not in inference_params.key_value_memory_dict:
389
+ batch_shape = (batch_size,)
390
+ conv_state = torch.zeros(
391
+ batch_size,
392
+ self.d_model * self.expand,
393
+ self.d_conv,
394
+ device=self.conv1d.weight.device,
395
+ dtype=self.conv1d.weight.dtype,
396
+ )
397
+ ssm_state = torch.zeros(
398
+ batch_size,
399
+ self.d_model * self.expand,
400
+ self.d_state,
401
+ device=self.dt_proj.weight.device,
402
+ dtype=self.dt_proj.weight.dtype,
403
+ # dtype=torch.float32,
404
+ )
405
+ inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
406
+ else:
407
+ conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
408
+ # TODO: What if batch size changes between generation, and we reuse the same states?
409
+ if initialize_states:
410
+ conv_state.zero_()
411
+ ssm_state.zero_()
412
+ return conv_state, ssm_state
413
+
+ 
+ def to_3d(x):
+     return rearrange(x, 'b c h w -> b (h w) c')
+ 
+ 
+ def to_4d(x, h, w):
+     return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
+ 
+ 
+ class WithBias_LayerNorm(nn.Module):
+     def __init__(self, normalized_shape):
+         super(WithBias_LayerNorm, self).__init__()
+         if isinstance(normalized_shape, numbers.Integral):
+             normalized_shape = (normalized_shape,)
+         normalized_shape = torch.Size(normalized_shape)
+         self.weight = nn.Parameter(torch.ones(normalized_shape))
+         self.bias = nn.Parameter(torch.zeros(normalized_shape))
+ 
+     def forward(self, x):
+         mu = x.mean(-1, keepdim=True)
+         sigma = x.var(-1, keepdim=True, unbiased=False)
+         return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias
+ 
+ 
+ class BiasFree_LayerNorm(nn.Module):
+     def __init__(self, normalized_shape):
+         super(BiasFree_LayerNorm, self).__init__()
+         if isinstance(normalized_shape, numbers.Integral):
+             normalized_shape = (normalized_shape,)
+         normalized_shape = torch.Size(normalized_shape)
+         self.weight = nn.Parameter(torch.ones(normalized_shape))
+ 
+     def forward(self, x):
+         sigma = x.var(-1, keepdim=True, unbiased=False)
+         return x / torch.sqrt(sigma + 1e-5) * self.weight
+ 
+ 
+ class LayerNorm(nn.Module):
+     def __init__(self, dim, norm_type='with_bias'):
+         super(LayerNorm, self).__init__()
+         if norm_type == 'BiasFree':
+             self.body = BiasFree_LayerNorm(dim)
+         else:
+             self.body = WithBias_LayerNorm(dim)
+ 
+     def forward(self, x):
+         if len(x.shape) == 4:
+             h, w = x.shape[-2:]
+             return to_4d(self.body(to_3d(x)), h, w)
+         else:
+             return self.body(x)
+ 
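# Usage sketch for the LayerNorm wrapper above (torch is assumed imported, as
# at the top of this file): 4D feature maps are flattened, normalized over the
# channel dimension, and reshaped back; 3D token sequences pass through directly.
norm = LayerNorm(dim=32, norm_type='with_bias')
x4 = torch.randn(2, 32, 16, 16)     # [B, C, H, W]
x3 = torch.randn(2, 256, 32)        # [B, HW, C]
assert norm(x4).shape == x4.shape
assert norm(x3).shape == x3.shape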
+ 
+ class M3(nn.Module):
+     def __init__(self, dim):
+         super(M3, self).__init__()
+         self.multi_modal_mamba_block = Mamba(dim, bimamba_type="m3")
+         self.norm1 = LayerNorm(dim, 'with_bias')  # fusion
+         self.norm2 = LayerNorm(dim, 'with_bias')  # I2
+         self.norm3 = LayerNorm(dim, 'with_bias')  # I1
+         self.dwconv = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
+ 
+     def forward(self, I1, I2, fusion, test_h, test_w):
+         fusion = self.norm1(fusion)
+         I2 = self.norm2(I2)
+         I1 = self.norm3(I1)
+         global_f = self.multi_modal_mamba_block(fusion, extra_emb1=I2, extra_emb2=I1)  # [B, HW, C]
+         B, HW, C = global_f.shape
+         fusion = global_f.transpose(1, 2).view(B, C, test_h, test_w)
+         fusion = (self.dwconv(fusion) + fusion).flatten(2).transpose(1, 2)
+         return fusion, None
+ 
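# A hedged usage sketch for M3. It assumes the custom Mamba (bimamba_type="m3")
# from models/mamba_customer.py and its CUDA kernels are available, so it only
# runs when a GPU is present. I1/I2 are the two source token streams and
# `fused` the running fused stream, all [B, H*W, C]; test_h/test_w restore the
# spatial layout for the depthwise conv.
if torch.cuda.is_available():
    m3 = M3(dim=64).cuda()
    I1 = torch.randn(2, 16 * 16, 64, device='cuda')
    I2 = torch.randn(2, 16 * 16, 64, device='cuda')
    fused = torch.randn(2, 16 * 16, 64, device='cuda')
    out, _ = m3(I1, I2, fused, test_h=16, test_w=16)    # out: [2, 256, 64]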
+ 
+ class PatchEmbed(nn.Module):
+     def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+         super(PatchEmbed, self).__init__()
+         img_size = to_2tuple(img_size)
+         patch_size = to_2tuple(patch_size)
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
+         self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
+         self.in_chans = in_chans
+         self.embed_dim = embed_dim
+         self.norm = norm_layer(embed_dim) if norm_layer is not None else None
+ 
+     def forward(self, x):
+         # x: [B, C, H, W]
+         x = x.flatten(2).transpose(1, 2)  # [B, N, C]
+         if self.norm is not None:
+             x = self.norm(x)
+         return x
+ 
+ 
+ class PatchUnEmbed(nn.Module):
+     def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+         super(PatchUnEmbed, self).__init__()
+         img_size = to_2tuple(img_size)
+         patch_size = to_2tuple(patch_size)
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
+         self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
+         self.in_chans = in_chans
+         self.embed_dim = embed_dim
+ 
+     def forward(self, x, x_size):
+         B, HW, C = x.shape
+         x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])
+         return x
+ 
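# Round-trip sketch: PatchEmbed flattens [B, C, H, W] into tokens [B, H*W, C]
# and PatchUnEmbed restores the map given the spatial size (sizes illustrative;
# with norm_layer=None the round trip is exact).
embed = PatchEmbed(embed_dim=96)
unembed = PatchUnEmbed(embed_dim=96)
feat = torch.randn(2, 96, 14, 14)
tokens = embed(feat)                                # [2, 196, 96]
assert torch.equal(unembed(tokens, (14, 14)), feat)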
+ 
+ 
+ class Block(nn.Module):
+     def __init__(
+         self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
+     ):
+         """
+         Simple block wrapping a mixer class with LayerNorm/RMSNorm and a residual connection.
+ 
+         This Block has a slightly different structure compared to a regular
+         prenorm Transformer block.
+         The standard block is: LN -> MHA/MLP -> Add.
+         [Ref: https://arxiv.org/abs/2002.04745]
+         Here we have: Add -> LN -> Mixer, returning both
+         the hidden_states (output of the mixer) and the residual.
+         This is purely for performance reasons, as we can fuse add and LayerNorm.
+         The residual needs to be provided (except for the very first block).
+         """
+         super().__init__()
+         self.residual_in_fp32 = residual_in_fp32
+         self.fused_add_norm = fused_add_norm
+         self.mixer = mixer_cls(dim)
+         self.norm = norm_cls(dim)
+         if self.fused_add_norm:
+             assert RMSNorm is not None, "RMSNorm import fails"
+             assert isinstance(
+                 self.norm, (nn.LayerNorm, RMSNorm)
+             ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
+ 
+     def forward(
+         self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
+     ):
+         r"""Pass the input through the encoder layer.
+ 
+         Args:
+             hidden_states: the sequence to the encoder layer (required).
+             residual: hidden_states = Mixer(LN(residual))
+         """
+         if not self.fused_add_norm:
+             residual = (hidden_states + residual) if residual is not None else hidden_states
+             hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
+             if self.residual_in_fp32:
+                 residual = residual.to(torch.float32)
+         else:
+             fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
+             hidden_states, residual = fused_add_norm_fn(
+                 hidden_states,
+                 self.norm.weight,
+                 self.norm.bias,
+                 residual=residual,
+                 prenorm=True,
+                 residual_in_fp32=self.residual_in_fp32,
+                 eps=self.norm.eps,
+             )
+         hidden_states = self.mixer(hidden_states, inference_params=inference_params)
+         return hidden_states, residual
+ 
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
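# A minimal sketch of the Add -> LN -> Mixer contract on the non-fused path.
# The mixer here is a toy linear layer standing in for Mamba, purely to show
# how (hidden_states, residual) thread through stacked Blocks.
class ToyMixer(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, inference_params=None):    # matches the mixer call signature
        return self.proj(x)

blocks = [Block(64, mixer_cls=ToyMixer) for _ in range(2)]
hidden, residual = torch.randn(2, 10, 64), None
for blk in blocks:
    hidden, residual = blk(hidden, residual)        # residual carries the running sum
final = hidden + residual                           # callers typically add + norm once at the end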
models/resnet.py ADDED
@@ -0,0 +1,358 @@
+ import torch
+ import torch.nn as nn
+ from torch.hub import load_state_dict_from_url
+ 
+ 
+ __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+            'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
+            'wide_resnet50_2', 'wide_resnet101_2']
+ 
+ 
+ model_urls = {
+     'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+     'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+     'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+     'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+     'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+     'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
+     'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
+     'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
+     'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
+ }
+ 
+ 
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+     """3x3 convolution with padding"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=dilation, groups=groups, bias=False, dilation=dilation)
+ 
+ 
+ def conv1x1(in_planes, out_planes, stride=1):
+     """1x1 convolution"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+ 
+ 
+ class BasicBlock(nn.Module):
+     expansion = 1
+ 
+     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                  base_width=64, dilation=1, norm_layer=None):
+         super(BasicBlock, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         if groups != 1 or base_width != 64:
+             raise ValueError('BasicBlock only supports groups=1 and base_width=64')
+         if dilation > 1:
+             # BasicBlock does not support dilation; clamp it instead of raising.
+             dilation = 1
+             # raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+         # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = norm_layer(planes)
+         self.relu = nn.ReLU(inplace=True)
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = norm_layer(planes)
+         self.downsample = downsample
+         self.stride = stride
+ 
+     def forward(self, x):
+         identity = x
+ 
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+ 
+         out = self.conv2(out)
+         out = self.bn2(out)
+ 
+         if self.downsample is not None:
+             identity = self.downsample(x)
+ 
+         out += identity
+         out = self.relu(out)
+ 
+         return out
+ 
+ 
+ class Bottleneck(nn.Module):
+     # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
+     # while the original implementation places the stride at the first 1x1 convolution (self.conv1)
+     # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
+     # This variant is also known as ResNet V1.5 and improves accuracy according to
+     # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+ 
+     expansion = 4
+ 
+     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                  base_width=64, dilation=1, norm_layer=None):
+         super(Bottleneck, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         width = int(planes * (base_width / 64.)) * groups
+         # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+         self.conv1 = conv1x1(inplanes, width)
+         self.bn1 = norm_layer(width)
+         self.conv2 = conv3x3(width, width, stride, groups, dilation)
+         self.bn2 = norm_layer(width)
+         self.conv3 = conv1x1(width, planes * self.expansion)
+         self.bn3 = norm_layer(planes * self.expansion)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = downsample
+         self.stride = stride
+ 
+     def forward(self, x):
+         identity = x
+ 
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+ 
+         out = self.conv2(out)
+         out = self.bn2(out)
+         out = self.relu(out)
+ 
+         out = self.conv3(out)
+         out = self.bn3(out)
+ 
+         if self.downsample is not None:
+             identity = self.downsample(x)
+ 
+         out += identity
+         out = self.relu(out)
+ 
+         return out
+ 
+ 
+ class ResNet(nn.Module):
+ 
+     def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
+                  groups=1, width_per_group=64, replace_stride_with_dilation=None,
+                  norm_layer=None, strides=None):
+         super(ResNet, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         self._norm_layer = norm_layer
+ 
+         # Strides for [conv1, maxpool, layer2, layer3, layer4]; defaults match torchvision.
+         self.strides = strides
+         if self.strides is None:
+             self.strides = [2, 2, 2, 2, 2]
+ 
+         self.inplanes = 64
+         self.dilation = 1
+         if replace_stride_with_dilation is None:
+             # each element in the tuple indicates if we should replace
+             # the 2x2 stride with a dilated convolution instead
+             replace_stride_with_dilation = [False, False, False]
+         if len(replace_stride_with_dilation) != 3:
+             raise ValueError("replace_stride_with_dilation should be None "
+                              "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
+         self.groups = groups
+         self.base_width = width_per_group
+         self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=self.strides[0], padding=3,
+                                bias=False)
+         self.bn1 = norm_layer(self.inplanes)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=self.strides[1], padding=1)
+         self.layer1 = self._make_layer(block, 64, layers[0])
+         self.layer2 = self._make_layer(block, 128, layers[1], stride=self.strides[2],
+                                        dilate=replace_stride_with_dilation[0])
+         self.layer3 = self._make_layer(block, 256, layers[2], stride=self.strides[3],
+                                        dilate=replace_stride_with_dilation[1])
+         self.layer4 = self._make_layer(block, 512, layers[3], stride=self.strides[4],
+                                        dilate=replace_stride_with_dilation[2])
+         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         self.fc = nn.Linear(512 * block.expansion, num_classes)
+ 
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                 nn.init.constant_(m.weight, 1)
+                 nn.init.constant_(m.bias, 0)
+ 
+         # Zero-initialize the last BN in each residual branch,
+         # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+         # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+         if zero_init_residual:
+             for m in self.modules():
+                 if isinstance(m, Bottleneck):
+                     nn.init.constant_(m.bn3.weight, 0)
+                 elif isinstance(m, BasicBlock):
+                     nn.init.constant_(m.bn2.weight, 0)
+ 
+     def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+         norm_layer = self._norm_layer
+         downsample = None
+         previous_dilation = self.dilation
+         if dilate:
+             self.dilation *= stride
+             stride = 1
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = nn.Sequential(
+                 conv1x1(self.inplanes, planes * block.expansion, stride),
+                 norm_layer(planes * block.expansion),
+             )
+ 
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
+                             self.base_width, previous_dilation, norm_layer))
+         self.inplanes = planes * block.expansion
+         for _ in range(1, blocks):
+             layers.append(block(self.inplanes, planes, groups=self.groups,
+                                 base_width=self.base_width, dilation=self.dilation,
+                                 norm_layer=norm_layer))
+ 
+         return nn.Sequential(*layers)
+ 
+     def _forward_impl(self, x):
+         # See note [TorchScript super()]
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+ 
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+ 
+         x = self.avgpool(x)
+         x = torch.flatten(x, 1)
+         x = self.fc(x)
+ 
+         return x
+ 
+     def forward(self, x):
+         return self._forward_impl(x)
+ 
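# A brief usage sketch of the `strides` hook this variant adds over torchvision:
# the five entries apply to [conv1, maxpool, layer2, layer3, layer4], so
# [1, 2, 2, 2, 2] keeps full resolution at the stem (values illustrative).
net = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=10, strides=[1, 2, 2, 2, 2])
x = torch.randn(1, 3, 224, 224)
print(net(x).shape)    # torch.Size([1, 10])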
+ 
+ 
+ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+     model = ResNet(block, layers, **kwargs)
+     if pretrained:
+         state_dict = load_state_dict_from_url(model_urls[arch],
+                                               progress=progress)
+         model.load_state_dict(state_dict)
+     return model
+ 
+ 
+ def resnet18(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-18 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
+                    **kwargs)
+ 
+ 
+ def resnet34(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-34 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
+                    **kwargs)
+ 
+ 
+ def resnet50(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-50 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
+                    **kwargs)
+ 
+ 
+ def resnet101(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-101 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
+                    **kwargs)
+ 
+ 
+ def resnet152(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-152 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
+                    **kwargs)
+ 
+ 
+ def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
+     r"""ResNeXt-50 32x4d model from
+     `"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['groups'] = 32
+     kwargs['width_per_group'] = 4
+     return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
+                    pretrained, progress, **kwargs)
+ 
+ 
+ def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
+     r"""ResNeXt-101 32x8d model from
+     `"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['groups'] = 32
+     kwargs['width_per_group'] = 8
+     return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
+                    pretrained, progress, **kwargs)
+ 
+ 
+ def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
+     r"""Wide ResNet-50-2 model from
+     `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+ 
+     The model is the same as ResNet except that the number of channels in the
+     bottleneck is twice as large in every block. The number of channels in the
+     outer 1x1 convolutions is unchanged, e.g. the last block in ResNet-50 has
+     2048-512-2048 channels, while in Wide ResNet-50-2 it has 2048-1024-2048.
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['width_per_group'] = 64 * 2
+     return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
+                    pretrained, progress, **kwargs)
+ 
+ 
+ def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
+     r"""Wide ResNet-101-2 model from
+     `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+ 
+     The model is the same as ResNet except that the number of channels in the
+     bottleneck is twice as large in every block. The number of channels in the
+     outer 1x1 convolutions is unchanged, e.g. the last block in ResNet-50 has
+     2048-512-2048 channels, while in Wide ResNet-50-2 it has 2048-1024-2048.
+ 
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['width_per_group'] = 64 * 2
+     return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
+                    pretrained, progress, **kwargs)
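# Quick sanity check of the doubled bottleneck width described in the
# docstrings above (no weights downloaded): with width_per_group=128, the last
# layer4 block computes with 1024 internal channels while its 1x1 output stays 2048.
w50 = wide_resnet50_2(pretrained=False)
last = w50.layer4[-1]
print(last.conv2.in_channels, last.conv2.out_channels)    # 1024 1024
print(last.conv3.out_channels)                            # 2048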