Spaces:

DraconicDragon
/

Kaloscope-artist-style-classifier

Running

App Files Files Community

DraconicDragon committed on Oct 19

Commit

6b5de5c

verified ·

1 Parent(s): c97f7c3

Upload 3 files

Browse files

Files changed (3) hide show

lsnet/lsnet.py +405 -0
lsnet/lsnet_artist.py +248 -0
lsnet/ska.py +61 -0

lsnet/lsnet.py ADDED Viewed

	@@ -0,0 +1,405 @@

+import torch
+import itertools
+from timm.models.vision_transformer import trunc_normal_
+from timm.layers import SqueezeExcite
+from timm.models import register_model
+from .ska import SKA
+from timm.models import build_model_with_cfg
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
class Conv2d_BN(torch.nn.Sequential):
    """Bias-free 2D convolution followed by batch normalization.

    The convolution is registered under the name ``c`` and the batch norm
    under ``bn``.

    Args:
        a: Number of input channels.
        b: Number of output channels.
        ks: Convolution kernel size.
        stride: Convolution stride.
        pad: Convolution padding.
        dilation: Convolution dilation.
        groups: Number of convolution groups.
        bn_weight_init: Constant used to initialize the batch-norm weight
            (the batch-norm bias is always initialized to 0).
    """

    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def fuse(self):
        """Fold the batch-norm running statistics into a single biased conv.

        Returns:
            A new ``torch.nn.Conv2d`` (with bias) created on the same device
            and with the same dtype as the original convolution weight.
        """
        conv, bn = self.c, self.bn
        std = (bn.running_var + bn.eps) ** 0.5
        # Per-output-channel scale applied by the batch norm.
        scale = bn.weight / std
        fused_weight = conv.weight * scale[:, None, None, None]
        fused_bias = bn.bias - bn.running_mean * bn.weight / std
        fused = torch.nn.Conv2d(
            fused_weight.size(1) * conv.groups,
            fused_weight.size(0),
            fused_weight.shape[2:],
            stride=conv.stride,
            padding=conv.padding,
            dilation=conv.dilation,
            groups=conv.groups,
            device=conv.weight.device,
            # Keep the source dtype so half/double models stay consistent.
            dtype=conv.weight.dtype,
        )
        fused.weight.data.copy_(fused_weight)
        fused.bias.data.copy_(fused_bias)
        return fused
class BN_Linear(torch.nn.Sequential):
    """1D batch normalization followed by a linear layer.

    The batch norm is registered under the name ``bn`` and the linear layer
    under ``l``.

    Args:
        a: Number of input features.
        b: Number of output features.
        bias: Whether the linear layer has a bias term.
        std: Standard deviation passed to ``trunc_normal_`` for the linear
            weight initialization.
    """

    def __init__(self, a, b, bias=True, std=0.02):
        super().__init__()
        self.add_module('bn', torch.nn.BatchNorm1d(a))
        self.add_module('l', torch.nn.Linear(a, b, bias=bias))
        trunc_normal_(self.l.weight, std=std)
        if bias:
            torch.nn.init.constant_(self.l.bias, 0)

    @torch.no_grad()
    def fuse(self):
        """Fold the batch-norm running statistics into a single linear layer.

        Returns:
            A new ``torch.nn.Linear`` (always with bias) created on the same
            device and with the same dtype as the original linear weight.
        """
        bn, linear = self.bn, self.l
        std = (bn.running_var + bn.eps) ** 0.5
        # Per-input-feature affine transform applied by the batch norm.
        scale = bn.weight / std
        shift = bn.bias - bn.running_mean * bn.weight / std
        fused_weight = linear.weight * scale[None, :]
        if linear.bias is None:
            fused_bias = shift @ linear.weight.T
        else:
            fused_bias = (linear.weight @ shift[:, None]).view(-1) + linear.bias
        fused = torch.nn.Linear(
            fused_weight.size(1),
            fused_weight.size(0),
            device=linear.weight.device,
            # Keep the source dtype so half/double models stay consistent.
            dtype=linear.weight.dtype,
        )
        fused.weight.data.copy_(fused_weight)
        fused.bias.data.copy_(fused_bias)
        return fused
class Residual(torch.nn.Module):
    """Residual wrapper ``x + m(x)`` with optional per-sample branch dropping.

    Args:
        m: Module applied on the residual branch.
        drop: Probability of zeroing the branch for a sample during training.
            Surviving samples are rescaled by ``1 / (1 - drop)``.
    """

    def __init__(self, m, drop=0.):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        """Return ``x`` plus the (possibly dropped) branch output."""
        branch = self.m(x)
        if self.training and self.drop > 0:
            # One random decision per sample, broadcast over all remaining
            # dimensions whatever the input rank is.
            mask_shape = (x.size(0),) + (1,) * (x.dim() - 1)
            keep = torch.rand(mask_shape, device=x.device)
            keep = keep.ge_(self.drop).div(1 - self.drop).detach()
            branch = branch * keep
        return x + branch
class FFN(torch.nn.Module):
    """Pointwise feed-forward block: 1x1 conv-BN, ReLU, 1x1 conv-BN.

    Args:
        ed: Number of input and output channels.
        h: Number of hidden channels.
    """

    def __init__(self, ed, h):
        super().__init__()
        self.pw1 = Conv2d_BN(ed, h)
        self.act = torch.nn.ReLU()
        # The second batch norm starts with weight 0.
        self.pw2 = Conv2d_BN(h, ed, bn_weight_init=0)

    def forward(self, x):
        """Expand to ``h`` channels, apply ReLU, and project back to ``ed``."""
        hidden = self.pw1(x)
        hidden = self.act(hidden)
        return self.pw2(hidden)
class Attention(torch.nn.Module):
    """Multi-head self-attention over a 2D feature map with learned biases.

    Queries, keys and values come from a single 1x1 ``Conv2d_BN``; queries
    additionally pass through a depthwise 3x3 ``Conv2d_BN``. A learned bias,
    indexed by the absolute (row, column) offset between two positions, is
    added to the attention logits.

    Args:
        dim: Number of input and output channels.
        key_dim: Channels per head for queries and keys.
        num_heads: Number of attention heads.
        attn_ratio: Ratio of value channels to ``key_dim`` per head.
        resolution: Side length of the square grid the bias table is built
            for; the bias index buffer has shape
            ``(resolution**2, resolution**2)``.
    """

    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        h = self.dh + nh_kd * 2
        self.qkv = Conv2d_BN(dim, h, ks=1)
        self.proj = torch.nn.Sequential(torch.nn.ReLU(), Conv2d_BN(
            self.dh, dim, bn_weight_init=0))
        self.dw = Conv2d_BN(nh_kd, nh_kd, 3, 1, 1, groups=nh_kd)

        # Map every pair of grid positions to the index of their absolute
        # offset, so position pairs with the same offset share one bias.
        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        """Switch mode and maintain the cached bias table ``ab``.

        In training mode the cache is discarded; in eval mode it is rebuilt
        from the current ``attention_biases``. The cache is a copy made at
        this point, so it is not updated if the parameter values change
        afterwards while staying in eval mode.

        Returns:
            ``self``, matching ``torch.nn.Module.train`` so that
            ``module.eval()`` / ``module.train()`` can be chained.
        """
        super().train(mode)
        if mode:
            if hasattr(self, 'ab'):
                del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]
        return self

    def _attention_bias(self):
        """Return the bias table to add to the attention logits."""
        if self.training:
            return self.attention_biases[:, self.attention_bias_idxs]
        cached = getattr(self, 'ab', None)
        ref = self.attention_biases
        # Rebuild the cache if the module was moved or cast after eval().
        if cached is None or cached.device != ref.device or cached.dtype != ref.dtype:
            cached = self.ab = ref[:, self.attention_bias_idxs].detach()
        return cached

    def forward(self, x):
        """Apply attention to ``x`` of shape ``(B, dim, H, W)``.

        ``H * W`` must equal ``resolution ** 2`` for the bias table to
        broadcast against the attention logits.
        """
        B, _, H, W = x.shape
        N = H * W
        qkv = self.qkv(x)
        q, k, v = qkv.view(B, -1, H, W).split([self.nh_kd, self.nh_kd, self.dh], dim=1)
        q = self.dw(q)
        q = q.view(B, self.num_heads, -1, N)
        k = k.view(B, self.num_heads, -1, N)
        v = v.view(B, self.num_heads, -1, N)
        attn = (q.transpose(-2, -1) @ k) * self.scale + self._attention_bias()
        attn = attn.softmax(dim=-1)
        x = (v @ attn.transpose(-2, -1)).reshape(B, -1, H, W)
        return self.proj(x)
class RepVGGDW(torch.nn.Module):
    """Depthwise block summing a 3x3 conv-BN, a 1x1 conv-BN and the input.

    Args:
        ed: Number of channels (both convolutions use ``groups=ed``).
    """

    def __init__(self, ed) -> None:
        super().__init__()
        self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
        self.conv1 = Conv2d_BN(ed, ed, 1, 1, 0, groups=ed)
        self.dim = ed

    def forward(self, x):
        """Return the sum of both conv branches and the identity branch."""
        return self.conv(x) + self.conv1(x) + x

    @torch.no_grad()
    def fuse(self):
        """Merge the three branches into one fused 3x3 convolution.

        Returns:
            The fused 3x3 convolution obtained from ``self.conv.fuse()``,
            with the 1x1 and identity branches added into its weight and bias.
        """
        pad = torch.nn.functional.pad
        fused3 = self.conv.fuse()
        fused1 = self.conv1.fuse()

        # Zero-pad the 1x1 kernel to 3x3 so it can be added to the 3x3 kernel.
        w1 = pad(fused1.weight, [1, 1, 1, 1])
        # The identity branch is a 3x3 kernel with a single 1 at its centre.
        centre = torch.ones(w1.shape[0], w1.shape[1], 1, 1, device=w1.device)
        identity = pad(centre, [1, 1, 1, 1])

        merged_w = fused3.weight + w1 + identity
        merged_b = fused3.bias + fused1.bias
        fused3.weight.data.copy_(merged_w)
        fused3.bias.data.copy_(merged_b)
        return fused3
+import torch.nn as nn
class LKP(nn.Module):
    """Predicts a per-position ``sks x sks`` kernel from the input feature map.

    Args:
        dim: Number of input channels.
        lks: Kernel size of the depthwise convolution ``cv2``.
        sks: Side length of the predicted kernels.
        groups: Divisor of ``dim`` giving the number of predicted kernel
            channels (``dim // groups``).
    """

    def __init__(self, dim, lks, sks, groups):
        super().__init__()
        half = dim // 2
        self.cv1 = Conv2d_BN(dim, half)
        self.act = nn.ReLU()
        self.cv2 = Conv2d_BN(half, half, ks=lks, pad=(lks - 1) // 2, groups=half)
        self.cv3 = Conv2d_BN(half, half)
        self.cv4 = nn.Conv2d(half, sks ** 2 * dim // groups, kernel_size=1)
        self.norm = nn.GroupNorm(num_groups=dim // groups, num_channels=sks ** 2 * dim // groups)
        self.sks = sks
        self.groups = groups
        self.dim = dim

    def forward(self, x):
        """Return weights of shape ``(b, dim // groups, sks**2, h, w)``."""
        feat = self.act(self.cv1(x))
        feat = self.cv2(feat)
        feat = self.act(self.cv3(feat))
        weights = self.norm(self.cv4(feat))
        batch, _, height, width = weights.size()
        return weights.view(batch, self.dim // self.groups, self.sks ** 2, height, width)
class LSConv(nn.Module):
    """Residual block applying ``SKA`` with kernels predicted by ``LKP``.

    Args:
        dim: Number of input and output channels.
    """

    def __init__(self, dim):
        super().__init__()
        self.lkp = LKP(dim, lks=7, sks=3, groups=8)
        self.ska = SKA()
        self.bn = nn.BatchNorm2d(dim)

    def forward(self, x):
        """Return ``bn(ska(x, lkp(x))) + x``."""
        kernels = self.lkp(x)
        aggregated = self.ska(x, kernels)
        return self.bn(aggregated) + x
class Block(torch.nn.Module):
    """One LSNet block: token mixer, optional squeeze-excite, residual FFN.

    Blocks at an even ``depth`` index use ``RepVGGDW`` followed by
    ``SqueezeExcite``. Blocks at an odd index skip squeeze-excite and use
    residual ``Attention`` in stage 3, or ``LSConv`` in every other stage.

    Args:
        ed: Number of channels.
        kd: Key dimension per attention head.
        nh: Number of attention heads.
        ar: Attention ratio forwarded to ``Attention``.
        resolution: Feature-map side length forwarded to ``Attention``.
        stage: Index of the stage this block belongs to.
        depth: Index of this block within its stage.
    """

    def __init__(self,
                 ed, kd, nh=8,
                 ar=4,
                 resolution=14,
                 stage=-1, depth=-1):
        super().__init__()
        if depth % 2 == 0:
            self.mixer = RepVGGDW(ed)
            self.se = SqueezeExcite(ed, 0.25)
        elif stage == 3:
            self.se = torch.nn.Identity()
            self.mixer = Residual(Attention(ed, kd, nh, ar, resolution=resolution))
        else:
            self.se = torch.nn.Identity()
            self.mixer = LSConv(ed)
        # The FFN hidden width is twice the block width.
        self.ffn = Residual(FFN(ed, int(ed * 2)))

    def forward(self, x):
        """Apply mixer, squeeze-excite (or identity) and the residual FFN."""
        mixed = self.mixer(x)
        gated = self.se(mixed)
        return self.ffn(gated)
class LSNet(torch.nn.Module):
    """Four-stage LSNet image classifier.

    The stem is three stride-2 ``Conv2d_BN`` layers. Each stage is a stack of
    ``Block`` modules; stages 2-4 start with a depthwise stride-2 conv-BN and
    a pointwise conv-BN that changes the channel count. Features are globally
    average-pooled and fed to a ``BN_Linear`` head.

    Args:
        img_size: Input image side length used to derive attention resolution.
        patch_size: Divisor of ``img_size`` giving the first-stage resolution.
        in_chans: Number of input image channels.
        num_classes: Number of output classes; ``0`` or less replaces the
            head(s) with ``Identity``.
        embed_dim: Channel width of each stage.
        key_dim: Attention key dimension of each stage.
        depth: Number of blocks in each stage.
        num_heads: Number of attention heads of each stage.
        distillation: If true, add a second head ``head_dist``.
        **kwargs: ``default_cfg``, ``pretrained_cfg`` and
            ``pretrained_cfg_overlay`` are stored as attributes when not
            ``None``; any remaining entries are kept in
            ``extra_init_kwargs``.
    """

    def __init__(self, img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=(64, 128, 192, 256),
                 key_dim=(16, 16, 16, 16),
                 depth=(1, 2, 3, 4),
                 num_heads=(4, 4, 4, 4),
                 distillation=False,
                 **kwargs):
        super().__init__()
        for cfg_name in ('default_cfg', 'pretrained_cfg', 'pretrained_cfg_overlay'):
            cfg_value = kwargs.pop(cfg_name, None)
            if cfg_value is not None:
                setattr(self, cfg_name, cfg_value)
        if kwargs:
            self.extra_init_kwargs = kwargs

        stem_dim = embed_dim[0]
        self.patch_embed = torch.nn.Sequential(
            Conv2d_BN(in_chans, stem_dim // 4, 3, 2, 1), torch.nn.ReLU(),
            Conv2d_BN(stem_dim // 4, stem_dim // 2, 3, 2, 1), torch.nn.ReLU(),
            Conv2d_BN(stem_dim // 2, stem_dim, 3, 2, 1),
        )

        resolution = img_size // patch_size
        attn_ratio = [ed / (kd * nh)
                      for ed, kd, nh in zip(embed_dim, key_dim, num_heads)]

        self.blocks1 = nn.Sequential()
        self.blocks2 = nn.Sequential()
        self.blocks3 = nn.Sequential()
        self.blocks4 = nn.Sequential()
        stages = [self.blocks1, self.blocks2, self.blocks3, self.blocks4]
        last_stage = len(depth) - 1
        for i, (ed, kd, dpth, nh, ar) in enumerate(
                zip(embed_dim, key_dim, depth, num_heads, attn_ratio)):
            for d in range(dpth):
                stages[i].append(Block(ed, kd, nh, ar, resolution, stage=i, depth=d))
            if i != last_stage:
                # The downsampling layers live at the start of the next stage.
                nxt = stages[i + 1]
                nxt.append(Conv2d_BN(embed_dim[i], embed_dim[i], ks=3, stride=2, pad=1, groups=embed_dim[i]))
                nxt.append(Conv2d_BN(embed_dim[i], embed_dim[i + 1], ks=1, stride=1, pad=0))
                resolution = (resolution - 1) // 2 + 1

        self.head = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
        self.distillation = distillation
        if distillation:
            self.head_dist = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
        self.num_classes = num_classes
        self.num_features = embed_dim[-1]

    @torch.jit.ignore # type: ignore
    def no_weight_decay(self):
        """Return the state-dict keys containing ``attention_biases``."""
        return {name for name in self.state_dict() if 'attention_biases' in name}

    def forward(self, x):
        """Return class logits for an image batch.

        With distillation enabled, training mode returns the tuple
        ``(head(x), head_dist(x))`` and eval mode returns their average.
        """
        x = self.patch_embed(x)
        for stage in (self.blocks1, self.blocks2, self.blocks3, self.blocks4):
            x = stage(x)
        x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
        if not self.distillation:
            return self.head(x)
        x = self.head(x), self.head_dist(x)
        if not self.training:
            x = (x[0] + x[1]) / 2
        return x
def _cfg(url='', **kwargs):
    """Build a pretrained-config dict for an LSNet variant.

    Args:
        url: Checkpoint URL stored under ``'url'``.
        **kwargs: Extra entries; they override the defaults below.

    Returns:
        A dict with input, normalization and layer-name metadata.
    """
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (4, 4),
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        # BN_Linear registers its linear layer as ``l``, so the classifier
        # parameters live under ``head.l`` / ``head_dist.l``.
        'first_conv': 'patch_embed.0.c', 'classifier': ('head.l', 'head_dist.l'),
        **kwargs
    }
+def _with_hf_hub(kwargs):
+    """兼容不同 timm 版本的 hf hub 配置字段"""
+    if 'hf_hub' in kwargs and 'hf_hub_id' not in kwargs:
+        kwargs['hf_hub_id'] = kwargs.pop('hf_hub')
+    return kwargs
# Pretrained configs per registered LSNet variant, keyed by model name.
default_cfgs = {
    'lsnet_t': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_t'})),
    'lsnet_t_distill': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_t_distill'})),
    'lsnet_s': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_s'})),
    'lsnet_s_distill': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_s_distill'})),
    'lsnet_b': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_b'})),
    'lsnet_b_distill': _cfg(**_with_hf_hub({'hf_hub': 'jameslahm/lsnet_b_distill'})),
}
def _create_lsnet(variant, pretrained=False, **kwargs):
    """Build an ``LSNet`` through ``build_model_with_cfg``.

    If ``variant`` has an entry in ``default_cfgs``, that entry is supplied
    as ``default_cfg`` and ``pretrained_cfg`` unless the caller already
    passed those keys.
    """
    variant_cfg = default_cfgs.get(variant)
    if variant_cfg is not None:
        for key in ('default_cfg', 'pretrained_cfg'):
            kwargs.setdefault(key, variant_cfg)
    return build_model_with_cfg(LSNet, variant, pretrained, **kwargs)
@register_model
def lsnet_t(num_classes=1000, distillation=False, pretrained=False, **kwargs):
    """Create the LSNet-T model (the ``_distill`` variant if ``distillation``)."""
    # NOTE(review): extra **kwargs are accepted but not forwarded to the model.
    suffix = "_distill" if distillation else ""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[64, 128, 256, 384],
        depth=[0, 2, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet(
        "lsnet_t" + suffix,
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        **arch,
    )
@register_model
def lsnet_s(num_classes=1000, distillation=False, pretrained=False, **kwargs):
    """Create the LSNet-S model (the ``_distill`` variant if ``distillation``)."""
    # NOTE(review): extra **kwargs are accepted but not forwarded to the model.
    suffix = "_distill" if distillation else ""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[96, 192, 320, 448],
        depth=[1, 2, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet(
        "lsnet_s" + suffix,
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        **arch,
    )
@register_model
def lsnet_b(num_classes=1000, distillation=False, pretrained=False, **kwargs):
    """Create the LSNet-B model (the ``_distill`` variant if ``distillation``)."""
    # NOTE(review): extra **kwargs are accepted but not forwarded to the model.
    suffix = "_distill" if distillation else ""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[128, 256, 384, 512],
        depth=[4, 6, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet(
        "lsnet_b" + suffix,
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        **arch,
    )
@register_model
def lsnet_t_distill(**kwargs):
    """Create LSNet-T with ``distillation`` forced to ``True``."""
    return lsnet_t(**{**kwargs, "distillation": True})
@register_model
def lsnet_s_distill(**kwargs):
    """Create LSNet-S with ``distillation`` forced to ``True``."""
    return lsnet_s(**{**kwargs, "distillation": True})
@register_model
def lsnet_b_distill(**kwargs):
    """Create LSNet-B with ``distillation`` forced to ``True``."""
    return lsnet_b(**{**kwargs, "distillation": True})

lsnet/lsnet_artist.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import torch
+import torch.nn as nn
+from .lsnet import LSNet, Conv2d_BN, BN_Linear
+from timm.models import register_model
+from timm.models import build_model_with_cfg
class LSNetArtist(LSNet):
    """LSNet variant that exposes a feature vector alongside class logits.

    The pooled backbone output optionally passes through a projection
    (``BN_Linear`` + ReLU) to ``feature_dim`` before the classification
    head(s), which are rebuilt on top of that feature width.

    Args:
        img_size, patch_size, in_chans, num_classes, embed_dim, key_dim,
        depth, num_heads, distillation: Forwarded to ``LSNet``.
        feature_dim: Width of the feature vector; defaults to
            ``embed_dim[-1]``.
        use_projection: Whether to add the projection layer. When false the
            features keep the backbone width ``embed_dim[-1]`` and
            ``feature_dim`` is ignored.
        **kwargs: Forwarded to ``LSNet``.
    """

    def __init__(self,
                 img_size=224,
                 patch_size=8,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=(64, 128, 256, 384),
                 key_dim=(16, 16, 16, 16),
                 depth=(0, 2, 8, 10),
                 num_heads=(3, 3, 3, 4),
                 distillation=False,
                 feature_dim=None,
                 use_projection=True,
                 **kwargs):
        default_cfg = kwargs.pop('default_cfg', None)
        pretrained_cfg = kwargs.pop('pretrained_cfg', None)
        pretrained_cfg_overlay = kwargs.pop('pretrained_cfg_overlay', None)
        super().__init__(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            num_classes=num_classes,
            embed_dim=embed_dim,
            key_dim=key_dim,
            depth=depth,
            num_heads=num_heads,
            distillation=distillation,
            default_cfg=default_cfg,
            pretrained_cfg=pretrained_cfg,
            pretrained_cfg_overlay=pretrained_cfg_overlay,
            **kwargs
        )
        backbone_dim = embed_dim[-1]
        self.use_projection = use_projection
        if feature_dim is None or not use_projection:
            # Without a projection the head consumes the pooled backbone
            # features directly, so the feature width is the backbone width.
            self.feature_dim = backbone_dim
        else:
            self.feature_dim = feature_dim

        # Map backbone features to a fixed feature width when it differs.
        if self.feature_dim != backbone_dim:
            self.projection = nn.Sequential(
                BN_Linear(backbone_dim, self.feature_dim),
                nn.ReLU(),
            )
        else:
            self.projection = nn.Identity()

        # Rebuild the classification head(s) on top of the feature width.
        if num_classes > 0:
            self.head = BN_Linear(self.feature_dim, num_classes)
            if distillation:
                self.head_dist = BN_Linear(self.feature_dim, num_classes)

    def forward_features(self, x):
        """Extract features without applying the classification head.

        Intended for clustering or feature extraction.
        """
        x = self.patch_embed(x)
        x = self.blocks1(x)
        x = self.blocks2(x)
        x = self.blocks3(x)
        x = self.blocks4(x)
        x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
        x = self.projection(x)
        return x

    def forward(self, x, return_features=False):
        """Return feature vectors or classification logits.

        Args:
            x: Input image batch.
            return_features: If true, return only the feature vectors of
                shape ``(batch_size, feature_dim)`` (for clustering);
                otherwise return logits of shape
                ``(batch_size, num_classes)`` (for classification).

        With distillation enabled, training mode returns the tuple
        ``(head(features), head_dist(features))`` and eval mode returns
        their average.
        """
        features = self.forward_features(x)
        if return_features:
            return features
        if not self.distillation:
            return self.head(features)
        x = self.head(features), self.head_dist(features)
        if not self.training:
            x = (x[0] + x[1]) / 2
        return x

    def get_features(self, x):
        """Extract feature vectors for ``x``."""
        return self.forward(x, return_features=True)

    def classify(self, x):
        """Classify ``x`` and return logits."""
        return self.forward(x, return_features=False)
+def _cfg_artist(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        'pool_size': (4, 4),
+        'crop_pct': .9,
+        'interpolation': 'bicubic',
+        'mean': (0.485, 0.456, 0.406),
+        'std': (0.229, 0.224, 0.225),
+        'first_conv': 'patch_embed.0.c',
+        'classifier': ('head.linear', 'head_dist.linear'),
+        **kwargs
+    }
# Pretrained configs per registered LSNetArtist variant, keyed by model name.
default_cfgs_artist = {
    'lsnet_t_artist': _cfg_artist(),
    'lsnet_s_artist': _cfg_artist(),
    'lsnet_b_artist': _cfg_artist(),
    'lsnet_l_artist': _cfg_artist(),
    'lsnet_xl_artist': _cfg_artist(),
}
def _create_lsnet_artist(variant, pretrained=False, **kwargs):
    """Build an ``LSNetArtist`` through ``build_model_with_cfg``.

    If ``variant`` has an entry in ``default_cfgs_artist``, that entry is
    supplied as ``default_cfg`` and ``pretrained_cfg`` unless the caller
    already passed those keys.
    """
    variant_cfg = default_cfgs_artist.get(variant)
    if variant_cfg is not None:
        for key in ('default_cfg', 'pretrained_cfg'):
            kwargs.setdefault(key, variant_cfg)
    return build_model_with_cfg(LSNetArtist, variant, pretrained, **kwargs)
@register_model
def lsnet_t_artist(num_classes=1000, distillation=False, pretrained=False,
                   feature_dim=None, use_projection=True, **kwargs):
    """Create the LSNetArtist-T model; extra ``kwargs`` reach the model builder."""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[64, 128, 256, 384],
        depth=[0, 2, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet_artist(
        "lsnet_t_artist",
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        feature_dim=feature_dim,
        use_projection=use_projection,
        **arch,
        **kwargs
    )
@register_model
def lsnet_s_artist(num_classes=1000, distillation=False, pretrained=False,
                   feature_dim=None, use_projection=True, **kwargs):
    """Create the LSNetArtist-S model; extra ``kwargs`` reach the model builder."""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[96, 192, 320, 448],
        depth=[1, 2, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet_artist(
        "lsnet_s_artist",
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        feature_dim=feature_dim,
        use_projection=use_projection,
        **arch,
        **kwargs
    )
@register_model
def lsnet_b_artist(num_classes=1000, distillation=False, pretrained=False,
                   feature_dim=None, use_projection=True, **kwargs):
    """Create the LSNetArtist-B model; extra ``kwargs`` reach the model builder."""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[128, 256, 384, 512],
        depth=[4, 6, 8, 10],
        num_heads=[3, 3, 3, 4],
    )
    return _create_lsnet_artist(
        "lsnet_b_artist",
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        feature_dim=feature_dim,
        use_projection=use_projection,
        **arch,
        **kwargs
    )
@register_model
def lsnet_l_artist(num_classes=1000, distillation=False, pretrained=False,
                   feature_dim=None, use_projection=True, **kwargs):
    """Create the LSNetArtist-L model; extra ``kwargs`` reach the model builder."""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[160, 320, 480, 640],
        depth=[6, 8, 12, 14],
        num_heads=[4, 4, 4, 4],
    )
    return _create_lsnet_artist(
        "lsnet_l_artist",
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        feature_dim=feature_dim,
        use_projection=use_projection,
        **arch,
        **kwargs
    )
@register_model
def lsnet_xl_artist(num_classes=1000, distillation=False, pretrained=False,
                    feature_dim=None, use_projection=True, **kwargs):
    """Create the LSNetArtist-XL model; extra ``kwargs`` reach the model builder."""
    arch = dict(
        img_size=224,
        patch_size=8,
        embed_dim=[192, 384, 576, 768],
        depth=[8, 12, 16, 20],
        num_heads=[6, 6, 6, 6],
    )
    return _create_lsnet_artist(
        "lsnet_xl_artist",
        pretrained=pretrained,
        num_classes=num_classes,
        distillation=distillation,
        feature_dim=feature_dim,
        use_projection=use_projection,
        **arch,
        **kwargs
    )

lsnet/ska.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import math
+import torch
+from torch.autograd import Function
+from torch.nn import functional as F
class PyTorchSkaFn(Function):
    """Pure-PyTorch spatially-varying kernel aggregation.

    For every output position, the ``ks x ks`` input patch around it is
    multiplied element-wise by that position's own kernel and summed.
    """

    @staticmethod
    def _aggregate(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        """Compute the aggregation with ordinary differentiable tensor ops.

        Args:
            x: Input of shape ``(n, ic, h, w)``.
            w: Kernels of shape ``(n, wc, ks*ks, h, w)``.

        Returns:
            Tensor of shape ``(n, ic, h, w)``.
        """
        # Kernel size and padding are derived from the weight tensor shape.
        ks = math.isqrt(w.shape[2])
        pad = (ks - 1) // 2
        n, ic, h, width = x.shape
        wc = w.shape[1]
        # unfold yields (n, ic*ks*ks, h*w): one flattened ks x ks patch per
        # output position; split the channel and kernel axes apart.
        patches = F.unfold(x, kernel_size=ks, padding=pad)
        patches = patches.view(n, ic, ks * ks, h * width)
        # reshape (not view) so non-contiguous kernels are accepted too.
        w = w.reshape(n, wc, ks * ks, h * width)
        if ic != wc:
            # Tile the kernel channels so input channel ci uses kernel
            # channel ci % wc.
            w = w.repeat(1, ic // wc, 1, 1)
        # Multiply patch and kernel element-wise, then sum over the ks*ks axis.
        return (patches * w).sum(dim=2).view(n, ic, h, width)

    @staticmethod
    def forward(ctx, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        """Return the aggregated output and save inputs for ``backward``."""
        ctx.save_for_backward(x, w)
        return PyTorchSkaFn._aggregate(x, w)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        """Return gradients for ``x`` and ``w`` by re-running the forward ops.

        Without this method any backward pass through the function raises.
        """
        x, w = ctx.saved_tensors
        need_x, need_w = ctx.needs_input_grad[:2]
        with torch.enable_grad():
            x_leaf = x.detach().requires_grad_(need_x)
            w_leaf = w.detach().requires_grad_(need_w)
            out = PyTorchSkaFn._aggregate(x_leaf, w_leaf)
        wanted = [t for t, need in ((x_leaf, need_x), (w_leaf, need_w)) if need]
        grads = iter(torch.autograd.grad(out, wanted, grad_output))
        grad_x = next(grads) if need_x else None
        grad_w = next(grads) if need_w else None
        return grad_x, grad_w
class SKA(torch.nn.Module):
    """Module wrapper that applies ``PyTorchSkaFn`` to an input and kernels."""

    def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        """Return ``PyTorchSkaFn.apply(x, w)``."""
        aggregated = PyTorchSkaFn.apply(x, w)  # type: ignore
        return aggregated