add model

Browse files

Files changed (5) hide show

.gitattributes +35 -0
README.md +190 -0
config.json +1 -0
model.py +797 -0
model.safetensors +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,190 @@

+---
+library_name: pytorch
+tags:
+- audio
+- spoofing-detection
+- anti-spoofing
+- wav2vec2
+- aasist
+license: apache-2.0
+pipeline_tag: audio-classification
+model-index:
+- name: spectra_aasist
+  results:
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: ASVspoof19_LA
+      type: ASVspoof19_LA
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 0.159
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: ASVspoof21_LA
+      type: ASVspoof21_LA
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 5.164
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: ASVspoof21_DF
+      type: ASVspoof21_DF
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 2.568
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: ASVspoof5
+      type: ASVspoof5
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 14.056
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: ADD2022
+      type: ADD2022
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 15.205
+  - task:
+      type: Speech Antispoofing
+    dataset:
+      name: In-the-Wild
+      type: In-the-Wild
+    metrics:
+    - name: Equal Error Rate
+      type: Equal Error Rate
+      value: 1.461
+---
+## Model Card: Spectra-0 (anti-spoofing / bonafide vs spoof)
+`Spectra-AASIST` is a model for **speech spoofing detection** (binary classification: `bonafide` vs `spoof`) from **raw audio waveforms**. Architecture: SSL encoder (`Wav2Vec2`) → MLP projection → `AASIST` 2-class classifier.
+- **Input**: waveform \(float32\), shape `(batch, num_samples)` (typically 16 kHz).
+- **Output**: logits of shape `(batch, 2)`, where **index 0 = spoof**, **index 1 = bonafide**.
+On first run, the model will automatically download the SSL encoder `facebook/wav2vec2-xls-r-300m` via `transformers`.
+## Evaluation Results
+| Model     | ASVspoof19 LA | ASVspoof21 LA | ASVspoof21 DF | ASVspoof5 | ADD2022  | In-the-Wild |
+|-----------|--------|--------|--------|--------|--------|--------|
+| [Res2TCNGuard](https://github.com/mtuciru/Res2TCNGuard)      | 7.487  | 19.130 | 19.883 | 37.620 | 49.538 | 49.246 |
+| [AASIST3](https://huggingface.co/MTUCI/AASIST3)    | 27.585 | 37.407 | 33.099 | 41.001 | 47.192 | 39.626 |
+| [XSLS](https://github.com/QiShanZhang/SLSforASVspoof-2021-DF)      | 0.231  | 7.714  | 4.220  | 17.688 | 33.951 | 7.453 |
+| [TCM-ADD](https://github.com/ductuantruong/tcm_add)       | **0.152** | 6.655  | 3.444 | 19.505 | 35.252 | 7.767 |
+| [DF Arena 1B](https://huggingface.co/Speech-Arena-2025/DF_Arena_1B_V_1)    | 43.793 | 40.137 | 42.994 | 35.333 | 42.139 | 17.598 |
+| **Spectra-AASIST** | 0.159  | **5.164** | **2.568**  | **14.056** | **15.205** | **1.461** |
+## Quickstart
+### Clone from Hugging Face
+This repository is hosted on Hugging Face Hub: `https://huggingface.co/MTUCI/spectra_aasist`.
+```bash
+git lfs install
+git clone https://huggingface.co/MTUCI/spectra_aasist
+cd spectra_aasist
+```
+### Install dependencies
+```bash
+pip install -U torch torchaudio transformers huggingface_hub safetensors soundfile
+```
+### Single-file inference (example preprocessing)
+```python
+import random
+import torch
+import torchaudio
+import soundfile as sf
+from model import spectra_aasist
+def pad_random(x: torch.Tensor, max_len: int = 64600) -> torch.Tensor:
+    # x: (num_samples,) or (1, num_samples)
+    if x.ndim > 1:
+        x = x.squeeze()
+    x_len = x.shape[0]
+    if x_len >= max_len:
+        start = random.randint(0, x_len - max_len)
+        return x[start:start + max_len]
+    num_repeats = int(max_len / x_len) + 1
+    return x.repeat(num_repeats)[:max_len]
+def load_audio_mono(path: str) -> torch.Tensor:
+    audio, sr = sf.read(path, dtype="float32")
+    audio = torch.from_numpy(audio)
+    if audio.ndim > 1:
+        # (num_samples, channels) -> mono
+        audio = audio.mean(dim=1)
+    if sr != 16000:
+        audio = torchaudio.functional.resample(audio, sr, 16000)
+    return audio
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = spectra_aasist.from_pretrained(pretrained_model_name_or_path=".").eval().to(device)
+audio = load_audio_mono("path/to/audio.wav")
+audio = torchaudio.functional.preemphasis(audio.unsqueeze(0))  # (1, T)
+audio = pad_random(audio.squeeze(0), 64600).unsqueeze(0)       # (1, 64600)
+with torch.inference_mode():
+    logits = model(audio.to(device))  # (1, 2)
+    score_spoof = logits[0, 0].item()
+    score_bonafide = logits[0, 1].item()
+print({"score_bonafide": score_bonafide, "score_spoof": score_spoof})
+```
+## Threshold-based classification (and how to tune it)
+In `model.py`, the `SpectraAASIST` class provides `classify()` with a **default threshold** chosen as an “optimal” value for the original setting:
+- **Default threshold**: `-1.0625009` (it thresholds `logit_bonafide = logits[:, 1]`)
+- **Note**: this threshold **may not be optimal** on a different dataset/domain. It’s recommended to tune the threshold on your dataset using **EER** (Equal Error Rate) or a target FAR/FRR.
+Example:
+```python
+with torch.inference_mode():
+    pred = model.classify(audio.to(device), threshold=-1.0625009)  # 1=bonafide, 0=spoof
+```
+### Tuning the threshold via EER (typical workflow)
+1) Run the model on a labeled set and collect scores for both classes.
+2) Compute EER and the threshold
+## Limitations and notes
+- This is a **pre-release** model.
+- Significantly stronger models are planned for **Q3–Q4 2026** — stay tuned.
+## License
+MIT (see the `license` field in the model repo header).
+## Contacts
+TG channel: https://t.me/korallll_ai
+email: k.n.borodin@mtuci.ru
+website: https://lab260.ru/

config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

model.py ADDED Viewed

	@@ -0,0 +1,797 @@

+import torch
+import torch.nn as nn
+from transformers import Wav2Vec2Model
+import torch.nn.functional as F
+from huggingface_hub import PyTorchModelHubMixin
+class Wav2Vec2Encoder(nn.Module):
+    """SSL encoder based on Hugging Face's Wav2Vec2 model."""
+    def __init__(self,
+                 model_name_or_path: str = "facebook/wav2vec2-base-960h",
+                 ssl_out_dim: int = 1024,
+                 use_ssl_n_layers: int = None,
+                 freeze_ssl_n_layers: int = 0,
+                 output_attentions: bool = False,
+                 output_hidden_states: bool = False,
+                 normalize_waveform: bool = True):
+        """Initialize the Wav2Vec2 encoder.
+        Args:
+            model_name_or_path: HuggingFace model name or path to local model.
+            ssl_out_dim: Output dimension of the Wav2Vec2 encoder.
+            use_ssl_n_layers: Number of Wav2Vec2 layers to use. If None, use all layers.
+            freeze_ssl_n_layers: Number of Wav2Vec2 layers to freeze during training.
+            output_attentions: Whether to output attentions.
+            output_hidden_states: Whether to output hidden states.
+            normalize_waveform: Whether to normalize the waveform input.
+        """
+        super().__init__()
+        self.model_name_or_path = model_name_or_path
+        self.ssl_out_dim = ssl_out_dim
+        self.use_ssl_n_layers = use_ssl_n_layers
+        self.freeze_ssl_n_layers = freeze_ssl_n_layers
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.normalize_waveform = normalize_waveform
+        # Load Wav2Vec2 model
+        self.model = Wav2Vec2Model.from_pretrained(
+            model_name_or_path,
+            gradient_checkpointing=False)
+        self.model.config.apply_spec_augment = False
+        self.model.masked_spec_embed = None
+        # Handle layer freezing
+        if freeze_ssl_n_layers > 0:
+            self._freeze_layers(freeze_ssl_n_layers)
+    def _freeze_layers(self, n_layers):
+        """Freeze the first n_layers layers of the Wav2Vec2 encoder.
+        Args:
+            n_layers: Number of layers to freeze.
+        """
+        # Freeze feature extractor
+        if n_layers > 0:
+            for param in self.model.feature_extractor.parameters():
+                param.requires_grad = False
+            # Freeze encoder layers
+            encoder_layers = self.model.encoder.layers
+            total_layers = len(encoder_layers)
+            layers_to_freeze = min(n_layers - 1, total_layers)  # -1 because feature_extractor counts as one layer
+            if layers_to_freeze > 0:
+                for i in range(layers_to_freeze):
+                    for param in encoder_layers[i].parameters():
+                        param.requires_grad = False
+    def forward(self, x):
+        """Forward pass through the Wav2Vec2 encoder.
+        Args:
+            x: Input tensor of shape (batch_size, sequence_length, channels)
+        Returns:
+            Extracted features of shape (batch_size, sequence_length, ssl_out_dim)
+        """
+        # Handle shape: convert (batch_size, sequence_length, channels) to (batch_size, sequence_length)
+        if x.ndim == 3:
+            x = x.squeeze(-1)  # Remove channel dimension if present
+        # Normalize input if specified
+        if self.normalize_waveform:
+            x = x / (torch.max(torch.abs(x), dim=1, keepdim=True)[0] + 1e-8)
+        # Wav2Vec2 forward pass
+        outputs = self.model(
+            x,
+            output_attentions=self.output_attentions,
+            output_hidden_states=self.output_hidden_states,
+            return_dict=True
+        )
+        # Extract last hidden state
+        last_hidden_state = outputs.last_hidden_state
+        # Optionally use only a subset of layers (if use_ssl_n_layers is set and output_hidden_states is True)
+        if self.use_ssl_n_layers is not None and self.output_hidden_states and outputs.hidden_states is not None:
+            # Use the last N hidden states and concatenate or average them
+            selected = outputs.hidden_states[-self.use_ssl_n_layers:]
+            last_hidden_state = torch.mean(torch.stack(selected, dim=0), dim=0)
+        del outputs
+        return last_hidden_state
+class MLPBridge(nn.Module):
+    """MLP bridge between SSL encoder and AASIST model."""
+    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = None,
+                 dropout: float = 0.1, activation: str = nn.ReLU, n_layers: int = 1):
+        """Initialize the MLP bridge.
+        Args:
+            input_dim: The input dimension from the SSL encoder.
+            output_dim: The output dimension for the AASIST model.
+            hidden_dim: Hidden dimension size. If None, use the average of input and output dims.
+            dropout: Dropout probability to apply between layers.
+            activation: Activation function to use
+            n_layers: Number of MLP layers (repeats of Linear+Activation+Dropout blocks).
+        """
+        super().__init__()
+        if hidden_dim is None:
+            hidden_dim = (input_dim + output_dim) // 2
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.hidden_dim = hidden_dim
+        self.n_layers = n_layers
+        assert hasattr(activation, 'forward') and callable(getattr(activation, 'forward', None)), "Activation class must have a callable forward() method."
+        act_fn = activation
+        layers = []
+        for i in range(n_layers):
+            in_dim = input_dim if i == 0 else hidden_dim
+            out_dim = hidden_dim
+            layers.append(nn.Linear(in_dim, out_dim))
+            layers.append(act_fn)
+            layers.append(nn.Dropout(dropout) if dropout > 0 else nn.Identity())
+        # Final output layer
+        layers.append(nn.Linear(hidden_dim, output_dim))
+        layers.append(nn.Dropout(dropout) if dropout > 0 else nn.Identity())
+        self.mlp = nn.Sequential(*layers)
+    def forward(self, x):
+        """Forward pass through the bridge.
+        Args:
+            x: The input tensor from the SSL encoder.
+        Returns:
+            The transformed tensor for the AASIST model.
+        """
+        return self.mlp(x)
+class HtrgGraphAttentionLayer(nn.Module):
+    def __init__(self, in_dim, out_dim, size, layer="KANLinear", **kwargs):
+        super().__init__()
+        if layer == "Linear":
+            self.proj_type1 = nn.Linear(in_dim, in_dim)
+            self.proj_type2 = nn.Linear(in_dim, in_dim)
+            self.att_proj = nn.Linear(in_dim, out_dim)
+            self.att_projM = nn.Linear(in_dim, out_dim)
+            self.proj_with_att = nn.Linear(in_dim, out_dim)
+            self.proj_without_att = nn.Linear(in_dim, out_dim)
+            self.proj_with_attM = nn.Linear(in_dim, out_dim)
+            self.proj_without_attM = nn.Linear(in_dim, out_dim)
+        else:
+            raise ValueError(f"Invalid layer type: {layer}")
+        self.att_weight11 = self._init_new_params(out_dim, 1)
+        self.att_weight22 = self._init_new_params(out_dim, 1)
+        self.att_weight12 = self._init_new_params(out_dim, 1)
+        self.att_weightM = self._init_new_params(out_dim, 1)
+        self.bn = nn.BatchNorm1d(out_dim)
+        self.input_drop = nn.Dropout(p=0.2)
+        self.act = nn.SELU(inplace=True)
+        self.temp = 1.
+        if "temperature" in kwargs:
+            self.temp = kwargs["temperature"]
+    def forward(self, x1, x2, master=None):
+        '''
+        x1  :(#bs, #node, #dim)
+        x2  :(#bs, #node, #dim)
+        '''
+        num_type1 = x1.size(1)
+        num_type2 = x2.size(1)
+        x1 = self.proj_type1(x1)
+        x2 = self.proj_type2(x2)
+        x = torch.cat([x1, x2], dim=1)
+        if master is None:
+            master = torch.mean(x, dim=1, keepdim=True)
+        # apply input dropout
+        x = self.input_drop(x)
+        # derive attention map
+        att_map = self._derive_att_map(x, num_type1, num_type2)
+        # directional edge for master node
+        master = self._update_master(x, master)
+        # projection
+        x = self._project(x, att_map)
+        # apply batch norm
+        x = self._apply_BN(x)
+        # x = self.act(x)
+        x1 = x.narrow(1, 0, num_type1)
+        x2 = x.narrow(1, num_type1, num_type2)
+        return x1, x2, master
+    def _update_master(self, x, master):
+        att_map = self._derive_att_map_master(x, master)
+        master = self._project_master(x, master, att_map)
+        return master
+    def _pairwise_mul_nodes(self, x):
+        '''
+        Calculates pairwise multiplication of nodes.
+        - for attention map
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, #dim)
+        '''
+        nb_nodes = x.size(1)
+        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
+        x_mirror = x.transpose(1, 2)
+        return x * x_mirror
+    def _derive_att_map_master(self, x, master):
+        '''
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, 1)
+        '''
+        att_map = x * master
+        att_map = torch.tanh(self.att_projM(att_map))
+        att_map = torch.matmul(att_map, self.att_weightM)
+        # apply temperature
+        att_map = att_map / self.temp
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _derive_att_map(self, x, num_type1, num_type2):
+        '''
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, 1)
+        '''
+        att_map = self._pairwise_mul_nodes(x)
+        # size: (#bs, #node, #node, #dim_out)
+        att_map = torch.tanh(self.att_proj(att_map))
+        # size: (#bs, #node, #node, 1)
+        att_board = torch.zeros_like(att_map[:, :, :, 0]).unsqueeze(-1)
+        att_board[:, :num_type1, :num_type1, :] = torch.matmul(
+            att_map[:, :num_type1, :num_type1, :], self.att_weight11)
+        att_board[:, num_type1:, num_type1:, :] = torch.matmul(
+            att_map[:, num_type1:, num_type1:, :], self.att_weight22)
+        att_board[:, :num_type1, num_type1:, :] = torch.matmul(
+            att_map[:, :num_type1, num_type1:, :], self.att_weight12)
+        att_board[:, num_type1:, :num_type1, :] = torch.matmul(
+            att_map[:, num_type1:, :num_type1, :], self.att_weight12)
+        att_map = att_board
+        # att_map = torch.matmul(att_map, self.att_weight12)
+        # apply temperature
+        att_map = att_map / self.temp
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _project(self, x, att_map):
+        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
+        x2 = self.proj_without_att(x)
+        return x1 + x2
+    def _project_master(self, x, master, att_map):
+        x1 = self.proj_with_attM(torch.matmul(
+            att_map.squeeze(-1).unsqueeze(1), x))
+        x2 = self.proj_without_attM(master)
+        return x1 + x2
+    def _apply_BN(self, x):
+        org_size = x.size()
+        x = x.view(-1, org_size[-1])
+        x = self.bn(x)
+        x = x.view(org_size)
+        return x
+    def _init_new_params(self, *size):
+        out = nn.Parameter(torch.FloatTensor(*size))
+        nn.init.xavier_normal_(out)
+        return out
+class GraphPool(nn.Module):
+    def __init__(self, k: float, in_dim: int, p, size, layer="KANLinear"):
+        super().__init__()
+        self.k = k
+        self.sigmoid = nn.Sigmoid()
+        if layer == "Linear":
+            self.proj = nn.Linear(in_dim, 1)
+        else:
+            raise ValueError(f"Invalid layer type: {layer}")
+        self.drop = nn.Dropout(p=p) if p > 0 else nn.Identity()
+        self.in_dim = in_dim
+    def forward(self, h):
+        Z = self.drop(h)
+        weights = self.proj(Z)
+        scores = self.sigmoid(weights)
+        new_h = self.top_k_graph(scores, h, self.k)
+        return new_h
+    def top_k_graph(self, scores, h, k):
+        """
+        args
+        =====
+        scores: attention-based weights (#bs, #node, 1)
+        h: graph data (#bs, #node, #dim)
+        k: ratio of remaining nodes, (float)
+        returns
+        =====
+        h: graph pool applied data (#bs, #node', #dim)
+        """
+        _, n_nodes, n_feat = h.size()
+        n_nodes = max(int(n_nodes * k), 1)
+        _, idx = torch.topk(scores, n_nodes, dim=1)
+        idx = idx.expand(-1, -1, n_feat)
+        h = h * scores
+        h = torch.gather(h, 1, idx)
+        return h
+class GraphAttentionLayer(nn.Module):
+    def __init__(self, in_dim, out_dim, layer="KANLinear", **kwargs):
+        super().__init__()
+        # attention map
+        if layer == "Linear":
+            self.att_proj = nn.Linear(in_dim, out_dim)
+            self.proj_with_att = nn.Linear(in_dim, out_dim)
+            self.proj_without_att = nn.Linear(in_dim, out_dim)
+        else:
+            raise ValueError(f"Invalid layer type: {layer}")
+        self.att_weight = self._init_new_params(out_dim, 1)
+        # batch norm
+        self.bn = nn.BatchNorm1d(out_dim)
+        # dropout for inputs
+        self.input_drop = nn.Dropout(p=0.2)
+        # activate
+        self.act = nn.SELU(inplace=True)
+        # temperature
+        self.temp = 1.
+        if "temperature" in kwargs:
+            self.temp = kwargs["temperature"]
+    def forward(self, x):
+        '''
+        x   :(#bs, #node, #dim)
+        '''
+        # apply input dropout
+        x = self.input_drop(x)
+        # derive attention map
+        att_map = self._derive_att_map(x)
+        # projection
+        x = self._project(x, att_map)
+        # apply batch norm
+        x = self._apply_BN(x)
+        x = self.act(x)
+        return x
+    def _pairwise_mul_nodes(self, x):
+        '''
+        Calculates pairwise multiplication of nodes.
+        - for attention map
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, #dim)
+        '''
+        nb_nodes = x.size(1)
+        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
+        x_mirror = x.transpose(1, 2)
+        return x * x_mirror
+    def _derive_att_map(self, x):
+        '''
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, 1)
+        '''
+        att_map = self._pairwise_mul_nodes(x)
+        # size: (#bs, #node, #node, #dim_out)
+        att_map = torch.tanh(self.att_proj(att_map))
+        # size: (#bs, #node, #node, 1)
+        att_map = torch.matmul(att_map, self.att_weight)
+        # apply temperature
+        att_map = att_map / self.temp
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _project(self, x, att_map):
+        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
+        x2 = self.proj_without_att(x)
+        return x1 + x2
+    def _apply_BN(self, x):
+        org_size = x.size()
+        x = x.view(-1, org_size[-1])
+        x = self.bn(x)
+        x = x.view(org_size)
+        return x
+    def _init_new_params(self, *size):
+        out = nn.Parameter(torch.FloatTensor(*size))
+        nn.init.xavier_normal_(out)
+        return out
+class Res2NetBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, scale=4, kernel_size=(2, 3), stride=1, padding=(1, 1)):
+        super().__init__()
+        assert out_channels % scale == 0, "out_channels must be divisible by scale"
+        self.scale = scale
+        self.width = out_channels // scale
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+        self.convs = nn.ModuleList([
+            nn.Conv2d(self.width, self.width, kernel_size=kernel_size, stride=stride, padding=padding)
+            for _ in range(scale)
+        ])
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.selu = nn.SELU(inplace=True)
+        self.conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=1)
+        self.downsample = None
+        if in_channels != out_channels:
+            self.downsample = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+    def forward(self, x):
+        identity = x
+        out = self.conv1(x)
+        xs = torch.chunk(out, self.scale, dim=1)
+        ys = []
+        for s in range(self.scale):
+            if s == 0:
+                ys.append(self.convs[s](xs[s]))
+            else:
+                ys.append(self.convs[s](xs[s] + ys[s - 1]))
+        out = torch.cat(ys, dim=1)
+        out = self.bn(out)
+        out = self.selu(out)
+        out = self.conv3(out)
+        if self.downsample is not None:
+            identity = self.downsample(identity)
+        out += identity
+        return out
+class Residual_block(nn.Module):
+    def __init__(self, nb_filts, first=False):
+        super().__init__()
+        self.first = first
+        if not self.first:
+            self.bn1 = nn.BatchNorm2d(num_features=nb_filts[0])
+        self.conv1 = nn.Conv2d(in_channels=nb_filts[0],
+                               out_channels=nb_filts[1],
+                               kernel_size=(2, 3),
+                               padding=(1, 1),
+                               stride=1)
+        self.selu = nn.SELU(inplace=True)
+        self.bn2 = nn.BatchNorm2d(num_features=nb_filts[1])
+        self.conv2 = nn.Conv2d(in_channels=nb_filts[1],
+                               out_channels=nb_filts[1],
+                               kernel_size=(2, 3),
+                               padding=(0, 1),
+                               stride=1)
+        if nb_filts[0] != nb_filts[1]:
+            self.downsample = True
+            self.conv_downsample = nn.Conv2d(in_channels=nb_filts[0],
+                                             out_channels=nb_filts[1],
+                                             padding=(0, 1),
+                                             kernel_size=(1, 3),
+                                             stride=1)
+        else:
+            self.downsample = False
+    def forward(self, x):
+        identity = x
+        if not self.first:
+            out = self.bn1(x)
+            out = self.selu(out)
+        else:
+            out = x
+        # print('out',out.shape)
+        out = self.conv1(out)
+        # print('aft conv1 out',out.shape)
+        out = self.bn2(out)
+        out = self.selu(out)
+        # print('out',out.shape)
+        out = self.conv2(out)
+        # print('conv2 out',out.shape)
+        if self.downsample:
+            identity = self.conv_downsample(identity)
+        out += identity
+        # out = self.mp(out)
+        return out
+class Encoder(nn.Module):
+    def __init__(self, filts):
+        super().__init__()
+        self.first_bn = nn.BatchNorm2d(num_features=1)
+        self.first_bn1 = nn.BatchNorm2d(num_features=64)
+        self.selu = nn.SELU(inplace=True)
+        self.enc = nn.Sequential(
+            nn.Sequential(Residual_block(nb_filts=filts[1], first=True)),
+            nn.Sequential(Residual_block(nb_filts=filts[2])),
+            nn.Sequential(Residual_block(nb_filts=filts[3])),
+            nn.Sequential(Residual_block(nb_filts=filts[4])),
+            nn.Sequential(Residual_block(nb_filts=filts[4])),
+            nn.Sequential(Residual_block(nb_filts=filts[4]))
+        )
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        x = x.unsqueeze(dim=1)
+        x = F.max_pool2d(torch.abs(x), (3, 3))
+        x = self.first_bn(x)
+        x = self.selu(x)
+        # # get embeddings using encoder
+        # # (#bs, #filt, #spec, #seq)
+        x = self.enc(x)
+        x = self.first_bn1(x)
+        x = self.selu(x)
+        return x
+class HSGALBranch_v1(nn.Module):
+    def __init__(self, gat_dims, temperatures, pool_ratios, size=200, layer="KANLinear"):
+        super().__init__()
+        self.master = nn.Parameter(torch.randn(1, 1, gat_dims[0]))
+        self.HtrgGAT_layer_ST1 = HtrgGraphAttentionLayer(
+            gat_dims[0], gat_dims[1], temperature=temperatures[2], size=size, layer=layer
+        )
+        self.HtrgGAT_layer_ST2 = HtrgGraphAttentionLayer(
+            gat_dims[1], gat_dims[1], temperature=temperatures[2], size=size, layer=layer
+        )
+        self.pool_hS = GraphPool(pool_ratios[2], gat_dims[1], 0.3, size=size, layer=layer)
+        self.pool_hT = GraphPool(pool_ratios[2], gat_dims[1], 0.3, size=size, layer=layer)
+        self.drop_way = nn.Dropout(0.2, inplace=True)
+    def forward(self, out_t, out_s):
+        out_T, out_S,  master = self.HtrgGAT_layer_ST1(
+            out_t, out_s, master=self.master
+        )
+        out_S = self.pool_hS(out_S)
+        out_T = self.pool_hT(out_T)
+        out_T_aug, out_S_aug, master_aug = self.HtrgGAT_layer_ST2(
+            out_T, out_S, master=master
+        )
+        out_T = out_T + out_T_aug
+        out_S = out_S + out_S_aug
+        master = master + master_aug
+        out_T = self.drop_way(out_T)
+        out_S = self.drop_way(out_S)
+        master = self.drop_way(master)
+        return out_T, out_S, master
+class KANAASIST(nn.Module):
+    """KAN-AASIST model with graph attention layers."""
+    def __init__(
+        self,
+        d_args={
+            "architecture": "AASIST",
+            "nb_samp": 64600,
+            "filts": [512, [1, 32], [32, 32], [32, 64], [64, 64]],
+            "gat_dims": [64, 32],
+            "pool_ratios": [0.5, 0.5, 0.5, 0.5],
+            "temperatures": [2.0, 2.0, 100.0, 100.0]
+        },
+        encoder=Encoder,
+        size=200,
+        n_frames=400,
+        layer_type="Linear",
+        **kwargs
+    ):
+        super().__init__()
+        layer = layer_type
+        self.d_args = d_args
+        filts = d_args["filts"]
+        gat_dims = d_args["gat_dims"]
+        pool_ratios = d_args["pool_ratios"]
+        temperatures = d_args["temperatures"]
+        self.drop = nn.Dropout(0.5, inplace=True)
+        self.drop_way = nn.Dropout(0.2, inplace=True)
+        self.attention = nn.Sequential(
+            nn.Conv2d(64, 128, kernel_size=(1, 1)),
+            nn.SELU(inplace=True),
+            nn.BatchNorm2d(128),
+            nn.Conv2d(128, 64, kernel_size=(1, 1)),
+        )
+        self.pos_S = nn.Parameter(torch.randn(1, filts[0] // 3, filts[-1][-1]))
+        self.pos_T = nn.Parameter(torch.randn(1, n_frames, filts[0]))
+        self.GAT_layer_S = GraphAttentionLayer(filts[-1][-1],
+                                               gat_dims[0],
+                                               temperature=temperatures[0], size=size, layer=layer)
+        self.GAT_layer_T = GraphAttentionLayer(filts[-1][-1],
+                                               gat_dims[0],
+                                               temperature=temperatures[1], size=size, layer=layer)
+        self.branch1 = HSGALBranch_v1(gat_dims, temperatures, pool_ratios, size, layer=layer)
+        self.branch2 = HSGALBranch_v1(gat_dims, temperatures, pool_ratios, size, layer=layer)
+        self.branch3 = HSGALBranch_v1(gat_dims, temperatures, pool_ratios, size, layer=layer)
+        self.branch4 = HSGALBranch_v1(gat_dims, temperatures, pool_ratios, size, layer=layer)
+        self.pool_S = GraphPool(pool_ratios[0], gat_dims[0], 0.3, size=size, layer=layer)
+        self.pool_T = GraphPool(pool_ratios[1], gat_dims[0], 0.3, size=size, layer=layer)
+        out_features = 2
+        in_features = 5 * gat_dims[1]
+        if layer == 'Linear':
+            self.out_layer = nn.Linear(in_features, out_features)
+        else:
+            raise ValueError(f"Invalid layer type: {layer}")
+        self.enc = encoder(filts=filts)
+    def forward(self, x, Freq_aug=False):
+        """Forward pass through the KAN-AASIST model.
+        Args:
+            x: Input tensor of shape (batch_size, seq_len, channels)
+            Freq_aug: Whether to use frequency augmentation
+        Returns:
+            Model output for binary classification.
+        """
+        x = x + self.pos_T[:, :x.size(1), :]
+        x = self.enc(x)
+        # attention block assumes x is (batch, time, feature_dim)
+        # Adapt attention block if needed for SSL features
+        w = self.attention(x)
+        w1 = F.softmax(w, dim=-1)
+        m = torch.sum(x * w1, dim=-1)
+        e_S = m.transpose(1, 2) + self.pos_S
+        gat_S = self.GAT_layer_S(e_S)
+        out_S = self.pool_S(gat_S)  # (#bs, #node, #dim)
+        w2 = F.softmax(w, dim=-2)
+        m1 = torch.sum(x * w2, dim=-2)
+        e_T = m1.transpose(1, 2)
+        gat_T = self.GAT_layer_T(e_T)
+        out_T = self.pool_T(gat_T)
+        out_T1, out_S1, master1 = self.branch1(out_T, out_S)
+        out_T2, out_S2, master2 = self.branch2(out_T, out_S)
+        out_T3, out_S3, master3 = self.branch3(out_T, out_S)
+        out_T4, out_S4, master4 = self.branch4(out_T, out_S)
+        out_T  = torch.amax(torch.stack([out_T1, out_T2, out_T3, out_T4]),     dim=0)
+        out_S  = torch.amax(torch.stack([out_S1, out_S2, out_S3, out_S4]),     dim=0)
+        master = torch.amax(torch.stack([master1, master2, master3, master4]), dim=0)
+        T_max, _ = torch.max(torch.abs(out_T), dim=1)
+        T_avg = torch.mean(out_T, dim=1)
+        S_max, _ = torch.max(torch.abs(out_S), dim=1)
+        S_avg = torch.mean(out_S, dim=1)
+        last_hidden = torch.cat(
+            [T_max, T_avg, S_max, S_avg, master.squeeze(1)], dim=1)
+        last_hidden = self.drop(last_hidden)
+        output = self.out_layer(last_hidden)
+        return output
+class SpectraAASIST(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.ssl_encoder = Wav2Vec2Encoder("facebook/wav2vec2-xls-r-300m",
+                                           1024,
+                                           None,
+                                           0,
+                                           False,
+                                           False,
+                                           False)
+        self.bridge = MLPBridge(1024,
+                                128,
+                                hidden_dim=128, dropout=0.1, activation=nn.SELU(), n_layers=1)
+        self.aasist = KANAASIST(
+            d_args={
+                "architecture": "AASIST",
+                "nb_samp": 64400,
+                "filts": [128, [1, 32], [32, 32], [32, 64], [64, 64]],
+                "gat_dims": [64, 32],
+                "pool_ratios": [0.5, 0.5, 0.5, 0.5],
+                "temperatures": [2.0, 2.0, 100.0, 100.0]
+            },
+            size=200,
+            layer_type="Linear"
+        )
+    def forward(self, x):
+        x = self.ssl_encoder(x)
+        x = self.bridge(x)
+        x = self.aasist(x)
+        return x
+    @torch.inference_mode()
+    def classify(self, x, threshold: float = -1.0625009):
+        x = self.forward(x)[:, 1]
+        x = (x > threshold).float()
+        return x.item()
+spectra_aasist = SpectraAASIST
+if __name__ == "__main__":
+    model = SpectraAASIST()
+    model.load_state_dict(torch.load("/data/home/maslov/maslov/WEIGHTS/aasist/baseline_v1_linear/weights/model.pt", map_location="cpu"))
+    model.eval()
+    x = torch.randn(1, 64400)
+    print(model(x).shape)
+    print(model(x))

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e2727a7397f78d28b0a2a2b8ee031ff08143b9c431ea7f06fc29a808b0180db
+size 1264151840