slslslrhfem committed
Commit be8ccdd
1 Parent(s): 216b804

commit for ICASSP 2026 github repository

Files changed (2)
  1. inference.py +6 -23
  2. model.py +8 -218
inference.py CHANGED
@@ -18,22 +18,6 @@ from preprocess import get_segments_from_wav, find_optimal_segment_length
 
 
 
-def highpass_filter(y, sr, cutoff=1000, order=5):
-    if isinstance(sr, np.ndarray):
-        sr = np.mean(sr)
-    if not isinstance(sr, (int, float)):
-        raise ValueError(f"sr must be a number, but got {type(sr)}: {sr}")
-
-    nyquist = 0.5 * sr
-    if cutoff <= 0 or cutoff >= nyquist:
-        cutoff = max(10, min(cutoff, nyquist - 1))
-
-    normal_cutoff = cutoff / nyquist
-    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
-    y_filtered = signal.lfilter(b, a, y)
-    return y_filtered
-
-
 def load_audio(audio_path: str, sr: int = 24000) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Loads an audio file and splits it into segments.
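For reference, the deleted `highpass_filter` was a plain Butterworth high-pass built on `scipy.signal`. Below is a self-contained sketch of the same helper with the imports it needs (defaults as in the removed code: 1 kHz cutoff, 5th-order filter); it mirrors what this hunk removes and is not new functionality in the repo.

import numpy as np
from scipy import signal

def highpass_filter(y, sr, cutoff=1000, order=5):
    """Butterworth high-pass; mirrors the helper removed above."""
    if isinstance(sr, np.ndarray):                 # tolerate an array-valued sample rate
        sr = float(np.mean(sr))
    nyquist = 0.5 * sr
    cutoff = max(10, min(cutoff, nyquist - 1))     # clamp the cutoff into (0, Nyquist)
    b, a = signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    return signal.lfilter(b, a, y)

Typical use would be y_hp = highpass_filter(y, sr=24000), matching load_audio's 24 kHz default.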
@@ -216,11 +200,11 @@ def inference(audio_path):
     segments = segments.to('cuda').to(torch.float32)
     padding_mask = padding_mask.to('cuda').unsqueeze(0)
     logits,embedding = backbone_model(segments.squeeze(1))
-    test_dataset = FakeMusicCapsDataset([audio_path], [0], target_duration=10.0)
-    test_data, test_target = test_dataset[0]
-    test_data = test_data.to('cuda').to(torch.float32)
-    test_target = test_target.to('cuda')
-    output, _ = backbone_model(test_data.unsqueeze(0))
+    # test_dataset = FakeMusicCapsDataset([audio_path], [0], target_duration=10.0)
+    # test_data, test_target = test_dataset[0]
+    # test_data = test_data.to('cuda').to(torch.float32)
+    # test_target = test_target.to('cuda')
+    # output, _ = backbone_model(test_data.unsqueeze(0))
 
 
 
@@ -230,7 +214,6 @@ def inference(audio_path):
         input_dim=input_dim,
         #emb_model=backbone_model
         is_emb = True,
-        #mode = 'both'
     )
 
 
@@ -247,5 +230,5 @@ def inference(audio_path):
     return results
 
 if __name__ == "__main__":
-    main()
+    inference("some path")
 
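With `main()` removed, the `__main__` guard now calls `inference()` directly on a hard-coded path. A minimal usage sketch, assuming a CUDA device is available (the function moves tensors to 'cuda') and that `inference()` returns the `results` object it builds:

if __name__ == "__main__":
    results = inference("some path")   # replace "some path" with an actual audio file path
    print(results)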
model.py CHANGED
@@ -36,22 +36,12 @@ class MusicAudioClassifier(pl.LightningModule):
                 hidden_dim=hidden_dim,
                 num_classes=num_classes
             )
-        elif backbone == 'guided_segment_transformer':
-            self.model = GuidedSegmentTransformer(
-                input_dim=input_dim,
-                hidden_dim=hidden_dim,
-                num_classes=num_classes
-            )
-        elif backbone == 'ultra_segment_processor':
-            self.model = UltraModernSegmentProcessor(
-                input_dim=input_dim,
-                hidden_dim=hidden_dim,
-                num_classes=num_classes
-            )
-        self.emb_model = emb_model
-        self.learning_rate = learning_rate
-        self.is_emb = is_emb
-        self.num_classes = num_classes
+        # elif backbone == 'guided_segment_transformer':
+        #     self.model = GuidedSegmentTransformer(
+        #         input_dim=input_dim,
+        #         hidden_dim=hidden_dim,
+        #         num_classes=num_classes
+        #     )
 
     def _process_audio_batch(self, x: torch.Tensor) -> torch.Tensor:
         B, S = x.shape[:2]  # [B, S, C, M, T] or [B, S, C, T] for wav, [B, S, 1?, embsize] for emb
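Only the remaining backbone branch is selectable after this change. For orientation, inference.py in this same commit constructs the classifier for pre-computed embeddings; a minimal sketch of that call, assuming the constructor still accepts the keyword arguments visible there (`input_dim`, `is_emb`); the embedding size below is a placeholder:

# Sketch of the inference.py call site; 768 is an assumed embedding dimension.
classifier = MusicAudioClassifier(
    input_dim=768,   # assumed: size of the backbone embeddings, i.e. the [B, S, 1?, embsize] case above
    is_emb=True,     # inputs arrive as embeddings rather than raw waveforms
)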
@@ -529,6 +519,7 @@ class MultiScaleAdaptivePooler(nn.Module):
         Args:
             x: (batch, seq_len, hidden_dim) - sequence features
             padding_mask: (batch, seq_len) - padding mask
+        actually not better than avg pooling haha
         """
         batch_size = x.size(0)
 
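The added docstring line concedes that this pooler is not better than plain average pooling. For comparison, the masked average-pooling baseline it alludes to takes only a few lines; this sketch follows the masking convention used elsewhere in model.py, where `padding_mask` is True on padded positions:

import torch

def masked_avg_pool(x: torch.Tensor, padding_mask: torch.Tensor = None) -> torch.Tensor:
    # x: (batch, seq_len, hidden_dim); padding_mask: (batch, seq_len), True where padded.
    if padding_mask is None:
        return x.mean(dim=1)
    valid = (~padding_mask).float().unsqueeze(-1)                      # (batch, seq_len, 1)
    return (x * valid).sum(dim=1) / valid.sum(dim=1).clamp(min=1.0)    # clamp avoids divide-by-zero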
@@ -838,205 +829,4 @@ class FusionSegmentTransformer(nn.Module):
         pooled = pooled.half()
         return self.classification_head(pooled)
 
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-from typing import Optional
-import math
-
-class RMSNorm(nn.Module):
-    """RMS Normalization - stable"""
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
-
-class SwiGLU(nn.Module):
-    """SwiGLU Activation - simplified version"""
-    def __init__(self, dim: int):
-        super().__init__()
-        self.w1 = nn.Linear(dim, dim * 2, bias=False)
-        self.w2 = nn.Linear(dim, dim, bias=False)
-
-    def forward(self, x):
-        return self.w2(F.silu(self.w1(x)[:, :, :x.size(-1)]))  # match the dimension
-
-class GroupedQueryAttention(nn.Module):
-    """Simple GQA - avoids errors"""
-    def __init__(self, d_model: int, num_heads: int = 8):
-        super().__init__()
-        assert d_model % num_heads == 0
-
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.head_dim = d_model // num_heads
-
-        # use the same dimension for all projections
-        self.q_proj = nn.Linear(d_model, d_model, bias=False)
-        self.k_proj = nn.Linear(d_model, d_model, bias=False)
-        self.v_proj = nn.Linear(d_model, d_model, bias=False)
-        self.o_proj = nn.Linear(d_model, d_model, bias=False)
-
-        self.scale = 1.0 / math.sqrt(self.head_dim)
-
-    def forward(self, x, pairwise_matrix=None, padding_mask=None):
-        B, L, D = x.shape
-
-        Q = self.q_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
-        K = self.k_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
-        V = self.v_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
-
-        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
-
-        if pairwise_matrix is not None:
-            scores = scores + pairwise_matrix.unsqueeze(1)
-
-        if padding_mask is not None:
-            mask_4d = padding_mask.unsqueeze(1).unsqueeze(1).expand(-1, self.num_heads, L, -1)
-            scores = scores.masked_fill(mask_4d, float('-inf'))
-
-        attn_weights = F.softmax(scores, dim=-1)
-        attn_output = torch.matmul(attn_weights, V)
-
-        attn_output = attn_output.transpose(1, 2).contiguous().view(B, L, D)
-        return self.o_proj(attn_output)
-
-class SimpleModernLayer(nn.Module):
-    """Simple and safe modern layer"""
-    def __init__(self, d_model: int, num_heads: int = 8):
-        super().__init__()
-
-        # RMSNorm
-        self.norm1 = RMSNorm(d_model)
-        self.norm2 = RMSNorm(d_model)
-
-        # Attention
-        self.attention = GroupedQueryAttention(d_model, num_heads)
-
-        # Feed forward
-        self.ffn = SwiGLU(d_model)
-
-    def forward(self, x, pairwise_matrix=None, padding_mask=None):
-        # Attention with residual
-        normed_x = self.norm1(x)
-        attn_out = self.attention(normed_x, pairwise_matrix, padding_mask)
-        x = x + attn_out
-
-        # FFN with residual
-        normed_x2 = self.norm2(x)
-        ffn_out = self.ffn(normed_x2)
-        x = x + ffn_out
-
-        return x
-
-class SimpleQuantumPooling(nn.Module):
-    """Simple attention pooling"""
-    def __init__(self, d_model: int):
-        super().__init__()
-
-        # three pooling methods
-        self.attention_pool = nn.MultiheadAttention(d_model, 8, batch_first=True)
-        self.query_token = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
-
-        # combine
-        self.final_proj = nn.Linear(d_model * 3, d_model, bias=False)
-
-    def forward(self, x, padding_mask=None):
-        batch_size = x.size(0)
-
-        # 1. Average pooling
-        if padding_mask is not None:
-            mask_expanded = (~padding_mask).float().unsqueeze(-1)
-            avg_pooled = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
-        else:
-            avg_pooled = x.mean(dim=1)
-
-        # 2. Max pooling
-        if padding_mask is not None:
-            x_masked = x.clone()
-            x_masked[padding_mask] = float('-inf')
-            max_pooled = x_masked.max(dim=1)[0]
-        else:
-            max_pooled = x.max(dim=1)[0]
-
-        # 3. Attention pooling
-        query = self.query_token.expand(batch_size, -1, -1)
-        attn_pooled, _ = self.attention_pool(
-            query, x, x, key_padding_mask=padding_mask
-        )
-        attn_pooled = attn_pooled.squeeze(1)
-
-        # combine
-        combined = torch.cat([avg_pooled, max_pooled, attn_pooled], dim=-1).half()
-        return self.final_proj(combined)
-
-class UltraModernSegmentProcessor(nn.Module):
-    """Error-free simple version ✅"""
-    def __init__(self,
-                 input_dim: int,
-                 hidden_dim: int = 512,
-                 num_heads: int = 8,
-                 num_layers: int = 6,
-                 dropout: float = 0.1,
-                 max_sequence_length: int = 1000,
-                 num_classes: int = 2):
-        super().__init__()
-
-        assert hidden_dim % num_heads == 0
-
-        self.hidden_dim = hidden_dim
-        self.input_projection = nn.Linear(input_dim, hidden_dim, bias=False)
-
-        # modern layers
-        self.layers = nn.ModuleList([
-            SimpleModernLayer(hidden_dim, num_heads)
-            for _ in range(num_layers)
-        ])
-
-        # simple pooling
-        self.pooler = SimpleQuantumPooling(hidden_dim)
-
-        # classification head
-        output_dim = 1 if num_classes == 2 else num_classes
-
-        self.classifier = nn.Sequential(
-            nn.Linear(hidden_dim, hidden_dim // 2, bias=False),
-            RMSNorm(hidden_dim // 2),
-            nn.SiLU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_dim // 2, hidden_dim // 4, bias=False),
-            RMSNorm(hidden_dim // 4),
-            nn.SiLU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_dim // 4, output_dim, bias=False)
-        )
-
-    def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        # Input projection
-        x_emb = self.input_projection(x)
-
-        # compute the pairwise matrix
-        x_expanded = x.unsqueeze(2)
-        x_transposed = x.unsqueeze(1)
-
-        # use Euclidean distance only (keep it simple)
-        distances = torch.mean((x_expanded - x_transposed) ** 2, dim=-1)
-        pairwise_matrix = torch.exp(-distances)
-
-        if padding_mask is not None:
-            pairwise_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2)
-            pairwise_matrix = pairwise_matrix.masked_fill(pairwise_mask, 0.0)
-
-        # pass through the layers
-        for layer in self.layers:
-            x_emb = layer(x_emb, pairwise_matrix, padding_mask)
-
-        # pooling
-        pooled = self.pooler(x_emb, padding_mask)
-
-        # classification
-        return self.classifier(pooled)
+import torch
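One note on the removed `SwiGLU` class above: its forward pass sliced the doubled projection and applied SiLU to the slice, which drops the gating that defines SwiGLU. For comparison, a standard gated SwiGLU feed-forward looks like the sketch below (generic reference code, not code from this repository):

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedSwiGLU(nn.Module):
    # SwiGLU FFN: out = W_out( SiLU(x W_gate) * (x W_value) )
    def __init__(self, dim: int, hidden: int = None):
        super().__init__()
        hidden = hidden or dim * 2
        self.w_gate = nn.Linear(dim, hidden, bias=False)
        self.w_value = nn.Linear(dim, hidden, bias=False)
        self.w_out = nn.Linear(hidden, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_out(F.silu(self.w_gate(x)) * self.w_value(x))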