Spaces: Running on Zero
Seonghyeon Go committed
Commit · 0ede85b
1 Parent(s): c3c908f
initial commit for AIGM
Browse files
- .gitattributes +1 -0
- ISMIR_2025/MERT/__pycache__/networks.cpython-312.pyc +0 -0
- app.py +2 -1
- dataset_f.py +0 -4
- inference.py +16 -30
- model.py +1042 -0
- networks.py +560 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/*.ckpt filter=lfs diff=lfs merge=lfs -text
ISMIR_2025/MERT/__pycache__/networks.cpython-312.pyc
CHANGED
Binary files a/ISMIR_2025/MERT/__pycache__/networks.cpython-312.pyc and b/ISMIR_2025/MERT/__pycache__/networks.cpython-312.pyc differ
app.py
CHANGED
@@ -9,6 +9,7 @@ def detect_ai_audio(audio_file):
     Detect whether the uploaded audio file was generated by AI
     """
     result = inference(audio_file)
+    print(result)
 
     # Format result with better styling
     if "AI" in str(result).upper() or "artificial" in str(result).lower():
@@ -167,7 +168,7 @@ demo = gr.Interface(
     """,
     examples=[
         ["example-ncs-light it up(human).mp3"],
-        ["example-
+        ["example-Strumming Heartbeats(suno v4).mp3"]
     ],
     css=custom_css,
     theme=gr.themes.Soft(
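For orientation, a minimal sketch of how the patched detect_ai_audio wrapper plugs into the Gradio Interface. The input/output component types and the returned strings are assumptions; app.py only shows the result string check, the examples list, the css, and the theme.

import gradio as gr
from inference import inference

def detect_ai_audio(audio_file):
    """Detect whether the uploaded audio file was generated by AI."""
    result = inference(audio_file)
    print(result)  # debug print added in this commit
    # Format result with better styling (same string check as in app.py)
    if "AI" in str(result).upper() or "artificial" in str(result).lower():
        return "Likely AI-generated"
    return "Likely human-made"

# Component types below are assumptions; app.py only shows examples, css, theme.
demo = gr.Interface(
    fn=detect_ai_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    examples=[
        ["example-ncs-light it up(human).mp3"],
        ["example-Strumming Heartbeats(suno v4).mp3"],
    ],
)

if __name__ == "__main__":
    demo.launch()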
dataset_f.py
CHANGED
@@ -4,13 +4,9 @@ import torch
 import torchaudio
 import librosa
 import numpy as np
-from sklearn.model_selection import train_test_split
 from torch.utils.data import Dataset
-from imblearn.over_sampling import RandomOverSampler
-from transformers import Wav2Vec2Processor
 import torch
 import torchaudio
-from torch.nn.utils.rnn import pad_sequence
 from transformers import Wav2Vec2FeatureExtractor
 import scipy.signal as signal
 import scipy.signal
inference.py
CHANGED
@@ -12,7 +12,7 @@ import torchaudio
 import scipy.signal as signal
 from typing import Dict, List
 from dataset_f import FakeMusicCapsDataset
-
+from networks import MERT_AudioCNN
 from preprocess import get_segments_from_wav, find_optimal_segment_length
 
 
@@ -149,7 +149,7 @@ def run_inference(model, audio_segments: torch.Tensor, padding_mask: torch.Tenso
     # Convert the data to half precision
     if padding_mask.dim() == 1:
         padding_mask = padding_mask.unsqueeze(0)  # [48] -> [1, 48]
-    audio_segments = audio_segments.to(device)
+    audio_segments = audio_segments.to(device)
 
     mask = padding_mask.to(device)
 
@@ -189,14 +189,14 @@ def scaled_sigmoid(x, scale_factor=0.2, linear_property=0.3):
 def get_model(model_type, device):
     """Load the specified model."""
     if model_type == "MERT":
-        from ISMIR_2025.MERT.networks import CCV
         #from model import MusicAudioClassifier
-
-        model = CCV(embed_dim=768, num_heads=8, num_layers=6, num_classes=2, freeze_feature_extractor=True).to(device)
         #model = MusicAudioClassifier(input_dim=768, is_emb=True, mode = 'both', share_parameter = False).to(device)
-        ckpt_file = 'mert_finetune_10.pth'
-        model.
+        ckpt_file = 'checkpoints/step=007000-val_loss=0.1831-val_acc=0.9278.ckpt'  #'mert_finetune_10.pth'
+        model = MERT_AudioCNN.load_from_checkpoint(ckpt_file).to(device)
+        model.eval()
+        # model.load_state_dict(torch.load(ckpt_file, map_location=device))
         embed_dim = 768
+
     elif model_type == "pure_MERT":
         from ISMIR_2025.MERT.networks import MERTFeatureExtractor
         model = MERTFeatureExtractor().to(device)
@@ -211,33 +211,22 @@ def get_model(model_type, device):
 
 
 def inference(audio_path):
-    parser = argparse.ArgumentParser(description="Music classifier inference")
-    parser.add_argument("--model_type", type=str, required=True, choices=["MERT", "AudioCNN"], help="Type of model")
-    parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to model checkpoint")
-    parser.add_argument("--output_path", type=str, default=None, help="Path to save results (default: print to console)")
-    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to run inference on")
-    args = parser.parse_args()
-    audio_path = "The Chainsmokers & Coldplay - Something Just Like This (Lyric).mp3"
-
-
-    # Note: Model loading would be handled by your code
-    print(f"Loading model of type {args.model_type} from {args.checkpoint_path}")
-
     backbone_model, input_dim = get_model('MERT', 'cuda')
     segments, padding_mask = load_audio(audio_path, sr=24000)
-    segments = segments.to(
-    padding_mask = padding_mask.to(
+    segments = segments.to('cuda').to(torch.float32)
+    padding_mask = padding_mask.to('cuda').unsqueeze(0)
     logits,embedding = backbone_model(segments.squeeze(1))
     test_dataset = FakeMusicCapsDataset([audio_path], [0], target_duration=10.0)
     test_data, test_target = test_dataset[0]
-    test_data = test_data.to(
-    test_target = test_target.to(
+    test_data = test_data.to('cuda').to(torch.float32)
+    test_target = test_target.to('cuda')
     output, _ = backbone_model(test_data.unsqueeze(0))
+
 
 
     # Model loading section added
     model = MusicAudioClassifier.load_from_checkpoint(
-
+        checkpoint_path = 'checkpoints/EmbeddingModel_MERT_768-epoch=0073-val_loss=0.1058-val_acc=0.9585-val_f1=0.9366-val_precision=0.9936-val_recall=0.8857.ckpt',
         input_dim=input_dim,
         #emb_model=backbone_model
         is_emb = True,
@@ -248,16 +237,13 @@ def inference(audio_path):
     # Run inference
     print(f"Segments shape: {segments.shape}")
    print("Running inference...")
-    results = run_inference(model, embedding, padding_mask,
+    results = run_inference(model, embedding, padding_mask, 'cuda')
 
     # Print the results
     print(f"Results: {results}")
+    asdf
 
-
-    if args.output_path:
-        with open(args.output_path, 'w') as f:
-            json.dump(results, f, indent=4)
-        print(f"Results saved to {args.output_path}")
+
 
     return results
 
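A minimal sketch of how the reworked entry point might be driven from a script. The example file name is taken from app.py's examples list, and an available CUDA device plus the checkpoints referenced above are assumptions.

from inference import inference

if __name__ == "__main__":
    # Assumes checkpoints/ contains the .ckpt files referenced in get_model()
    results = inference("example-ncs-light it up(human).mp3")
    print(results)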
model.py
ADDED
@@ -0,0 +1,1042 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Optional
import numpy as np
from pathlib import Path
import math
# from deepspeed.ops.adam import FusedAdam  # disabled due to compatibility issues


class MusicAudioClassifier(pl.LightningModule):
    def __init__(self,
                 input_dim: int,
                 hidden_dim: int = 256,
                 learning_rate: float = 1e-4,
                 emb_model: Optional[nn.Module] = None,
                 is_emb: bool = False,
                 backbone: str = 'segment_transformer',
                 num_classes: int = 2):
        super().__init__()
        self.save_hyperparameters()

        if backbone == 'segment_transformer':
            self.model = SegmentTransformer(
                input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_classes=num_classes,
                mode = 'both'
            )
        elif backbone == 'fusion_segment_transformer':
            self.model = FusionSegmentTransformer(
                input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_classes=num_classes
            )
        elif backbone == 'guided_segment_transformer':
            self.model = GuidedSegmentTransformer(
                input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_classes=num_classes
            )
        elif backbone == 'ultra_segment_processor':
            self.model = UltraModernSegmentProcessor(
                input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_classes=num_classes
            )
        self.emb_model = emb_model
        self.learning_rate = learning_rate
        self.is_emb = is_emb
        self.num_classes = num_classes

    def _process_audio_batch(self, x: torch.Tensor) -> torch.Tensor:
        B, S = x.shape[:2]  # [B, S, C, M, T] or [B, S, C, T] for wav, [B, S, 1?, embsize] for emb
        x = x.view(B*S, *x.shape[2:])  # [B*S, C, M, T]
        if self.is_emb == False:
            _, embeddings = self.emb_model(x)  # [B*S, emb_dim]
        else:
            embeddings = x
        if embeddings.dim() == 3:
            pooled_features = embeddings.mean(dim=1)  # transformer
        else:
            pooled_features = embeddings  # CCV..? no need to pooling
        return pooled_features.view(B, S, -1)  # [B, S, emb_dim]

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        x = self._process_audio_batch(x)  # using this with the extractor frozen is effectively the earlier version
        x = x.half()
        return self.model(x, mask)

    def _compute_loss_and_probs(self, y_hat: torch.Tensor, y: torch.Tensor):
        """Compute loss and probabilities based on number of classes"""
        if y_hat.size(0) == 1:
            y_hat_flat = y_hat.flatten()
            y_flat = y.flatten()
        else:
            y_hat_flat = y_hat.squeeze() if self.num_classes == 2 else y_hat
            y_flat = y

        if self.num_classes == 2:
            loss = F.binary_cross_entropy_with_logits(y_hat_flat, y_flat.float())
            probs = torch.sigmoid(y_hat_flat)
            preds = (probs > 0.5).long()
        else:
            loss = F.cross_entropy(y_hat_flat, y_flat.long())
            probs = F.softmax(y_hat_flat, dim=-1)
            preds = torch.argmax(y_hat_flat, dim=-1)

        return loss, probs, preds, y_flat.long()

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
        x, y, mask = batch
        x = x.half()
        y_hat = self(x, mask)

        loss, probs, preds, y_true = self._compute_loss_and_probs(y_hat, y)

        # Log only the simple batch loss (step level)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)

        # Store predictions and targets to compute epoch-level metrics
        if self.num_classes == 2:
            self.training_step_outputs.append({'preds': probs, 'targets': y_true, 'binary_preds': preds})
        else:
            self.training_step_outputs.append({'probs': probs, 'preds': preds, 'targets': y_true})

        return loss

    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> None:
        x, y, mask = batch
        x = x.half()
        y_hat = self(x, mask)

        loss, probs, preds, y_true = self._compute_loss_and_probs(y_hat, y)

        # Log only the simple batch loss (step level)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)

        # Store predictions and targets to compute epoch-level metrics
        if self.num_classes == 2:
            self.validation_step_outputs.append({'preds': probs, 'targets': y_true, 'binary_preds': preds})
        else:
            self.validation_step_outputs.append({'probs': probs, 'preds': preds, 'targets': y_true})

    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> None:
        x, y, mask = batch
        x = x.half()
        y_hat = self(x, mask)

        loss, probs, preds, y_true = self._compute_loss_and_probs(y_hat, y)

        # Log only the simple batch loss (step level)
        self.log('test_loss', loss, on_epoch=True, prog_bar=True)

        # Store predictions and targets to compute epoch-level metrics
        if self.num_classes == 2:
            self.test_step_outputs.append({'preds': probs, 'targets': y_true, 'binary_preds': preds})
        else:
            self.test_step_outputs.append({'probs': probs, 'preds': preds, 'targets': y_true})

    def on_train_epoch_start(self):
        # Reset the output buffer at the start of the epoch
        self.training_step_outputs = []

    def on_validation_epoch_start(self):
        # Reset the output buffer at the start of the epoch
        self.validation_step_outputs = []

    def on_test_epoch_start(self):
        # Reset the output buffer at the start of the epoch
        self.test_step_outputs = []

    def _compute_binary_metrics(self, outputs, prefix):
        """Binary classification metrics computation"""
        all_preds = torch.cat([x['preds'] for x in outputs])
        all_targets = torch.cat([x['targets'] for x in outputs])
        binary_preds = torch.cat([x['binary_preds'] for x in outputs])

        # Accuracy
        acc = (binary_preds == all_targets).float().mean()

        # Confusion-matrix components
        tp = torch.sum((binary_preds == 1) & (all_targets == 1)).float()
        fp = torch.sum((binary_preds == 1) & (all_targets == 0)).float()
        tn = torch.sum((binary_preds == 0) & (all_targets == 0)).float()
        fn = torch.sum((binary_preds == 0) & (all_targets == 1)).float()

        # Metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else torch.tensor(0.0).to(tp.device)
        recall = tp / (tp + fn) if (tp + fn) > 0 else torch.tensor(0.0).to(tp.device)
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else torch.tensor(0.0).to(tp.device)
        specificity = tn / (tn + fp) if (tn + fp) > 0 else torch.tensor(0.0).to(tn.device)

        # Logging
        self.log(f'{prefix}_acc', acc, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log(f'{prefix}_precision', precision, on_epoch=True, sync_dist=True)
        self.log(f'{prefix}_recall', recall, on_epoch=True, sync_dist=True)
        self.log(f'{prefix}_f1', f1, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log(f'{prefix}_specificity', specificity, on_epoch=True, sync_dist=True)

        if prefix in ['val', 'test']:
            # ROC-AUC (simple approximation)
            sorted_indices = torch.argsort(all_preds, descending=True)
            sorted_targets = all_targets[sorted_indices]

            n_pos = torch.sum(all_targets)
            n_neg = len(all_targets) - n_pos

            if n_pos > 0 and n_neg > 0:
                tpr_curve = torch.cumsum(sorted_targets, dim=0) / n_pos
                fpr_curve = torch.cumsum(1 - sorted_targets, dim=0) / n_neg

                width = fpr_curve[1:] - fpr_curve[:-1]
                height = (tpr_curve[1:] + tpr_curve[:-1]) / 2
                auc_approx = torch.sum(width * height)

                self.log(f'{prefix}_auc', auc_approx, on_epoch=True, sync_dist=True)

        if prefix == 'test':
            balanced_acc = (recall + specificity) / 2
            self.log('test_balanced_acc', balanced_acc, on_epoch=True)

    def _compute_multiclass_metrics(self, outputs, prefix):
        """Multi-class classification metrics computation"""
        all_probs = torch.cat([x['probs'] for x in outputs])
        all_preds = torch.cat([x['preds'] for x in outputs])
        all_targets = torch.cat([x['targets'] for x in outputs])

        # Overall accuracy
        acc = (all_preds == all_targets).float().mean()
        self.log(f'{prefix}_acc', acc, on_epoch=True, prog_bar=True, sync_dist=True)

        # Per-class metrics
        for class_idx in range(self.num_classes):
            # Binary metrics for each class
            class_targets = (all_targets == class_idx).long()
            class_preds = (all_preds == class_idx).long()

            tp = torch.sum((class_preds == 1) & (class_targets == 1)).float()
            fp = torch.sum((class_preds == 1) & (class_targets == 0)).float()
            tn = torch.sum((class_preds == 0) & (class_targets == 0)).float()
            fn = torch.sum((class_preds == 0) & (class_targets == 1)).float()

            precision = tp / (tp + fp) if (tp + fp) > 0 else torch.tensor(0.0).to(tp.device)
            recall = tp / (tp + fn) if (tp + fn) > 0 else torch.tensor(0.0).to(tp.device)
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else torch.tensor(0.0).to(tp.device)

            self.log(f'{prefix}_class_{class_idx}_precision', precision, on_epoch=True)
            self.log(f'{prefix}_class_{class_idx}_recall', recall, on_epoch=True)
            self.log(f'{prefix}_class_{class_idx}_f1', f1, on_epoch=True)

        # Macro-averaged F1 score
        class_f1_scores = []
        for class_idx in range(self.num_classes):
            class_targets = (all_targets == class_idx).long()
            class_preds = (all_preds == class_idx).long()

            tp = torch.sum((class_preds == 1) & (class_targets == 1)).float()
            fp = torch.sum((class_preds == 1) & (class_targets == 0)).float()
            fn = torch.sum((class_preds == 0) & (class_targets == 1)).float()

            precision = tp / (tp + fp) if (tp + fp) > 0 else torch.tensor(0.0).to(tp.device)
            recall = tp / (tp + fn) if (tp + fn) > 0 else torch.tensor(0.0).to(tp.device)
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else torch.tensor(0.0).to(tp.device)

            class_f1_scores.append(f1)

        macro_f1 = torch.stack(class_f1_scores).mean()
        self.log(f'{prefix}_macro_f1', macro_f1, on_epoch=True, prog_bar=True, sync_dist=True)

    def on_train_epoch_end(self):
        if not hasattr(self, 'training_step_outputs') or not self.training_step_outputs:
            return

        if self.num_classes == 2:
            self._compute_binary_metrics(self.training_step_outputs, 'train')
        else:
            self._compute_multiclass_metrics(self.training_step_outputs, 'train')

    def on_validation_epoch_end(self):
        if not hasattr(self, 'validation_step_outputs') or not self.validation_step_outputs:
            return

        if self.num_classes == 2:
            self._compute_binary_metrics(self.validation_step_outputs, 'val')
        else:
            self._compute_multiclass_metrics(self.validation_step_outputs, 'val')

    def on_test_epoch_end(self):
        if not hasattr(self, 'test_step_outputs') or not self.test_step_outputs:
            return

        if self.num_classes == 2:
            self._compute_binary_metrics(self.test_step_outputs, 'test')
        else:
            self._compute_multiclass_metrics(self.test_step_outputs, 'test')

    def configure_optimizers(self):
        # Use plain AdamW instead of FusedAdam (avoids a GLIBC compatibility issue)
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            weight_decay=0.01
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=100,  # Adjust based on your training epochs
            eta_min=1e-6
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss',
        }

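# --- Illustrative sketch (not part of model.py): one way this LightningModule
# --- could be trained on precomputed segment embeddings, using the
# --- pad_sequence_with_mask collate function defined just below.
# --- Dataset objects, batch size and trainer settings are assumptions.
#
# from torch.utils.data import DataLoader
# import pytorch_lightning as pl
#
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
#                           collate_fn=pad_sequence_with_mask)
# val_loader = DataLoader(val_dataset, batch_size=16,
#                         collate_fn=pad_sequence_with_mask)
# model = MusicAudioClassifier(input_dim=768, is_emb=True)
# trainer = pl.Trainer(max_epochs=100, precision=16)
# trainer.fit(model, train_loader, val_loader)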
| 301 |
+
def pad_sequence_with_mask(batch: List[Tuple[torch.Tensor, int]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 302 |
+
"""Collate function for DataLoader that creates padded sequences and attention masks with fixed length (48)."""
|
| 303 |
+
embeddings, labels = zip(*batch)
|
| 304 |
+
fixed_len = 48 # 고정 길이
|
| 305 |
+
|
| 306 |
+
batch_size = len(embeddings)
|
| 307 |
+
feat_dim = embeddings[0].shape[-1]
|
| 308 |
+
|
| 309 |
+
padded = torch.zeros((batch_size, fixed_len, feat_dim)) # 고정 길이로 패딩된 텐서
|
| 310 |
+
mask = torch.ones((batch_size, fixed_len), dtype=torch.bool) # True는 padding을 의미
|
| 311 |
+
|
| 312 |
+
for i, emb in enumerate(embeddings):
|
| 313 |
+
length = emb.shape[0]
|
| 314 |
+
|
| 315 |
+
# 길이가 고정 길이보다 길면 자르고, 짧으면 패딩
|
| 316 |
+
if length > fixed_len:
|
| 317 |
+
padded[i, :] = emb[:fixed_len] # fixed_len보다 긴 부분을 잘라서 채운다.
|
| 318 |
+
mask[i, :] = False
|
| 319 |
+
else:
|
| 320 |
+
padded[i, :length] = emb # 실제 데이터 길이에 맞게 채운다.
|
| 321 |
+
mask[i, :length] = False # 패딩이 아닌 부분은 False로 설정
|
| 322 |
+
|
| 323 |
+
return padded, torch.tensor(labels), mask
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class SegmentTransformer(nn.Module):
|
| 327 |
+
def __init__(self,
|
| 328 |
+
input_dim: int,
|
| 329 |
+
hidden_dim: int = 256,
|
| 330 |
+
num_heads: int = 8,
|
| 331 |
+
num_layers: int = 4,
|
| 332 |
+
dropout: float = 0.1,
|
| 333 |
+
max_sequence_length: int = 1000,
|
| 334 |
+
mode: str = 'both',
|
| 335 |
+
share_parameter: bool = False,
|
| 336 |
+
num_classes: int = 2):
|
| 337 |
+
super().__init__()
|
| 338 |
+
|
| 339 |
+
# Original sequence processing
|
| 340 |
+
self.input_projection = nn.Linear(input_dim, hidden_dim)
|
| 341 |
+
self.mode = mode
|
| 342 |
+
self.share_parameter = share_parameter
|
| 343 |
+
self.num_classes = num_classes
|
| 344 |
+
|
| 345 |
+
# Positional encoding
|
| 346 |
+
position = torch.arange(max_sequence_length).unsqueeze(1)
|
| 347 |
+
div_term = torch.exp(torch.arange(0, hidden_dim, 2) * (-np.log(10000.0) / hidden_dim))
|
| 348 |
+
pos_encoding = torch.zeros(max_sequence_length, hidden_dim)
|
| 349 |
+
pos_encoding[:, 0::2] = torch.sin(position * div_term)
|
| 350 |
+
pos_encoding[:, 1::2] = torch.cos(position * div_term)
|
| 351 |
+
self.register_buffer('pos_encoding', pos_encoding)
|
| 352 |
+
|
| 353 |
+
# Transformer for original sequence
|
| 354 |
+
encoder_layer = nn.TransformerEncoderLayer(
|
| 355 |
+
d_model=hidden_dim,
|
| 356 |
+
nhead=num_heads,
|
| 357 |
+
dim_feedforward=hidden_dim * 4,
|
| 358 |
+
dropout=dropout,
|
| 359 |
+
batch_first=True
|
| 360 |
+
)
|
| 361 |
+
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 362 |
+
self.sim_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 363 |
+
|
| 364 |
+
# Self-similarity stream processing
|
| 365 |
+
self.similarity_projection = nn.Sequential(
|
| 366 |
+
nn.Conv1d(1, hidden_dim // 2, kernel_size=3, padding=1),
|
| 367 |
+
nn.ReLU(),
|
| 368 |
+
nn.Conv1d(hidden_dim // 2, hidden_dim, kernel_size=3, padding=1),
|
| 369 |
+
nn.ReLU(),
|
| 370 |
+
nn.Dropout(dropout)
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
# Transformer for similarity stream
|
| 374 |
+
self.similarity_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 375 |
+
|
| 376 |
+
# Final classification head
|
| 377 |
+
self.classification_head_dim = hidden_dim * 2 if mode == 'both' else hidden_dim
|
| 378 |
+
|
| 379 |
+
# Output dimension based on number of classes
|
| 380 |
+
output_dim = 1 if num_classes == 2 else num_classes
|
| 381 |
+
|
| 382 |
+
self.classification_head = nn.Sequential(
|
| 383 |
+
nn.Linear(self.classification_head_dim, hidden_dim),
|
| 384 |
+
nn.LayerNorm(hidden_dim),
|
| 385 |
+
nn.ReLU(),
|
| 386 |
+
nn.Dropout(dropout),
|
| 387 |
+
nn.Linear(hidden_dim, hidden_dim // 2),
|
| 388 |
+
nn.LayerNorm(hidden_dim // 2),
|
| 389 |
+
nn.ReLU(),
|
| 390 |
+
nn.Dropout(dropout),
|
| 391 |
+
nn.Linear(hidden_dim // 2, output_dim)
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 395 |
+
batch_size, seq_len, _ = x.shape
|
| 396 |
+
|
| 397 |
+
# 1. Process original sequence
|
| 398 |
+
x = x.half()
|
| 399 |
+
x1 = self.input_projection(x)
|
| 400 |
+
x1 = x1 + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 401 |
+
x1 = self.transformer(x1, src_key_padding_mask=padding_mask) # padding_mask 사용
|
| 402 |
+
|
| 403 |
+
# 2. Calculate and process self-similarity
|
| 404 |
+
x_expanded = x.unsqueeze(2)
|
| 405 |
+
x_transposed = x.unsqueeze(1)
|
| 406 |
+
distances = torch.mean((x_expanded - x_transposed) ** 2, dim=-1)
|
| 407 |
+
similarity_matrix = torch.exp(-distances) # (batch_size, seq_len, seq_len)
|
| 408 |
+
|
| 409 |
+
# 자기 유사도 마스크 생성 및 적용 (각 시점에 대한 마스크 개별 적용)
|
| 410 |
+
if padding_mask is not None:
|
| 411 |
+
similarity_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2) # (batch_size, seq_len, seq_len)
|
| 412 |
+
similarity_matrix = similarity_matrix.masked_fill(similarity_mask, 0.0)
|
| 413 |
+
|
| 414 |
+
# Process similarity matrix row by row using Conv1d
|
| 415 |
+
x2 = similarity_matrix.unsqueeze(1) # (batch_size, 1, seq_len, seq_len)
|
| 416 |
+
x2 = x2.view(batch_size * seq_len, 1, seq_len) # Reshape for Conv1d
|
| 417 |
+
x2 = self.similarity_projection(x2) # (batch_size * seq_len, hidden_dim, seq_len)
|
| 418 |
+
x2 = x2.mean(dim=2) # Pool across sequence dimension
|
| 419 |
+
x2 = x2.view(batch_size, seq_len, -1) # Reshape back
|
| 420 |
+
|
| 421 |
+
x2 = x2 + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 422 |
+
if self.share_parameter:
|
| 423 |
+
x2 = self.transformer(x2, src_key_padding_mask=padding_mask)
|
| 424 |
+
else:
|
| 425 |
+
x2 = self.sim_transformer(x2, src_key_padding_mask=padding_mask) # padding_mask 사용
|
| 426 |
+
|
| 427 |
+
# 3. Global average pooling for both streams
|
| 428 |
+
if padding_mask is not None:
|
| 429 |
+
mask_expanded = (~padding_mask).float().unsqueeze(-1)
|
| 430 |
+
x1 = (x1 * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
|
| 431 |
+
x2 = (x2 * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
|
| 432 |
+
else:
|
| 433 |
+
x1 = x1.mean(dim=1)
|
| 434 |
+
x2 = x2.mean(dim=1)
|
| 435 |
+
|
| 436 |
+
# 4. Combine both streams and classify
|
| 437 |
+
if self.mode == 'only_emb':
|
| 438 |
+
x = x1
|
| 439 |
+
elif self.mode == 'only_structure':
|
| 440 |
+
x = x2
|
| 441 |
+
elif self.mode == 'both':
|
| 442 |
+
x = torch.cat([x1, x2], dim=-1)
|
| 443 |
+
x= x.half()
|
| 444 |
+
return self.classification_head(x)
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
class PairwiseGuidedTransformer(nn.Module):
|
| 448 |
+
"""Pairwise similarity matrix를 활용한 범용 transformer layer
|
| 449 |
+
|
| 450 |
+
Vision: patch간 유사도, NLP: token간 유사도, Audio: segment간 유사도 등에 활용 가능
|
| 451 |
+
"""
|
| 452 |
+
def __init__(self, d_model: int, num_heads: int = 8):
|
| 453 |
+
super().__init__()
|
| 454 |
+
self.d_model = d_model
|
| 455 |
+
self.num_heads = num_heads
|
| 456 |
+
|
| 457 |
+
# Standard Q, K projections
|
| 458 |
+
self.q_proj = nn.Linear(d_model, d_model)
|
| 459 |
+
self.k_proj = nn.Linear(d_model, d_model)
|
| 460 |
+
|
| 461 |
+
# Pairwise-guided V projection
|
| 462 |
+
self.v_proj = nn.Linear(d_model, d_model)
|
| 463 |
+
|
| 464 |
+
self.output_proj = nn.Linear(d_model, d_model)
|
| 465 |
+
self.norm = nn.LayerNorm(d_model)
|
| 466 |
+
|
| 467 |
+
def forward(self, x, pairwise_matrix, padding_mask=None):
|
| 468 |
+
"""
|
| 469 |
+
Args:
|
| 470 |
+
x: (batch, seq_len, d_model) - sequence embeddings
|
| 471 |
+
pairwise_matrix: (batch, seq_len, seq_len) - pairwise similarity/distance matrix
|
| 472 |
+
padding_mask: (batch, seq_len) - padding mask
|
| 473 |
+
"""
|
| 474 |
+
batch_size, seq_len, d_model = x.shape
|
| 475 |
+
|
| 476 |
+
# Standard Q, K, V
|
| 477 |
+
Q = self.q_proj(x)
|
| 478 |
+
K = self.k_proj(x)
|
| 479 |
+
V = self.v_proj(x)
|
| 480 |
+
|
| 481 |
+
# Reshape for multi-head
|
| 482 |
+
Q = Q.view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
|
| 483 |
+
K = K.view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
|
| 484 |
+
V = V.view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
|
| 485 |
+
|
| 486 |
+
# Standard attention scores
|
| 487 |
+
scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_model ** 0.5)
|
| 488 |
+
|
| 489 |
+
# ✅ Combine with pairwise matrix
|
| 490 |
+
#pairwise_expanded = pairwise_matrix.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
|
| 491 |
+
enhanced_scores = scores# + pairwise_expanded 이거 빼고 하기로 했죠?
|
| 492 |
+
|
| 493 |
+
# Apply padding mask
|
| 494 |
+
if padding_mask is not None:
|
| 495 |
+
mask_4d = padding_mask.unsqueeze(1).unsqueeze(1).expand(-1, self.num_heads, seq_len, -1)
|
| 496 |
+
enhanced_scores = enhanced_scores.masked_fill(mask_4d, float('-inf'))
|
| 497 |
+
|
| 498 |
+
# Softmax and apply to V
|
| 499 |
+
attn_weights = F.softmax(enhanced_scores, dim=-1)
|
| 500 |
+
attended = torch.matmul(attn_weights, V)
|
| 501 |
+
|
| 502 |
+
# Reshape and project
|
| 503 |
+
attended = attended.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
|
| 504 |
+
output = self.output_proj(attended)
|
| 505 |
+
|
| 506 |
+
return self.norm(x + output)
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
class MultiScaleAdaptivePooler(nn.Module):
|
| 510 |
+
"""Multi-scale adaptive pooling - 다양한 도메인에서 활용 가능"""
|
| 511 |
+
|
| 512 |
+
def __init__(self, hidden_dim: int, num_heads: int = 8):
|
| 513 |
+
super().__init__()
|
| 514 |
+
|
| 515 |
+
# Attention-based pooling
|
| 516 |
+
self.attention_pool = nn.MultiheadAttention(
|
| 517 |
+
hidden_dim, num_heads=num_heads, batch_first=True
|
| 518 |
+
)
|
| 519 |
+
self.query_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
|
| 520 |
+
|
| 521 |
+
# Complementary pooling strategies
|
| 522 |
+
self.max_pool_proj = nn.Linear(hidden_dim, hidden_dim)
|
| 523 |
+
|
| 524 |
+
self.fusion = nn.Linear(hidden_dim * 3, hidden_dim)
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def forward(self, x, padding_mask=None):
|
| 528 |
+
"""
|
| 529 |
+
Args:
|
| 530 |
+
x: (batch, seq_len, hidden_dim) - sequence features
|
| 531 |
+
padding_mask: (batch, seq_len) - padding mask
|
| 532 |
+
"""
|
| 533 |
+
batch_size = x.size(0)
|
| 534 |
+
|
| 535 |
+
# 1. Global average pooling
|
| 536 |
+
if padding_mask is not None:
|
| 537 |
+
mask_expanded = (~padding_mask).float().unsqueeze(-1)
|
| 538 |
+
global_avg = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
|
| 539 |
+
else:
|
| 540 |
+
global_avg = x.mean(dim=1)
|
| 541 |
+
|
| 542 |
+
# # 2. Global max pooling
|
| 543 |
+
# if padding_mask is not None:
|
| 544 |
+
# x_masked = x.clone()
|
| 545 |
+
# x_masked[padding_mask] = float('-inf')
|
| 546 |
+
# global_max = x_masked.max(dim=1)[0]
|
| 547 |
+
# else:
|
| 548 |
+
# global_max = x.max(dim=1)[0]
|
| 549 |
+
|
| 550 |
+
# global_max = self.max_pool_proj(global_max)
|
| 551 |
+
|
| 552 |
+
# # 3. Attention-based pooling
|
| 553 |
+
# query = self.query_token.expand(batch_size, -1, -1)
|
| 554 |
+
# attn_pooled, _ = self.attention_pool(
|
| 555 |
+
# query, x, x,
|
| 556 |
+
# key_padding_mask=padding_mask
|
| 557 |
+
# )
|
| 558 |
+
# attn_pooled = attn_pooled.squeeze(1)
|
| 559 |
+
|
| 560 |
+
# # 4. Fuse all pooling results
|
| 561 |
+
# #combined = torch.cat([global_avg, global_max, attn_pooled], dim=-1)
|
| 562 |
+
# #output = self.fusion(combined)
|
| 563 |
+
output = global_avg
|
| 564 |
+
return output
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
class GuidedSegmentTransformer(nn.Module):
|
| 568 |
+
def __init__(self,
|
| 569 |
+
input_dim: int,
|
| 570 |
+
hidden_dim: int = 256,
|
| 571 |
+
num_heads: int = 8,
|
| 572 |
+
num_layers: int = 4,
|
| 573 |
+
dropout: float = 0.1,
|
| 574 |
+
max_sequence_length: int = 1000,
|
| 575 |
+
mode: str = 'only_emb',
|
| 576 |
+
share_parameter: bool = False,
|
| 577 |
+
num_classes: int = 2):
|
| 578 |
+
super().__init__()
|
| 579 |
+
|
| 580 |
+
# Original sequence processing
|
| 581 |
+
self.input_projection = nn.Linear(input_dim, hidden_dim)
|
| 582 |
+
self.mode = mode
|
| 583 |
+
self.share_parameter = share_parameter
|
| 584 |
+
self.num_classes = num_classes
|
| 585 |
+
|
| 586 |
+
# Positional encoding
|
| 587 |
+
position = torch.arange(max_sequence_length).unsqueeze(1)
|
| 588 |
+
div_term = torch.exp(torch.arange(0, hidden_dim, 2) * (-np.log(10000.0) / hidden_dim))
|
| 589 |
+
pos_encoding = torch.zeros(max_sequence_length, hidden_dim)
|
| 590 |
+
pos_encoding[:, 0::2] = torch.sin(position * div_term)
|
| 591 |
+
pos_encoding[:, 1::2] = torch.cos(position * div_term)
|
| 592 |
+
self.register_buffer('pos_encoding', pos_encoding)
|
| 593 |
+
|
| 594 |
+
# ✅ Pairwise-guided transformer layers (범용적 이름)
|
| 595 |
+
self.pairwise_guided_layers = nn.ModuleList([
|
| 596 |
+
PairwiseGuidedTransformer(hidden_dim, num_heads)
|
| 597 |
+
for _ in range(num_layers)
|
| 598 |
+
])
|
| 599 |
+
|
| 600 |
+
# Pairwise matrix processing (기존 similarity processing)
|
| 601 |
+
self.pairwise_projection = nn.Sequential(
|
| 602 |
+
nn.Conv1d(1, hidden_dim // 2, kernel_size=3, padding=1),
|
| 603 |
+
nn.ReLU(),
|
| 604 |
+
nn.Conv1d(hidden_dim // 2, hidden_dim, kernel_size=3, padding=1),
|
| 605 |
+
nn.ReLU(),
|
| 606 |
+
nn.Dropout(dropout)
|
| 607 |
+
)
|
| 608 |
+
|
| 609 |
+
# ✅ Multi-scale adaptive pooling (범용적 이름)
|
| 610 |
+
self.adaptive_pooler = MultiScaleAdaptivePooler(hidden_dim, num_heads)
|
| 611 |
+
|
| 612 |
+
# Final classification head
|
| 613 |
+
self.classification_head_dim = hidden_dim * 2 if mode == 'both' else hidden_dim
|
| 614 |
+
output_dim = 1 if num_classes == 2 else num_classes
|
| 615 |
+
|
| 616 |
+
self.classification_head = nn.Sequential(
|
| 617 |
+
nn.Linear(self.classification_head_dim, hidden_dim),
|
| 618 |
+
nn.LayerNorm(hidden_dim),
|
| 619 |
+
nn.ReLU(),
|
| 620 |
+
nn.Dropout(dropout),
|
| 621 |
+
nn.Linear(hidden_dim, hidden_dim // 2),
|
| 622 |
+
nn.LayerNorm(hidden_dim // 2),
|
| 623 |
+
nn.ReLU(),
|
| 624 |
+
nn.Dropout(dropout),
|
| 625 |
+
nn.Linear(hidden_dim // 2, output_dim)
|
| 626 |
+
)
|
| 627 |
+
|
| 628 |
+
def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 629 |
+
batch_size, seq_len, _ = x.shape
|
| 630 |
+
|
| 631 |
+
# 1. Process sequence
|
| 632 |
+
x1 = self.input_projection(x)
|
| 633 |
+
x1 = x1 + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 634 |
+
|
| 635 |
+
# 2. Calculate pairwise matrix (can be similarity, distance, correlation, etc.)
|
| 636 |
+
x_expanded = x.unsqueeze(2)
|
| 637 |
+
x_transposed = x.unsqueeze(1)
|
| 638 |
+
distances = torch.mean((x_expanded - x_transposed) ** 2, dim=-1)
|
| 639 |
+
pairwise_matrix = torch.exp(-distances) # Convert distance to similarity
|
| 640 |
+
|
| 641 |
+
# Apply padding mask to pairwise matrix
|
| 642 |
+
if padding_mask is not None:
|
| 643 |
+
pairwise_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2)
|
| 644 |
+
pairwise_matrix = pairwise_matrix.masked_fill(pairwise_mask, 0.0)
|
| 645 |
+
|
| 646 |
+
# ✅ Pairwise-guided processing
|
| 647 |
+
for layer in self.pairwise_guided_layers:
|
| 648 |
+
x1 = layer(x1, pairwise_matrix, padding_mask)
|
| 649 |
+
|
| 650 |
+
# 3. Process pairwise matrix as separate stream (optional)
|
| 651 |
+
if self.mode in ['only_structure', 'both']:
|
| 652 |
+
x2 = pairwise_matrix.unsqueeze(1)
|
| 653 |
+
x2 = x2.view(batch_size * seq_len, 1, seq_len)
|
| 654 |
+
x2 = self.pairwise_projection(x2)
|
| 655 |
+
x2 = x2.mean(dim=2)
|
| 656 |
+
x2 = x2.view(batch_size, seq_len, -1)
|
| 657 |
+
x2 = x2 + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 658 |
+
|
| 659 |
+
# ✅ Multi-scale adaptive pooling
|
| 660 |
+
if self.mode == 'only_emb':
|
| 661 |
+
x = self.adaptive_pooler(x1, padding_mask)
|
| 662 |
+
elif self.mode == 'only_structure':
|
| 663 |
+
x = self.adaptive_pooler(x2, padding_mask)
|
| 664 |
+
elif self.mode == 'both':
|
| 665 |
+
x1_pooled = self.adaptive_pooler(x1, padding_mask)
|
| 666 |
+
x2_pooled = self.adaptive_pooler(x2, padding_mask)
|
| 667 |
+
x = torch.cat([x1_pooled, x2_pooled], dim=-1)
|
| 668 |
+
|
| 669 |
+
x = x
|
| 670 |
+
return self.classification_head(x)
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
class CrossModalFusionLayer(nn.Module):
|
| 674 |
+
"""Structure와 Embedding 정보를 점진적으로 융합"""
|
| 675 |
+
|
| 676 |
+
def __init__(self, d_model: int, num_heads: int = 8):
|
| 677 |
+
super().__init__()
|
| 678 |
+
|
| 679 |
+
# Cross-attention: embedding이 structure를 query하고, structure가 embedding을 query
|
| 680 |
+
self.emb_to_struct_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
|
| 681 |
+
self.struct_to_emb_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
|
| 682 |
+
|
| 683 |
+
# Fusion gate (어느 정보를 얼마나 믿을지)
|
| 684 |
+
self.fusion_gate = nn.Sequential(
|
| 685 |
+
nn.Linear(d_model * 2, d_model),
|
| 686 |
+
nn.Sigmoid()
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
self.norm1 = nn.LayerNorm(d_model)
|
| 690 |
+
self.norm2 = nn.LayerNorm(d_model)
|
| 691 |
+
|
| 692 |
+
def forward(self, emb_features, struct_features, padding_mask=None):
|
| 693 |
+
"""
|
| 694 |
+
emb_features: (batch, seq_len, d_model) - 메인 embedding 정보
|
| 695 |
+
struct_features: (batch, seq_len, d_model) - structure 정보
|
| 696 |
+
"""
|
| 697 |
+
|
| 698 |
+
# 1. Embedding이 Structure 정보를 참조
|
| 699 |
+
emb_enhanced, _ = self.emb_to_struct_attn(
|
| 700 |
+
emb_features, struct_features, struct_features,
|
| 701 |
+
key_padding_mask=padding_mask
|
| 702 |
+
)
|
| 703 |
+
emb_enhanced = self.norm1(emb_features + emb_enhanced)
|
| 704 |
+
|
| 705 |
+
# 2. Structure가 Embedding 정보를 참조
|
| 706 |
+
struct_enhanced, _ = self.struct_to_emb_attn(
|
| 707 |
+
struct_features, emb_features, emb_features,
|
| 708 |
+
key_padding_mask=padding_mask
|
| 709 |
+
)
|
| 710 |
+
struct_enhanced = self.norm2(struct_features + struct_enhanced)
|
| 711 |
+
|
| 712 |
+
# 3. Adaptive fusion (둘 중 어느 것을 더 믿을지 학습)
|
| 713 |
+
combined = torch.cat([emb_enhanced, struct_enhanced], dim=-1)
|
| 714 |
+
gate_weight = self.fusion_gate(combined) # (batch, seq_len, d_model)
|
| 715 |
+
|
| 716 |
+
# Gated combination
|
| 717 |
+
fused = gate_weight * emb_enhanced + (1 - gate_weight) * struct_enhanced
|
| 718 |
+
|
| 719 |
+
return fused
|
| 720 |
+
|
| 721 |
+
|
| 722 |
+
class FusionSegmentTransformer(nn.Module):
|
| 723 |
+
def __init__(self,
|
| 724 |
+
input_dim: int,
|
| 725 |
+
hidden_dim: int = 256,
|
| 726 |
+
num_heads: int = 8,
|
| 727 |
+
num_layers: int = 4,
|
| 728 |
+
dropout: float = 0.1,
|
| 729 |
+
max_sequence_length: int = 1000,
|
| 730 |
+
mode: str = 'both', # 기본값을 both로
|
| 731 |
+
share_parameter: bool = False,
|
| 732 |
+
num_classes: int = 2):
|
| 733 |
+
super().__init__()
|
| 734 |
+
|
| 735 |
+
self.input_projection = nn.Linear(input_dim, hidden_dim)
|
| 736 |
+
self.mode = mode
|
| 737 |
+
self.num_classes = num_classes
|
| 738 |
+
|
| 739 |
+
# Positional encoding
|
| 740 |
+
position = torch.arange(max_sequence_length).unsqueeze(1)
|
| 741 |
+
div_term = torch.exp(torch.arange(0, hidden_dim, 2) * (-np.log(10000.0) / hidden_dim))
|
| 742 |
+
pos_encoding = torch.zeros(max_sequence_length, hidden_dim)
|
| 743 |
+
pos_encoding[:, 0::2] = torch.sin(position * div_term)
|
| 744 |
+
pos_encoding[:, 1::2] = torch.cos(position * div_term)
|
| 745 |
+
self.register_buffer('pos_encoding', pos_encoding)
|
| 746 |
+
|
| 747 |
+
# ✅ Embedding stream: Pairwise-guided transformer
|
| 748 |
+
self.embedding_layers = nn.ModuleList([
|
| 749 |
+
PairwiseGuidedTransformer(hidden_dim, num_heads)
|
| 750 |
+
for _ in range(num_layers)
|
| 751 |
+
])
|
| 752 |
+
|
| 753 |
+
# ✅ Structure stream: Pairwise matrix processing
|
| 754 |
+
self.pairwise_projection = nn.Sequential(
|
| 755 |
+
nn.Conv1d(1, hidden_dim // 2, kernel_size=3, padding=1),
|
| 756 |
+
nn.ReLU(),
|
| 757 |
+
nn.Conv1d(hidden_dim // 2, hidden_dim, kernel_size=3, padding=1),
|
| 758 |
+
nn.ReLU(),
|
| 759 |
+
nn.Dropout(dropout)
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
# Structure transformer layers
|
| 763 |
+
self.structure_layers = nn.ModuleList([
|
| 764 |
+
nn.TransformerEncoderLayer(
|
| 765 |
+
d_model=hidden_dim,
|
| 766 |
+
nhead=num_heads,
|
| 767 |
+
dim_feedforward=hidden_dim * 4,
|
| 768 |
+
dropout=dropout,
|
| 769 |
+
batch_first=True
|
| 770 |
+
) for _ in range(num_layers // 2) # 절반만 사용
|
| 771 |
+
])
|
| 772 |
+
|
| 773 |
+
# ✅ Cross-modal fusion layers (핵심!)
|
| 774 |
+
self.fusion_layers = nn.ModuleList([
|
| 775 |
+
CrossModalFusionLayer(hidden_dim, num_heads)
|
| 776 |
+
for _ in range(1) # fusion은 하나만 써야 gate가 유의미해질듯
|
| 777 |
+
])
|
| 778 |
+
|
| 779 |
+
# Adaptive pooling
|
| 780 |
+
self.adaptive_pooler = MultiScaleAdaptivePooler(hidden_dim, num_heads)
|
| 781 |
+
|
| 782 |
+
# Final classification head (이제 단일 차원)
|
| 783 |
+
output_dim = 1 if num_classes == 2 else num_classes
|
| 784 |
+
|
| 785 |
+
self.classification_head = nn.Sequential(
|
| 786 |
+
nn.Linear(hidden_dim, hidden_dim), # 더 이상 concat 안함
|
| 787 |
+
nn.LayerNorm(hidden_dim),
|
| 788 |
+
nn.ReLU(),
|
| 789 |
+
nn.Dropout(dropout),
|
| 790 |
+
nn.Linear(hidden_dim, hidden_dim // 2),
|
| 791 |
+
nn.LayerNorm(hidden_dim // 2),
|
| 792 |
+
nn.ReLU(),
|
| 793 |
+
nn.Dropout(dropout),
|
| 794 |
+
nn.Linear(hidden_dim // 2, output_dim)
|
| 795 |
+
)
|
| 796 |
+
|
| 797 |
+
def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 798 |
+
batch_size, seq_len, _ = x.shape
|
| 799 |
+
|
| 800 |
+
# 1. Initialize both streams
|
| 801 |
+
x_emb = self.input_projection(x)
|
| 802 |
+
x_emb = x_emb + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 803 |
+
|
| 804 |
+
# 2. Calculate pairwise matrix
|
| 805 |
+
x_expanded = x.unsqueeze(2)
|
| 806 |
+
x_transposed = x.unsqueeze(1)
|
| 807 |
+
distances = torch.mean((x_expanded - x_transposed) ** 2, dim=-1)
|
| 808 |
+
pairwise_matrix = torch.exp(-distances)
|
| 809 |
+
|
| 810 |
+
if padding_mask is not None:
|
| 811 |
+
pairwise_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2)
|
| 812 |
+
pairwise_matrix = pairwise_matrix.masked_fill(pairwise_mask, 0.0)
|
| 813 |
+
|
| 814 |
+
# 3. Process structure stream
|
| 815 |
+
x_struct = pairwise_matrix.unsqueeze(1)
|
| 816 |
+
x_struct = x_struct.view(batch_size * seq_len, 1, seq_len)
|
| 817 |
+
x_struct = self.pairwise_projection(x_struct)
|
| 818 |
+
x_struct = x_struct.mean(dim=2)
|
| 819 |
+
x_struct = x_struct.view(batch_size, seq_len, -1)
|
| 820 |
+
x_struct = x_struct + self.pos_encoding[:seq_len].unsqueeze(0)
|
| 821 |
+
|
| 822 |
+
for struct_layer in self.structure_layers:
|
| 823 |
+
x_struct = struct_layer(x_struct, src_key_padding_mask=padding_mask)
|
| 824 |
+
|
| 825 |
+
# 4. Process embedding stream (with pairwise guidance)
|
| 826 |
+
for emb_layer in self.embedding_layers:
|
| 827 |
+
x_emb = emb_layer(x_emb, pairwise_matrix, padding_mask)
|
| 828 |
+
|
| 829 |
+
# ✅ 5. Progressive Cross-modal Fusion (핵심!)
|
| 830 |
+
fused = x_emb # 시작은 embedding에서
|
| 831 |
+
for fusion_layer in self.fusion_layers:
|
| 832 |
+
fused = fusion_layer(fused, x_struct, padding_mask)
|
| 833 |
+
# 이제 fused는 embedding + structure 정보를 모두 포함
|
| 834 |
+
|
| 835 |
+
# 6. Final pooling and classification
|
| 836 |
+
pooled = self.adaptive_pooler(fused, padding_mask)
|
| 837 |
+
|
| 838 |
+
pooled = pooled.half()
|
| 839 |
+
return self.classification_head(pooled)
|
| 840 |
+
|
| 841 |
+
import torch
|
| 842 |
+
import torch.nn as nn
|
| 843 |
+
import torch.nn.functional as F
|
| 844 |
+
import numpy as np
|
| 845 |
+
from typing import Optional
|
| 846 |
+
import math
|
| 847 |
+
|
| 848 |
+
class RMSNorm(nn.Module):
|
| 849 |
+
"""RMS Normalization - 안정적"""
|
| 850 |
+
def __init__(self, dim: int, eps: float = 1e-6):
|
| 851 |
+
super().__init__()
|
| 852 |
+
self.eps = eps
|
| 853 |
+
self.weight = nn.Parameter(torch.ones(dim))
|
| 854 |
+
|
| 855 |
+
def forward(self, x):
|
| 856 |
+
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
|
| 857 |
+
|
| 858 |
+
class SwiGLU(nn.Module):
|
| 859 |
+
"""SwiGLU Activation - 단순 버전"""
|
| 860 |
+
def __init__(self, dim: int):
|
| 861 |
+
super().__init__()
|
| 862 |
+
self.w1 = nn.Linear(dim, dim * 2, bias=False)
|
| 863 |
+
self.w2 = nn.Linear(dim, dim, bias=False)
|
| 864 |
+
|
| 865 |
+
def forward(self, x):
|
| 866 |
+
return self.w2(F.silu(self.w1(x)[:, :, :x.size(-1)])) # 차원 맞춤
|
| 867 |
+
|
| 868 |
+
class GroupedQueryAttention(nn.Module):
|
| 869 |
+
"""단순한 GQA - 에러 방지"""
|
| 870 |
+
def __init__(self, d_model: int, num_heads: int = 8):
|
| 871 |
+
super().__init__()
|
| 872 |
+
assert d_model % num_heads == 0
|
| 873 |
+
|
| 874 |
+
self.d_model = d_model
|
| 875 |
+
self.num_heads = num_heads
|
| 876 |
+
self.head_dim = d_model // num_heads
|
| 877 |
+
|
| 878 |
+
# 모든 projection을 동일한 차원으로
|
| 879 |
+
self.q_proj = nn.Linear(d_model, d_model, bias=False)
|
| 880 |
+
self.k_proj = nn.Linear(d_model, d_model, bias=False)
|
| 881 |
+
self.v_proj = nn.Linear(d_model, d_model, bias=False)
|
| 882 |
+
self.o_proj = nn.Linear(d_model, d_model, bias=False)
|
| 883 |
+
|
| 884 |
+
self.scale = 1.0 / math.sqrt(self.head_dim)
|
| 885 |
+
|
| 886 |
+
def forward(self, x, pairwise_matrix=None, padding_mask=None):
|
| 887 |
+
B, L, D = x.shape
|
| 888 |
+
|
| 889 |
+
Q = self.q_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
|
| 890 |
+
K = self.k_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
|
| 891 |
+
V = self.v_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
|
| 892 |
+
|
| 893 |
+
scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
|
| 894 |
+
|
| 895 |
+
if pairwise_matrix is not None:
|
| 896 |
+
scores = scores + pairwise_matrix.unsqueeze(1)
|
| 897 |
+
|
| 898 |
+
if padding_mask is not None:
|
| 899 |
+
mask_4d = padding_mask.unsqueeze(1).unsqueeze(1).expand(-1, self.num_heads, L, -1)
|
| 900 |
+
scores = scores.masked_fill(mask_4d, float('-inf'))
|
| 901 |
+
|
| 902 |
+
attn_weights = F.softmax(scores, dim=-1)
|
| 903 |
+
attn_output = torch.matmul(attn_weights, V)
|
| 904 |
+
|
| 905 |
+
attn_output = attn_output.transpose(1, 2).contiguous().view(B, L, D)
|
| 906 |
+
return self.o_proj(attn_output)
|
| 907 |
+
|
| 908 |
+
class SimpleModernLayer(nn.Module):
|
| 909 |
+
"""단순하고 안전한 모던 레이어"""
|
| 910 |
+
def __init__(self, d_model: int, num_heads: int = 8):
|
| 911 |
+
super().__init__()
|
| 912 |
+
|
| 913 |
+
# RMSNorm
|
| 914 |
+
self.norm1 = RMSNorm(d_model)
|
| 915 |
+
self.norm2 = RMSNorm(d_model)
|
| 916 |
+
|
| 917 |
+
# Attention
|
| 918 |
+
self.attention = GroupedQueryAttention(d_model, num_heads)
|
| 919 |
+
|
| 920 |
+
# Feed forward
|
| 921 |
+
self.ffn = SwiGLU(d_model)
|
| 922 |
+
|
| 923 |
+
def forward(self, x, pairwise_matrix=None, padding_mask=None):
|
| 924 |
+
# Attention with residual
|
| 925 |
+
normed_x = self.norm1(x)
|
| 926 |
+
attn_out = self.attention(normed_x, pairwise_matrix, padding_mask)
|
| 927 |
+
x = x + attn_out
|
| 928 |
+
|
| 929 |
+
# FFN with residual
|
| 930 |
+
normed_x2 = self.norm2(x)
|
| 931 |
+
ffn_out = self.ffn(normed_x2)
|
| 932 |
+
x = x + ffn_out
|
| 933 |
+
|
| 934 |
+
return x
|
| 935 |
+
|
| 936 |
+
class SimpleQuantumPooling(nn.Module):
|
| 937 |
+
"""단순한 어텐션 풀링"""
|
| 938 |
+
def __init__(self, d_model: int):
|
| 939 |
+
super().__init__()
|
| 940 |
+
|
| 941 |
+
# Three pooling methods
|
| 942 |
+
self.attention_pool = nn.MultiheadAttention(d_model, 8, batch_first=True)
|
| 943 |
+
self.query_token = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
|
| 944 |
+
|
| 945 |
+
# Combine the three pooled vectors
|
| 946 |
+
self.final_proj = nn.Linear(d_model * 3, d_model, bias=False)
|
| 947 |
+
|
| 948 |
+
def forward(self, x, padding_mask=None):
|
| 949 |
+
batch_size = x.size(0)
|
| 950 |
+
|
| 951 |
+
# 1. Average pooling
|
| 952 |
+
if padding_mask is not None:
|
| 953 |
+
mask_expanded = (~padding_mask).float().unsqueeze(-1)
|
| 954 |
+
avg_pooled = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
|
| 955 |
+
else:
|
| 956 |
+
avg_pooled = x.mean(dim=1)
|
| 957 |
+
|
| 958 |
+
# 2. Max pooling
|
| 959 |
+
if padding_mask is not None:
|
| 960 |
+
x_masked = x.clone()
|
| 961 |
+
x_masked[padding_mask] = float('-inf')
|
| 962 |
+
max_pooled = x_masked.max(dim=1)[0]
|
| 963 |
+
else:
|
| 964 |
+
max_pooled = x.max(dim=1)[0]
|
| 965 |
+
|
| 966 |
+
# 3. Attention pooling
|
| 967 |
+
query = self.query_token.expand(batch_size, -1, -1)
|
| 968 |
+
attn_pooled, _ = self.attention_pool(
|
| 969 |
+
query, x, x, key_padding_mask=padding_mask
|
| 970 |
+
)
|
| 971 |
+
attn_pooled = attn_pooled.squeeze(1)
|
| 972 |
+
|
| 973 |
+
# Concatenate and project the three pooled vectors
|
| 974 |
+
combined = torch.cat([avg_pooled, max_pooled, attn_pooled], dim=-1).half()  # hard cast to fp16; assumes the projection below runs in half precision at inference
|
| 975 |
+
return self.final_proj(combined)
|
| 976 |
+
|
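The masked average and max branches of the pooler above can be illustrated in isolation. This is a standalone sketch with made-up shapes that restates the same recipe on dummy tensors rather than calling the class itself:

import torch

# Dummy segment embeddings: 2 clips, 5 segments, 8 features each;
# segments marked True in the mask are padding.
x = torch.randn(2, 5, 8)
padding_mask = torch.tensor([[False, False, False, True, True],
                             [False, False, False, False, True]])

# Masked average pooling: sum the valid segments, divide by their count.
valid = (~padding_mask).float().unsqueeze(-1)            # [2, 5, 1]
avg_pooled = (x * valid).sum(dim=1) / valid.sum(dim=1)   # [2, 8]

# Masked max pooling: push padded positions to -inf before taking the max.
x_masked = x.clone()
x_masked[padding_mask] = float('-inf')
max_pooled = x_masked.max(dim=1)[0]                      # [2, 8]

print(avg_pooled.shape, max_pooled.shape)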
| 977 |
+
class UltraModernSegmentProcessor(nn.Module):
|
| 978 |
+
"""에러 없는 단순 버전 ✅"""
|
| 979 |
+
def __init__(self,
|
| 980 |
+
input_dim: int,
|
| 981 |
+
hidden_dim: int = 512,
|
| 982 |
+
num_heads: int = 8,
|
| 983 |
+
num_layers: int = 6,
|
| 984 |
+
dropout: float = 0.1,
|
| 985 |
+
max_sequence_length: int = 1000,
|
| 986 |
+
num_classes: int = 2):
|
| 987 |
+
super().__init__()
|
| 988 |
+
|
| 989 |
+
assert hidden_dim % num_heads == 0
|
| 990 |
+
|
| 991 |
+
self.hidden_dim = hidden_dim
|
| 992 |
+
self.input_projection = nn.Linear(input_dim, hidden_dim, bias=False)
|
| 993 |
+
|
| 994 |
+
# Modern transformer layers
|
| 995 |
+
self.layers = nn.ModuleList([
|
| 996 |
+
SimpleModernLayer(hidden_dim, num_heads)
|
| 997 |
+
for _ in range(num_layers)
|
| 998 |
+
])
|
| 999 |
+
|
| 1000 |
+
# Simple pooling
|
| 1001 |
+
self.pooler = SimpleQuantumPooling(hidden_dim)
|
| 1002 |
+
|
| 1003 |
+
# Classification head
|
| 1004 |
+
output_dim = 1 if num_classes == 2 else num_classes
|
| 1005 |
+
|
| 1006 |
+
self.classifier = nn.Sequential(
|
| 1007 |
+
nn.Linear(hidden_dim, hidden_dim // 2, bias=False),
|
| 1008 |
+
RMSNorm(hidden_dim // 2),
|
| 1009 |
+
nn.SiLU(),
|
| 1010 |
+
nn.Dropout(dropout),
|
| 1011 |
+
nn.Linear(hidden_dim // 2, hidden_dim // 4, bias=False),
|
| 1012 |
+
RMSNorm(hidden_dim // 4),
|
| 1013 |
+
nn.SiLU(),
|
| 1014 |
+
nn.Dropout(dropout),
|
| 1015 |
+
nn.Linear(hidden_dim // 4, output_dim, bias=False)
|
| 1016 |
+
)
|
| 1017 |
+
|
| 1018 |
+
def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 1019 |
+
# Input projection
|
| 1020 |
+
x_emb = self.input_projection(x)
|
| 1021 |
+
|
| 1022 |
+
# Compute the pairwise similarity matrix
|
| 1023 |
+
x_expanded = x.unsqueeze(2)
|
| 1024 |
+
x_transposed = x.unsqueeze(1)
|
| 1025 |
+
|
| 1026 |
+
# Use only mean squared Euclidean distance (keep it simple)
|
| 1027 |
+
distances = torch.mean((x_expanded - x_transposed) ** 2, dim=-1)
|
| 1028 |
+
pairwise_matrix = torch.exp(-distances)
|
| 1029 |
+
|
| 1030 |
+
if padding_mask is not None:
|
| 1031 |
+
pairwise_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2)
|
| 1032 |
+
pairwise_matrix = pairwise_matrix.masked_fill(pairwise_mask, 0.0)
|
| 1033 |
+
|
| 1034 |
+
# Pass through the transformer layers
|
| 1035 |
+
for layer in self.layers:
|
| 1036 |
+
x_emb = layer(x_emb, pairwise_matrix, padding_mask)
|
| 1037 |
+
|
| 1038 |
+
# Pooling
|
| 1039 |
+
pooled = self.pooler(x_emb, padding_mask)
|
| 1040 |
+
|
| 1041 |
+
# Classification
|
| 1042 |
+
return self.classifier(pooled)
|
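Before moving on to networks.py, here is a standalone sketch (dummy tensors, assumed feature size) of the RBF-style pairwise similarity that UltraModernSegmentProcessor.forward derives from mean squared distances and then zeroes out for padded segments:

import torch

x = torch.randn(2, 6, 16)                    # [batch, segments, features], sizes made up
padding_mask = torch.zeros(2, 6, dtype=torch.bool)
padding_mask[:, 5:] = True                   # last segment of each clip is padding

# Same recipe as forward(): similarity = exp(-mean squared distance).
distances = torch.mean((x.unsqueeze(2) - x.unsqueeze(1)) ** 2, dim=-1)  # [2, 6, 6]
pairwise_matrix = torch.exp(-distances)

# Zero out every pair that involves a padded segment.
pair_mask = padding_mask.unsqueeze(1) | padding_mask.unsqueeze(2)
pairwise_matrix = pairwise_matrix.masked_fill(pair_mask, 0.0)

print(pairwise_matrix.shape)                 # torch.Size([2, 6, 6])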
networks.py
ADDED
|
@@ -0,0 +1,560 @@
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import pytorch_lightning as pl
|
| 5 |
+
from transformers import AutoModel, AutoConfig
|
| 6 |
+
from transformers import Wav2Vec2Model, Wav2Vec2Processor, Data2VecAudioModel
|
| 7 |
+
import torchmetrics
|
| 8 |
+
|
| 9 |
+
class cnnblock(nn.Module):
|
| 10 |
+
def __init__(self, embed_dim=512):
|
| 11 |
+
super(cnnblock, self).__init__()
|
| 12 |
+
self.conv_block = nn.Sequential(
|
| 13 |
+
nn.Conv2d(1, 16, kernel_size=3, padding=1),
|
| 14 |
+
nn.ReLU(),
|
| 15 |
+
nn.MaxPool2d(2),
|
| 16 |
+
nn.Conv2d(16, 32, kernel_size=3, padding=1),
|
| 17 |
+
nn.ReLU(),
|
| 18 |
+
nn.MaxPool2d(2),
|
| 19 |
+
nn.AdaptiveAvgPool2d((4, 4))
|
| 20 |
+
)
|
| 21 |
+
self.projection = nn.Linear(32 * 4 * 4, embed_dim)
|
| 22 |
+
|
| 23 |
+
def forward(self, x):
|
| 24 |
+
x = self.conv_block(x)
|
| 25 |
+
B, C, H, W = x.shape
|
| 26 |
+
x = x.view(B, -1)
|
| 27 |
+
x = self.projection(x)
|
| 28 |
+
return x
|
| 29 |
+
|
| 30 |
+
class CrossAttention(nn.Module):
|
| 31 |
+
def __init__(self, embed_dim, num_heads):
|
| 32 |
+
super(CrossAttention, self).__init__()
|
| 33 |
+
self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
|
| 34 |
+
self.layer_norm1 = nn.LayerNorm(embed_dim)
|
| 35 |
+
self.layer_norm2 = nn.LayerNorm(embed_dim)
|
| 36 |
+
self.feed_forward = nn.Sequential(
|
| 37 |
+
nn.Linear(embed_dim, embed_dim * 4),
|
| 38 |
+
nn.ReLU(),
|
| 39 |
+
nn.Linear(embed_dim * 4, embed_dim)
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
def forward(self, x, cross_input):
|
| 43 |
+
attn_output, _ = self.multihead_attn(query=x, key=cross_input, value=cross_input)
|
| 44 |
+
x = self.layer_norm1(x + attn_output)
|
| 45 |
+
ff_output = self.feed_forward(x)
|
| 46 |
+
x = self.layer_norm2(x + ff_output)
|
| 47 |
+
return x
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class CrossAttn_Transformer(nn.Module):
|
| 51 |
+
def __init__(self, embed_dim=512, num_heads=8, num_layers=6, num_classes=2):
|
| 52 |
+
super(CrossAttn_Transformer, self).__init__()
|
| 53 |
+
|
| 54 |
+
self.cross_attention_layers = nn.ModuleList([
|
| 55 |
+
CrossAttention(embed_dim, num_heads) for _ in range(num_layers)
|
| 56 |
+
])
|
| 57 |
+
|
| 58 |
+
encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
|
| 59 |
+
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 60 |
+
|
| 61 |
+
self.classifier = nn.Sequential(
|
| 62 |
+
nn.LayerNorm(embed_dim),
|
| 63 |
+
nn.Linear(embed_dim, num_classes)
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
def forward(self, x, cross_attention_input):
|
| 67 |
+
self.attention_maps = []
|
| 68 |
+
for layer in self.cross_attention_layers:
|
| 69 |
+
x = layer(x, cross_attention_input)
|
| 70 |
+
|
| 71 |
+
x = x.permute(1, 0, 2)
|
| 72 |
+
x = self.transformer(x)
|
| 73 |
+
x = x.mean(dim=0)
|
| 74 |
+
x = self.classifier(x)
|
| 75 |
+
return x
|
| 76 |
+
|
| 77 |
+
class MERT(nn.Module):
|
| 78 |
+
def __init__(self, freeze_feature_extractor=True):
|
| 79 |
+
super(MERT, self).__init__()
|
| 80 |
+
config = AutoConfig.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
|
| 81 |
+
if not hasattr(config, "conv_pos_batch_norm"):
|
| 82 |
+
setattr(config, "conv_pos_batch_norm", False)
|
| 83 |
+
self.mert = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", config=config, trust_remote_code=True)
|
| 84 |
+
|
| 85 |
+
if freeze_feature_extractor:
|
| 86 |
+
self.freeze()
|
| 87 |
+
|
| 88 |
+
def forward(self, input_values):
|
| 89 |
+
with torch.no_grad():
|
| 90 |
+
outputs = self.mert(input_values, output_hidden_states=True)
|
| 91 |
+
hidden_states = torch.stack(outputs.hidden_states)
|
| 92 |
+
hidden_states = hidden_states.detach().clone().requires_grad_(True)
|
| 93 |
+
time_reduced = hidden_states.mean(dim=2)
|
| 94 |
+
time_reduced = time_reduced.permute(1, 0, 2)
|
| 95 |
+
return time_reduced
|
| 96 |
+
|
| 97 |
+
def freeze(self):
|
| 98 |
+
for param in self.mert.parameters():
|
| 99 |
+
param.requires_grad = False
|
| 100 |
+
|
| 101 |
+
def unfreeze(self):
|
| 102 |
+
for param in self.mert.parameters():
|
| 103 |
+
param.requires_grad = True
|
| 104 |
+
|
| 105 |
+
|
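To make the tensor flow of MERT.forward concrete without downloading the checkpoint, the sketch below mimics it on dummy hidden states. The count of 13 states (12 transformer layers plus the embedding output of MERT-v1-95M) and the frame count are assumptions made here, not values taken from the diff:

import torch

batch, frames, dim, n_states = 2, 100, 768, 13
hidden_states = [torch.randn(batch, frames, dim) for _ in range(n_states)]

stacked = torch.stack(hidden_states)            # [13, batch, frames, 768]
time_reduced = stacked.mean(dim=2)              # [13, batch, 768]  (average over time)
time_reduced = time_reduced.permute(1, 0, 2)    # [batch, 13, 768]  (one token per layer)
print(time_reduced.shape)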
| 106 |
+
class MERT_AudioCNN(pl.LightningModule):
|
| 107 |
+
def __init__(self, embed_dim=768, num_heads=8, num_layers=6, num_classes=2,
|
| 108 |
+
freeze_feature_extractor=False, learning_rate=2e-5, weight_decay=0.01):
|
| 109 |
+
super(MERT_AudioCNN, self).__init__()
|
| 110 |
+
self.save_hyperparameters()
|
| 111 |
+
self.feature_extractor = MERT(freeze_feature_extractor=freeze_feature_extractor)
|
| 112 |
+
self.cross_attention_layers = nn.ModuleList([
|
| 113 |
+
CrossAttention(embed_dim, num_heads) for _ in range(num_layers)
|
| 114 |
+
])
|
| 115 |
+
encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
|
| 116 |
+
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 117 |
+
self.classifier = nn.Sequential(
|
| 118 |
+
nn.LayerNorm(embed_dim),
|
| 119 |
+
nn.Linear(embed_dim, 256),
|
| 120 |
+
nn.BatchNorm1d(256),
|
| 121 |
+
nn.ReLU(),
|
| 122 |
+
nn.Dropout(0.3),
|
| 123 |
+
nn.Linear(256, num_classes)
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Metrics
|
| 127 |
+
self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 128 |
+
self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 129 |
+
self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 130 |
+
|
| 131 |
+
self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 132 |
+
self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 133 |
+
self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 134 |
+
|
| 135 |
+
self.learning_rate = learning_rate
|
| 136 |
+
self.weight_decay = weight_decay
|
| 137 |
+
|
| 138 |
+
def forward(self, input_values):
|
| 139 |
+
features = self.feature_extractor(input_values)
|
| 140 |
+
for layer in self.cross_attention_layers:
|
| 141 |
+
features = layer(features, features)
|
| 142 |
+
|
| 143 |
+
features = features.mean(dim=1).unsqueeze(1)
|
| 144 |
+
encoded = self.transformer(features)
|
| 145 |
+
encoded = encoded.mean(dim=1)
|
| 146 |
+
output = self.classifier(encoded)
|
| 147 |
+
return output, encoded
|
| 148 |
+
|
| 149 |
+
def training_step(self, batch, batch_idx):
|
| 150 |
+
x, y = batch
|
| 151 |
+
logits, _ = self(x)  # forward() returns (logits, encoded)
|
| 152 |
+
loss = F.cross_entropy(logits, y)
|
| 153 |
+
|
| 154 |
+
preds = torch.argmax(logits, dim=1)
|
| 155 |
+
self.train_acc(preds, y)
|
| 156 |
+
self.train_f1(preds, y)
|
| 157 |
+
|
| 158 |
+
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
|
| 159 |
+
self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 160 |
+
self.log('train_f1', self.train_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 161 |
+
|
| 162 |
+
return loss
|
| 163 |
+
|
| 164 |
+
def validation_step(self, batch, batch_idx):
|
| 165 |
+
x, y = batch
|
| 166 |
+
logits, _ = self(x)  # forward() returns (logits, encoded)
|
| 167 |
+
loss = F.cross_entropy(logits, y)
|
| 168 |
+
|
| 169 |
+
preds = torch.argmax(logits, dim=1)
|
| 170 |
+
self.val_acc(preds, y)
|
| 171 |
+
self.val_f1(preds, y)
|
| 172 |
+
|
| 173 |
+
self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
|
| 174 |
+
self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 175 |
+
self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 176 |
+
|
| 177 |
+
return loss
|
| 178 |
+
|
| 179 |
+
def test_step(self, batch, batch_idx):
|
| 180 |
+
x, y = batch
|
| 181 |
+
logits, _ = self(x)  # forward() returns (logits, encoded)
|
| 182 |
+
loss = F.cross_entropy(logits, y)
|
| 183 |
+
|
| 184 |
+
preds = torch.argmax(logits, dim=1)
|
| 185 |
+
self.test_acc(preds, y)
|
| 186 |
+
self.test_f1(preds, y)
|
| 187 |
+
|
| 188 |
+
self.log('test_loss', loss, on_step=False, on_epoch=True)
|
| 189 |
+
self.log('test_acc', self.test_acc, on_step=False, on_epoch=True)
|
| 190 |
+
self.log('test_f1', self.test_f1, on_step=False, on_epoch=True)
|
| 191 |
+
|
| 192 |
+
return loss
|
| 193 |
+
|
| 194 |
+
def configure_optimizers(self):
|
| 195 |
+
optimizer = torch.optim.AdamW(
|
| 196 |
+
self.parameters(),
|
| 197 |
+
lr=self.learning_rate,
|
| 198 |
+
weight_decay=self.weight_decay
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 202 |
+
optimizer,
|
| 203 |
+
mode='min',
|
| 204 |
+
factor=0.5,
|
| 205 |
+
patience=2,
|
| 206 |
+
verbose=True
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
return {
|
| 210 |
+
"optimizer": optimizer,
|
| 211 |
+
"lr_scheduler": {
|
| 212 |
+
"scheduler": scheduler,
|
| 213 |
+
"monitor": "val_loss",
|
| 214 |
+
"interval": "epoch",
|
| 215 |
+
"frequency": 1
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
def unfreeze_feature_extractor(self):
|
| 220 |
+
self.feature_extractor.unfreeze()
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
class Wav2vec_AudioCNN(pl.LightningModule):
|
| 224 |
+
def __init__(self, model_name="facebook/wav2vec2-base", embed_dim=512, num_heads=8,
|
| 225 |
+
num_layers=6, num_classes=2, freeze_feature_extractor=True,
|
| 226 |
+
learning_rate=2e-5, weight_decay=0.01):
|
| 227 |
+
super(Wav2vec_AudioCNN, self).__init__()
|
| 228 |
+
self.save_hyperparameters()
|
| 229 |
+
|
| 230 |
+
self.processor = Wav2Vec2Processor.from_pretrained(model_name)
|
| 231 |
+
self.feature_extractor = Wav2Vec2Model.from_pretrained(model_name)
|
| 232 |
+
if freeze_feature_extractor:
|
| 233 |
+
self.feature_extractor.freeze_feature_encoder()
|
| 234 |
+
|
| 235 |
+
self.projection = nn.Linear(self.feature_extractor.config.hidden_size, embed_dim)
|
| 236 |
+
self.decoder = CrossAttn_Transformer(embed_dim=embed_dim, num_heads=num_heads,
|
| 237 |
+
num_layers=num_layers, num_classes=num_classes)
|
| 238 |
+
|
| 239 |
+
# Metrics
|
| 240 |
+
self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 241 |
+
self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 242 |
+
self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 243 |
+
|
| 244 |
+
self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 245 |
+
self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 246 |
+
self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 247 |
+
|
| 248 |
+
self.learning_rate = learning_rate
|
| 249 |
+
self.weight_decay = weight_decay
|
| 250 |
+
|
| 251 |
+
def forward(self, x, cross_attention_input=None):
|
| 252 |
+
x = x.squeeze(1)
|
| 253 |
+
|
| 254 |
+
# Wav2Vec2 Feature Extraction
|
| 255 |
+
features = self.feature_extractor(x).last_hidden_state
|
| 256 |
+
features = self.projection(features)
|
| 257 |
+
|
| 258 |
+
if cross_attention_input is None:
|
| 259 |
+
cross_attention_input = features
|
| 260 |
+
|
| 261 |
+
x = self.decoder(features, cross_attention_input)
|
| 262 |
+
|
| 263 |
+
return x
|
| 264 |
+
|
| 265 |
+
def training_step(self, batch, batch_idx):
|
| 266 |
+
x, y = batch
|
| 267 |
+
logits = self(x)
|
| 268 |
+
loss = F.cross_entropy(logits, y)
|
| 269 |
+
|
| 270 |
+
preds = torch.argmax(logits, dim=1)
|
| 271 |
+
self.train_acc(preds, y)
|
| 272 |
+
self.train_f1(preds, y)
|
| 273 |
+
|
| 274 |
+
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
|
| 275 |
+
self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 276 |
+
self.log('train_f1', self.train_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 277 |
+
|
| 278 |
+
return loss
|
| 279 |
+
|
| 280 |
+
def validation_step(self, batch, batch_idx):
|
| 281 |
+
x, y = batch
|
| 282 |
+
logits = self(x)
|
| 283 |
+
loss = F.cross_entropy(logits, y)
|
| 284 |
+
|
| 285 |
+
preds = torch.argmax(logits, dim=1)
|
| 286 |
+
self.val_acc(preds, y)
|
| 287 |
+
self.val_f1(preds, y)
|
| 288 |
+
|
| 289 |
+
self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
|
| 290 |
+
self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 291 |
+
self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 292 |
+
|
| 293 |
+
return loss
|
| 294 |
+
|
| 295 |
+
def test_step(self, batch, batch_idx):
|
| 296 |
+
x, y = batch
|
| 297 |
+
logits = self(x)
|
| 298 |
+
loss = F.cross_entropy(logits, y)
|
| 299 |
+
|
| 300 |
+
preds = torch.argmax(logits, dim=1)
|
| 301 |
+
self.test_acc(preds, y)
|
| 302 |
+
self.test_f1(preds, y)
|
| 303 |
+
|
| 304 |
+
self.log('test_loss', loss, on_step=False, on_epoch=True)
|
| 305 |
+
self.log('test_acc', self.test_acc, on_step=False, on_epoch=True)
|
| 306 |
+
self.log('test_f1', self.test_f1, on_step=False, on_epoch=True)
|
| 307 |
+
|
| 308 |
+
return loss
|
| 309 |
+
|
| 310 |
+
def configure_optimizers(self):
|
| 311 |
+
optimizer = torch.optim.AdamW(
|
| 312 |
+
self.parameters(),
|
| 313 |
+
lr=self.learning_rate,
|
| 314 |
+
weight_decay=self.weight_decay
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 318 |
+
optimizer,
|
| 319 |
+
mode='min',
|
| 320 |
+
factor=0.5,
|
| 321 |
+
patience=2,
|
| 322 |
+
verbose=True
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
return {
|
| 326 |
+
"optimizer": optimizer,
|
| 327 |
+
"lr_scheduler": {
|
| 328 |
+
"scheduler": scheduler,
|
| 329 |
+
"monitor": "val_loss",
|
| 330 |
+
"interval": "epoch",
|
| 331 |
+
"frequency": 1
|
| 332 |
+
}
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
class Music2vec_AudioCNN(pl.LightningModule):
|
| 336 |
+
def __init__(self, embed_dim=512, num_heads=8, num_layers=6, num_classes=2,
|
| 337 |
+
learning_rate=2e-5, weight_decay=0.01):
|
| 338 |
+
super(Music2vec_AudioCNN, self).__init__()
|
| 339 |
+
self.save_hyperparameters()
|
| 340 |
+
|
| 341 |
+
self.feature_extractor = Music2vec(freeze_feature_extractor=True)
|
| 342 |
+
self.projection = nn.Linear(self.feature_extractor.music2vec.config.hidden_size, embed_dim)
|
| 343 |
+
self.decoder = CrossAttn_Transformer(embed_dim=embed_dim, num_heads=num_heads,
|
| 344 |
+
num_layers=num_layers, num_classes=num_classes)
|
| 345 |
+
|
| 346 |
+
# Metrics
|
| 347 |
+
self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 348 |
+
self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 349 |
+
self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 350 |
+
|
| 351 |
+
self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 352 |
+
self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 353 |
+
self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 354 |
+
|
| 355 |
+
self.learning_rate = learning_rate
|
| 356 |
+
self.weight_decay = weight_decay
|
| 357 |
+
|
| 358 |
+
def forward(self, x, cross_attention_input=None):
|
| 359 |
+
x = x.squeeze(1)
|
| 360 |
+
features = self.feature_extractor(x)
|
| 361 |
+
features = self.projection(features)
|
| 362 |
+
|
| 363 |
+
if cross_attention_input is None:
|
| 364 |
+
cross_attention_input = features
|
| 365 |
+
|
| 366 |
+
x = self.decoder(features.unsqueeze(1), cross_attention_input.unsqueeze(1))
|
| 367 |
+
return x
|
| 368 |
+
|
| 369 |
+
def training_step(self, batch, batch_idx):
|
| 370 |
+
x, y = batch
|
| 371 |
+
logits = self(x)
|
| 372 |
+
loss = F.cross_entropy(logits, y)
|
| 373 |
+
|
| 374 |
+
preds = torch.argmax(logits, dim=1)
|
| 375 |
+
self.train_acc(preds, y)
|
| 376 |
+
self.train_f1(preds, y)
|
| 377 |
+
|
| 378 |
+
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
|
| 379 |
+
self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 380 |
+
self.log('train_f1', self.train_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 381 |
+
|
| 382 |
+
return loss
|
| 383 |
+
|
| 384 |
+
def validation_step(self, batch, batch_idx):
|
| 385 |
+
x, y = batch
|
| 386 |
+
logits = self(x)
|
| 387 |
+
loss = F.cross_entropy(logits, y)
|
| 388 |
+
|
| 389 |
+
preds = torch.argmax(logits, dim=1)
|
| 390 |
+
self.val_acc(preds, y)
|
| 391 |
+
self.val_f1(preds, y)
|
| 392 |
+
|
| 393 |
+
self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
|
| 394 |
+
self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 395 |
+
self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 396 |
+
|
| 397 |
+
return loss
|
| 398 |
+
|
| 399 |
+
def test_step(self, batch, batch_idx):
|
| 400 |
+
x, y = batch
|
| 401 |
+
logits = self(x)
|
| 402 |
+
loss = F.cross_entropy(logits, y)
|
| 403 |
+
|
| 404 |
+
preds = torch.argmax(logits, dim=1)
|
| 405 |
+
self.test_acc(preds, y)
|
| 406 |
+
self.test_f1(preds, y)
|
| 407 |
+
|
| 408 |
+
self.log('test_loss', loss, on_step=False, on_epoch=True)
|
| 409 |
+
self.log('test_acc', self.test_acc, on_step=False, on_epoch=True)
|
| 410 |
+
self.log('test_f1', self.test_f1, on_step=False, on_epoch=True)
|
| 411 |
+
|
| 412 |
+
return loss
|
| 413 |
+
|
| 414 |
+
def configure_optimizers(self):
|
| 415 |
+
optimizer = torch.optim.AdamW(
|
| 416 |
+
self.parameters(),
|
| 417 |
+
lr=self.learning_rate,
|
| 418 |
+
weight_decay=self.weight_decay
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 422 |
+
optimizer,
|
| 423 |
+
mode='min',
|
| 424 |
+
factor=0.5,
|
| 425 |
+
patience=2,
|
| 426 |
+
verbose=True
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
return {
|
| 430 |
+
"optimizer": optimizer,
|
| 431 |
+
"lr_scheduler": {
|
| 432 |
+
"scheduler": scheduler,
|
| 433 |
+
"monitor": "val_loss",
|
| 434 |
+
"interval": "epoch",
|
| 435 |
+
"frequency": 1
|
| 436 |
+
}
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
class AudioCNN(pl.LightningModule):
|
| 440 |
+
def __init__(self, embed_dim=512, num_heads=8, num_layers=6, num_classes=2,
|
| 441 |
+
learning_rate=2e-5, weight_decay=0.01):
|
| 442 |
+
super(AudioCNN, self).__init__()
|
| 443 |
+
self.save_hyperparameters()
|
| 444 |
+
|
| 445 |
+
self.encoder = cnnblock(embed_dim=embed_dim)
|
| 446 |
+
self.decoder = CrossAttn_Transformer(embed_dim=embed_dim, num_heads=num_heads,
|
| 447 |
+
num_layers=num_layers, num_classes=num_classes)
|
| 448 |
+
|
| 449 |
+
# Metrics
|
| 450 |
+
self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 451 |
+
self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 452 |
+
self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
|
| 453 |
+
|
| 454 |
+
self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 455 |
+
self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 456 |
+
self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
|
| 457 |
+
|
| 458 |
+
self.learning_rate = learning_rate
|
| 459 |
+
self.weight_decay = weight_decay
|
| 460 |
+
|
| 461 |
+
def forward(self, x, cross_attention_input=None):
|
| 462 |
+
x = self.encoder(x)
|
| 463 |
+
x = x.unsqueeze(1)
|
| 464 |
+
if cross_attention_input is None:
|
| 465 |
+
cross_attention_input = x
|
| 466 |
+
x = self.decoder(x, cross_attention_input)
|
| 467 |
+
return x
|
| 468 |
+
|
| 469 |
+
def training_step(self, batch, batch_idx):
|
| 470 |
+
x, y = batch
|
| 471 |
+
logits = self(x)
|
| 472 |
+
loss = F.cross_entropy(logits, y)
|
| 473 |
+
|
| 474 |
+
preds = torch.argmax(logits, dim=1)
|
| 475 |
+
self.train_acc(preds, y)
|
| 476 |
+
self.train_f1(preds, y)
|
| 477 |
+
|
| 478 |
+
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
|
| 479 |
+
self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 480 |
+
self.log('train_f1', self.train_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 481 |
+
|
| 482 |
+
return loss
|
| 483 |
+
|
| 484 |
+
def validation_step(self, batch, batch_idx):
|
| 485 |
+
x, y = batch
|
| 486 |
+
logits = self(x)
|
| 487 |
+
loss = F.cross_entropy(logits, y)
|
| 488 |
+
|
| 489 |
+
preds = torch.argmax(logits, dim=1)
|
| 490 |
+
self.val_acc(preds, y)
|
| 491 |
+
self.val_f1(preds, y)
|
| 492 |
+
|
| 493 |
+
self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
|
| 494 |
+
self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
|
| 495 |
+
self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)
|
| 496 |
+
|
| 497 |
+
return loss
|
| 498 |
+
|
| 499 |
+
def test_step(self, batch, batch_idx):
|
| 500 |
+
x, y = batch
|
| 501 |
+
logits = self(x)
|
| 502 |
+
loss = F.cross_entropy(logits, y)
|
| 503 |
+
|
| 504 |
+
preds = torch.argmax(logits, dim=1)
|
| 505 |
+
self.test_acc(preds, y)
|
| 506 |
+
self.test_f1(preds, y)
|
| 507 |
+
|
| 508 |
+
self.log('test_loss', loss, on_step=False, on_epoch=True)
|
| 509 |
+
self.log('test_acc', self.test_acc, on_step=False, on_epoch=True)
|
| 510 |
+
self.log('test_f1', self.test_f1, on_step=False, on_epoch=True)
|
| 511 |
+
|
| 512 |
+
return loss
|
| 513 |
+
|
| 514 |
+
def configure_optimizers(self):
|
| 515 |
+
optimizer = torch.optim.AdamW(
|
| 516 |
+
self.parameters(),
|
| 517 |
+
lr=self.learning_rate,
|
| 518 |
+
weight_decay=self.weight_decay
|
| 519 |
+
)
|
| 520 |
+
|
| 521 |
+
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 522 |
+
optimizer,
|
| 523 |
+
mode='min',
|
| 524 |
+
factor=0.5,
|
| 525 |
+
patience=2,
|
| 526 |
+
verbose=True
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
return {
|
| 530 |
+
"optimizer": optimizer,
|
| 531 |
+
"lr_scheduler": {
|
| 532 |
+
"scheduler": scheduler,
|
| 533 |
+
"monitor": "val_loss",
|
| 534 |
+
"interval": "epoch",
|
| 535 |
+
"frequency": 1
|
| 536 |
+
}
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
# Required helper classes
|
| 541 |
+
class Music2vec(nn.Module):
|
| 542 |
+
def __init__(self, freeze_feature_extractor=True):
|
| 543 |
+
super(Music2vec, self).__init__()
|
| 544 |
+
self.processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")
|
| 545 |
+
self.music2vec = Data2VecAudioModel.from_pretrained("m-a-p/music2vec-v1")
|
| 546 |
+
|
| 547 |
+
if freeze_feature_extractor:
|
| 548 |
+
for param in self.music2vec.parameters():
|
| 549 |
+
param.requires_grad = False
|
| 550 |
+
self.conv1d = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
|
| 551 |
+
|
| 552 |
+
def forward(self, input_values):
|
| 553 |
+
input_values = input_values.squeeze(1)
|
| 554 |
+
with torch.no_grad():
|
| 555 |
+
outputs = self.music2vec(input_values, output_hidden_states=True)
|
| 556 |
+
hidden_states = torch.stack(outputs.hidden_states)
|
| 557 |
+
time_reduced = hidden_states.mean(dim=2)
|
| 558 |
+
time_reduced = time_reduced.permute(1, 0, 2)
|
| 559 |
+
weighted_avg = self.conv1d(time_reduced).squeeze(1)
|
| 560 |
+
return weighted_avg
|
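As a final smoke test of the baseline model in this file, the sketch below (assuming the Space's dependencies are installed and the networks.py added in this commit imports cleanly) instantiates AudioCNN on a dummy single-channel 2-D input, e.g. a spectrogram patch, and checks that it yields one logit pair per clip; the sizes are made up:

import torch
from networks import AudioCNN  # class defined above in this diff

model = AudioCNN(embed_dim=512, num_heads=8, num_layers=2, num_classes=2)
model.eval()

x = torch.randn(2, 1, 128, 128)     # [batch, channel, height, width]
with torch.no_grad():
    logits = model(x)
print(logits.shape)                 # torch.Size([2, 2])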