waidhoferj committed
Commit 3a0f0a5
1 Parent(s): b6800ef

added decision tree

environment.yml CHANGED
@@ -9,6 +9,7 @@ dependencies:
  - pytorch
  - torchaudio
  - torchvision
+ - librosa
  - numpy
  - pandas
  - seaborn
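
librosa is the one new dependency; it supplies the tempo estimation and MFCC features used by the new decision tree model below. A minimal sketch of the calls it enables (the sample path is hypothetical):

import librosa

# Load a clip, estimate a global tempo (bpm), and compute 20 MFCC coefficients.
waveform, sr = librosa.load("data/samples/example.wav", mono=True, sr=16000)  # hypothetical file
tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=20)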
models/audio_spectrogram_transformer.py CHANGED
@@ -1,4 +1,4 @@
- from transformers import ASTFeatureExtractor, AutoFeatureExtractor, ASTConfig, AutoModelForAudioClassification, TrainingArguments, Trainer
+ from transformers import ASTModel, AutoFeatureExtractor, ASTConfig, AutoModelForAudioClassification, TrainingArguments, Trainer
  import torch
  from torch import nn
  from sklearn.utils.class_weight import compute_class_weight
@@ -7,6 +7,53 @@ import numpy as np

  accuracy = evaluate.load("accuracy")

+
+ class MultiModalAST(nn.Module):
+
+     def __init__(self, labels, sample_rate, *args, **kwargs) -> None:
+         super().__init__(*args, **kwargs)
+         id2label, label2id = get_id_label_mapping(labels)
+         model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
+         self.ast_feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
+
+         self.ast_model = ASTModel.from_pretrained(
+             model_checkpoint,
+             num_labels=len(label2id),
+             label2id=label2id,
+             id2label=id2label,
+             ignore_mismatched_sizes=True
+         )
+         self.sample_rate = sample_rate
+
+         self.bpm_model = nn.Sequential(
+             nn.Linear(len(labels), 100),
+             nn.Linear(100, 50)
+         )
+
+         out_dim = 50  # TODO: Calculate output dimension
+         self.classifier = nn.Sequential(
+             nn.Linear(out_dim, 100),
+             nn.Linear(100, len(labels))
+         )
+
+     def vectorize_bpm(self, waveform):
+         pass
+
+     def forward(self, audio):
+         bpm_vector = self.vectorize_bpm(audio)
+         bpm_out = self.bpm_model(bpm_vector)
+
+         spectrogram = self.ast_feature_extractor(audio)
+         ast_out = self.ast_model(spectrogram)
+
+         # Late fusion
+         z = torch.cat([ast_out, bpm_out])  # Which dimension?
+         return self.classifier(z)
+
+
  def compute_metrics(eval_pred):
      predictions = np.argmax(eval_pred.predictions, axis=1)
      return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
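
vectorize_bpm and the fusion dimension are left as TODOs in this commit. One possible direction, sketched here for illustration only (vectorize_bpm_sketch, n_buckets, and bpm_range are hypothetical names, not part of the repo), is to bucket the estimated tempo into a fixed-size one-hot vector and fuse along the feature axis:

import librosa
import torch

def vectorize_bpm_sketch(waveform, sample_rate, n_buckets=15, bpm_range=(60, 210)):
    """Hypothetical helper: bucket the estimated tempo into a one-hot vector.

    Choosing n_buckets equal to len(labels) would keep the output compatible
    with the nn.Linear(len(labels), 100) input of bpm_model above.
    """
    tempo, _ = librosa.beat.beat_track(y=waveform, sr=sample_rate)
    lo, hi = bpm_range
    bucket = int((min(max(float(tempo), lo), hi - 1) - lo) / (hi - lo) * n_buckets)
    vector = torch.zeros(n_buckets)
    vector[bucket] = 1.0
    return vector

# With late fusion along the last dimension, z = torch.cat([ast_out, bpm_out], dim=-1)
# and out_dim becomes the sum of the two branch widths.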
models/config/train_local.yaml ADDED
@@ -0,0 +1,47 @@
+ global:
+   id: decision_tree
+   device: mps
+   seed: 42
+   dance_ids:
+     - ATN
+     - BCH
+     - CHA
+     - ECS
+     - HST
+     - JIV
+     - QST
+     - RMB
+     - SFT
+     - SLS
+     - SMB
+     - SWZ
+     - TGO
+     - VWZ
+     - WCS
+ data_module:
+   song_data_path: data/songs_cleaned.csv
+   song_audio_path: data/samples
+   batch_size: 32
+   num_workers: 4
+   min_votes: 1
+   dataset_kwargs:
+     audio_window_duration: 6
+     audio_window_jitter: 1.5
+     audio_pipeline_kwargs:
+       mask_count: 0 # Don't mask the data
+       snr_mean: 15.0 # Pretty much eliminate the noise
+       freq_mask_size: 10
+       time_mask_size: 80
+
+ trainer:
+   log_every_n_steps: 15
+   accelerator: gpu
+   max_epochs: 50
+   min_epochs: 5
+   fast_dev_run: False
+   # gradient_clip_val: 0.5
+   # overfit_batches: 1
+ training_environment:
+   learning_rate: 0.00053
+ model:
+   n_channels: 128
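
For reference, a minimal sketch of reading this new config with PyYAML (train.py's get_config presumably does the equivalent); the decision tree path only consumes the global and data_module blocks:

import yaml

with open("models/config/train_local.yaml") as f:
    config = yaml.safe_load(f)

assert config["global"]["id"] == "decision_tree"          # routes to train_decision_tree
device = config["global"]["device"]                       # "mps" in this file
song_data_path = config["data_module"]["song_data_path"]  # data/songs_cleaned.csv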
models/decision_tree.py ADDED
@@ -0,0 +1,124 @@
+
+ from sklearn.base import ClassifierMixin, BaseEstimator
+ import pandas as pd
+ from torch import nn
+ import torch
+ from typing import Iterator
+ import numpy as np
+ import json
+ from tqdm import tqdm
+ import librosa
+
+ DANCE_INFO_FILE = "data/dance_info.csv"
+ dance_info_df = pd.read_csv(
+     DANCE_INFO_FILE,
+     converters={"tempoRange": lambda s: json.loads(s.replace("'", '"'))},
+ )
+
+
+ class DanceTreeClassifier(BaseEstimator, ClassifierMixin):
+     """
+     Trains one binary classifier per dance and only consults the classifiers whose
+     dance's tempo range contains the song's bpm.
+
+     Features:
+         - Spectrogram (MFCC windows)
+         - BPM
+     """
+
+     def __init__(self, device="cpu", lr=1e-4, epochs=5, verbose=True) -> None:
+         self.device = device
+         self.epochs = epochs
+         self.verbose = verbose
+         self.lr = lr
+         self.classifiers = {}
+         self.optimizers = {}
+         self.criterion = nn.BCELoss()
+
+     def get_valid_dances_from_bpm(self, bpm: float) -> list[str]:
+         mask = dance_info_df["tempoRange"].apply(
+             lambda interval: interval["min"] <= bpm <= interval["max"]
+         )
+         return list(dance_info_df["id"][mask])
+
+     def fit(self, x, y):
+         """
+         x: iterable of (spec, bpm) pairs, as produced by features_from_path.
+            spec shape should be (channel, freq_bins, sr * time).
+         y: iterable of dance label strings, one per (spec, bpm) pair.
+         """
+         # x may be a lazy generator; materialize it so every epoch sees the data.
+         examples = list(zip(x, y))
+         progress_bar = tqdm(range(self.epochs))
+         for _ in progress_bar:
+             # TODO: Introduce batches
+             epoch_loss = 0
+             pred_count = 0
+             for (spec, bpm), label in examples:
+                 # Find all dances whose tempo range contains this bpm.
+                 matching_dances = self.get_valid_dances_from_bpm(bpm)
+                 # Lazily create a classifier/optimizer pair for each new dance.
+                 for dance in matching_dances:
+                     if dance not in self.classifiers or dance not in self.optimizers:
+                         classifier = DanceCNN().to(self.device)
+                         self.classifiers[dance] = classifier
+                         self.optimizers[dance] = torch.optim.Adam(classifier.parameters(), lr=self.lr)
+                 # Convert the spectrogram once, outside the per-model loop.
+                 spec_t = torch.from_numpy(spec).to(self.device)
+                 models = [
+                     (dance, model, self.optimizers[dance])
+                     for dance, model in self.classifiers.items()
+                     if dance in matching_dances
+                 ]
+                 for dance, model, opt in models:
+                     opt.zero_grad()
+                     output = model(spec_t)
+                     # Positive target only for the dance that matches the label.
+                     target = torch.tensor([float(dance == label)], device=self.device)
+                     loss = self.criterion(output, target)
+                     epoch_loss += loss.item()
+                     pred_count += 1
+                     loss.backward()
+                     opt.step()
+             progress_bar.set_description(f"Loss: {epoch_loss / pred_count}")
+
+     def predict(self, x) -> list[str]:
+         results = []
+         for spec, bpm in x:
+             spec_t = torch.from_numpy(spec).to(self.device)
+             # Only score dances that fit the bpm and have a trained classifier.
+             matching_dances = [d for d in self.get_valid_dances_from_bpm(bpm) if d in self.classifiers]
+             with torch.no_grad():
+                 scores = [self.classifiers[dance](spec_t).item() for dance in matching_dances]
+             results.append(matching_dances[int(np.argmax(scores))])
+         return results
+
+
+ class DanceCNN(nn.Module):
+     def __init__(self, sr=16000, freq_bins=20, duration=6, *args, **kwargs) -> None:
+         super().__init__(*args, **kwargs)
+         kernel_size = (3, 9)
+         self.cnn = nn.Sequential(
+             nn.Conv2d(1, 16, kernel_size=kernel_size),
+             nn.ReLU(),
+             nn.MaxPool2d((2, 10)),
+             nn.Conv2d(16, 32, kernel_size=kernel_size),
+             nn.ReLU(),
+             nn.MaxPool2d((2, 10))
+         )
+
+         embedding_dimension = 32 * 3 * 959
+         self.classifier = nn.Sequential(
+             nn.Linear(embedding_dimension, 200),
+             nn.ReLU(),
+             nn.Linear(200, 1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x):
+         x = self.cnn(x)
+         # Flatten to (features,) for a single example or (batch, features) for a batch.
+         x = x.flatten() if len(x.shape) == 3 else x.flatten(1)
+         return self.classifier(x)
+
+
+ def features_from_path(paths: list[str],
+                        audio_window_duration=6,
+                        audio_duration=30,
+                        resample_freq=16000) -> Iterator[tuple[np.array, float]]:
+     """
+     Yields (mfcc_window, bpm) pairs for each audio path, one pair per window.
+     """
+     for path in paths:
+         waveform, sr = librosa.load(path, mono=True, sr=resample_freq)
+         num_frames = audio_window_duration * sr
+         tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
+         mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=20)
+         mfccs_normalized = (mfccs - mfccs.mean()) / mfccs.std()
+         mfccs_padded = librosa.util.fix_length(mfccs_normalized, size=sr * audio_duration, axis=1)
+         mfccs_reshaped = mfccs_padded.reshape(1, mfccs_padded.shape[0], mfccs_padded.shape[1])
+         for i in range(audio_duration // audio_window_duration):
+             mfcc_window = mfccs_reshaped[:, :, i * num_frames:(i + 1) * num_frames]
+             yield (mfcc_window, tempo)
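
A minimal usage sketch for the new module (file paths and labels below are hypothetical, and the repo's data/dance_info.csv must be present; features_from_path yields five 6-second windows per 30-second clip, so labels are repeated per window):

from models.decision_tree import DanceTreeClassifier, features_from_path

paths = ["data/samples/song_a.wav", "data/samples/song_b.wav"]  # hypothetical files
song_labels = ["CHA", "JIV"]                                    # one dance id per song
features = list(features_from_path(paths))                      # [(mfcc_window, bpm), ...]
window_labels = [label for label in song_labels for _ in range(5)]
model = DanceTreeClassifier(device="cpu", epochs=1)
model.fit(features, window_labels)
print(model.predict(features[:2]))  # list of predicted dance ids, one per window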
train.py CHANGED
@@ -11,6 +11,7 @@ from models.audio_spectrogram_transformer import train as train_audio_spectrogra
  from preprocessing.dataset import SongDataset, WaveformTrainingEnvironment
  from preprocessing.preprocess import get_examples
  from models.residual import ResidualDancer, TrainingEnvironment
+ from models.decision_tree import DanceTreeClassifier, features_from_path
  import yaml
  from preprocessing.dataset import DanceDataModule, WaveformSongDataset, HuggingFaceWaveformSongDataset
  from torch.utils.data import random_split
@@ -32,6 +33,8 @@ def get_training_fn(id:str) -> Callable:
              return train_ast
          case "residual_dancer":
              return train_model
+         case "decision_tree":
+             return train_decision_tree
          case _:
              raise Exception(f"Couldn't find a training function for '{id}'.")

@@ -143,9 +146,31 @@ def train_ast_lightning(config:dict):
      trainer.fit(train_env, datamodule=data)
      trainer.test(train_env, datamodule=data)

+
+ def train_decision_tree(config: dict):
+     TARGET_CLASSES = config["global"]["dance_ids"]
+     DEVICE = config["global"]["device"]
+     SEED = config["global"]["seed"]
+     song_data_path = config["data_module"]["song_data_path"]
+     song_audio_path = config["data_module"]["song_audio_path"]
+     pl.seed_everything(SEED, workers=True)
+
+     df = pd.read_csv(song_data_path)
+     x, y = get_examples(df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True)
+     # Convert y back to string classes
+     y = np.array(TARGET_CLASSES)[y.argmax(-1)]
+     train_i, test_i = random_split(np.arange(len(x)), [0.8, 0.2])
+     train_paths, train_y = x[train_i], y[train_i]
+     train_x = features_from_path(train_paths)
+     model = DanceTreeClassifier(device=DEVICE)
+     model.fit(train_x, train_y)
+     model.save()
+
  if __name__ == "__main__":
      parser = ArgumentParser(description="Trains models on the dance dataset and saves weights.")
-     parser.add_argument("--config", help="Path to the yaml file that defines the training configuration.", default="models/config/train.yaml")
+     parser.add_argument("--config",
+                         help="Path to the yaml file that defines the training configuration.",
+                         default="models/config/train.yaml")
      args = parser.parse_args()
      config = get_config(args.config)
      training_id = config["global"]["id"]
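
With this commit, decision tree training is driven by the new config file; the entry point presumably dispatches on global.id in the usual way, roughly:

# python train.py --config models/config/train_local.yaml
config = get_config("models/config/train_local.yaml")
train = get_training_fn(config["global"]["id"])  # "decision_tree" -> train_decision_tree
train(config)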