waidhoferj committed
Commit 3b31903 • 1 Parent(s): 3a0f0a5

updated packages

.gitignore CHANGED
@@ -8,3 +8,4 @@ scrapers/auth
  lightning_logs
  .lr_find_*
  .cache
+ .vscode
TODO.md CHANGED
@@ -9,3 +9,7 @@
  - Read the Medium series about audio DL
  - double check \_rectify_duration
  - ✅ Filter out songs that have only one vote
+
+ ## Notes
+
+ 2xM60 insufficient memory.
environment.yml CHANGED
@@ -22,7 +22,7 @@ dependencies:
  - rich
  - scikit-learn
  - tensorboard
+ - transformers
  - pip:
-     - git+https://github.com/huggingface/transformers.git
      - evaluate
      - wakepy
models/config/train_local.yaml CHANGED
@@ -1,5 +1,5 @@
  global:
-   id: decision_tree
+   id: ast_ptl # decision_tree
    device: mps
    seed: 42
    dance_ids:
@@ -22,7 +22,7 @@ data_module:
    song_data_path: data/songs_cleaned.csv
    song_audio_path: data/samples
    batch_size: 32
-   num_workers: 4
+   num_workers: 7
    min_votes: 1
    dataset_kwargs:
      audio_window_duration: 6
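For reference, a minimal sketch of how this config is consumed on the training side, assuming the keys of the data_module block map one-to-one onto DanceDataModule's constructor the way train_model in train.py uses them; the paths and values below come from the YAML above, everything else is illustrative:

# Sketch: load train_local.yaml and build the data module the way train.py does.
import yaml

from preprocessing.dataset import DanceDataModule

with open("models/config/train_local.yaml") as f:
    config = yaml.safe_load(f)

# "global" carries id/device/seed/dance_ids; "data_module" feeds DanceDataModule.
data = DanceDataModule(
    target_classes=config["global"]["dance_ids"],
    **config["data_module"],  # batch_size=32, num_workers=7, min_votes=1, dataset_kwargs=...
)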
models/decision_tree.py CHANGED
@@ -1,4 +1,3 @@
-
  from sklearn.base import ClassifierMixin, BaseEstimator
  import pandas as pd
  from torch import nn
@@ -8,10 +7,15 @@ import numpy as np
  import json
  from tqdm import tqdm
  import librosa
+
  DANCE_INFO_FILE = "data/dance_info.csv"
- dance_info_df = pd.read_csv(DANCE_INFO_FILE, converters={'tempoRange': lambda s : json.loads(s.replace("'", '"'))})
+ dance_info_df = pd.read_csv(
+     DANCE_INFO_FILE,
+     converters={"tempoRange": lambda s: json.loads(s.replace("'", '"'))},
+ )
+

- class DanceTreeClassifier(BaseEstimator,ClassifierMixin):
+ class DanceTreeClassifier(BaseEstimator, ClassifierMixin):
      """
      Trains a series of binary classifiers to classify each dance when a song falls into its bpm range.

@@ -21,20 +25,20 @@ class DanceTreeClassifier(BaseEstimator,ClassifierMixin):
      """

      def __init__(self, device="cpu", lr=1e-4, epochs=5, verbose=True) -> None:
-         self.device=device
-         self.epochs=epochs
+         self.device = device
+         self.epochs = epochs
          self.verbose = verbose
          self.lr = lr
          self.classifiers = {}
          self.optimizers = {}
          self.criterion = nn.BCELoss()

-     def get_valid_dances_from_bpm(self,bpm:float) -> list[str]:
-         mask = dance_info_df["tempoRange"].apply(lambda interval: interval["min"] <= bpm <= interval["max"])
+     def get_valid_dances_from_bpm(self, bpm: float) -> list[str]:
+         mask = dance_info_df["tempoRange"].apply(
+             lambda interval: interval["min"] <= bpm <= interval["max"]
+         )
          return list(dance_info_df["id"][mask])

-
-
      def fit(self, x, y):
          """
          x: (specs, bpms). The first element is the spectrogram, second element is the bpm. spec shape should be (channel, freq_bins, sr * time)
@@ -45,57 +49,73 @@ class DanceTreeClassifier(BaseEstimator,ClassifierMixin):
          # TODO: Introduce batches
          epoch_loss = 0
          pred_count = 0
+         step = 0
          for (spec, bpm), label in zip(x, y):
+             step += 1
              # find all models that are in the bpm range
              matching_dances = self.get_valid_dances_from_bpm(bpm)
+             spec = torch.from_numpy(spec).to(self.device)
              for dance in matching_dances:
                  if dance not in self.classifiers or dance not in self.optimizers:
-                     classifier = DanceCNN()
+                     classifier = DanceCNN().to(self.device)
                      self.classifiers[dance] = classifier
-                     self.optimizers[dance] = torch.optim.Adam(classifier.parameters(), lr=self.lr)
-             models = [(dance, model, self.optimizers[dance]) for dance, model in self.classifiers.items() if dance in matching_dances]
-             for dance, model,opt in models:
+                     self.optimizers[dance] = torch.optim.Adam(
+                         classifier.parameters(), lr=self.lr
+                     )
+             models = [
+                 (dance, model, self.optimizers[dance])
+                 for dance, model in self.classifiers.items()
+                 if dance in matching_dances
+             ]
+             for model_i, (dance, model, opt) in enumerate(models):
                  opt.zero_grad()
-                 spec = torch.from_numpy(spec).to(self.device)
                  output = model(spec)
-                 target = torch.tensor(float(dance == label))
+                 target = torch.tensor([float(dance == label)], device=self.device)
                  loss = self.criterion(output, target)
                  epoch_loss += loss.item()
-                 pred_count +=1
+                 pred_count += 1
                  loss.backward()
                  opt.step()
-                 progress_bar.set_description(f"Loss: {epoch_loss / pred_count}")
+                 progress_bar.set_description(
+                     f"Loss: {epoch_loss / pred_count}, Step: {step}, Model: {model_i+1}/{len(models)}"
+                 )

      def predict(self, x) -> list[str]:
          results = []
          for spec, bpm in zip(*x):
              matching_dances = self.get_valid_dances_from_bpm(bpm)
-             dance_i = torch.tensor([self.classifiers[dance](spec) for dance in matching_dances]).argmax()
+             dance_i = torch.tensor(
+                 [self.classifiers[dance](spec) for dance in matching_dances]
+             ).argmax()
              results.append(matching_dances[dance_i])
          return results

-
-

  class DanceCNN(nn.Module):
      def __init__(self, sr=16000, freq_bins=20, duration=6, *args, **kwargs) -> None:
          super().__init__(*args, **kwargs)
-         kernel_size=(3,9)
+         kernel_size = (3, 9)
          self.cnn = nn.Sequential(
-             nn.Conv2d(1,16, kernel_size=kernel_size),
+             nn.Conv2d(1, 16, kernel_size=kernel_size),
              nn.ReLU(),
-             nn.MaxPool2d((2,10)),
-             nn.Conv2d(16,32, kernel_size=kernel_size),
+             nn.MaxPool2d((2, 10)),
+             nn.Conv2d(16, 32, kernel_size=kernel_size),
              nn.ReLU(),
-             nn.MaxPool2d((2,10))
+             nn.MaxPool2d((2, 10)),
+             nn.Conv2d(32, 32, kernel_size=kernel_size),
+             nn.ReLU(),
+             nn.MaxPool2d((2, 10)),
+             nn.Conv2d(32, 16, kernel_size=kernel_size),
+             nn.ReLU(),
+             nn.MaxPool2d((2, 10)),
          )

-         embedding_dimension = 32* 3 * 959
+         embedding_dimension = 16 * 6 * 8
          self.classifier = nn.Sequential(
              nn.Linear(embedding_dimension, 200),
              nn.ReLU(),
              nn.Linear(200, 1),
-             nn.Sigmoid()
+             nn.Sigmoid(),
          )

      def forward(self, x):
@@ -103,22 +123,25 @@ class DanceCNN(nn.Module):
          x = x.flatten() if len(x.shape) == 3 else x.flatten(1)
          return self.classifier(x)

- def features_from_path(paths:list[str],
-                        audio_window_duration=6,
-                        audio_duration=30,
-                        resample_freq=16000) -> Iterator[tuple[np.array, float]]:
+
+ def features_from_path(
+     paths: list[str], audio_window_duration=6, audio_duration=30, resample_freq=16000
+ ) -> Iterator[tuple[np.array, float]]:
      """
      Loads audio and bpm from an audio path.
      """
-
+
      for path in paths:
          waveform, sr = librosa.load(path, mono=True, sr=resample_freq)
-         num_frames = audio_window_duration * sr
+         num_frames = audio_window_duration * sr
          tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
+         spec = librosa.feature.melspectrogram(y=waveform, sr=sr)
          mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=20)
-         mfccs_normalized = (mfccs - mfccs.mean()) / mfccs.std()
-         mfccs_padded = librosa.util.fix_length(mfccs_normalized, size=sr*audio_duration, axis=1)
-         mfccs_reshaped = mfccs_padded.reshape(1, mfccs_padded.shape[0], mfccs_padded.shape[1])
-         for i in range(audio_duration//audio_window_duration):
-             mfcc_window = mfccs_reshaped[:,:,i*num_frames:(i+1)*num_frames]
-             yield (mfcc_window, tempo)
+         spec_normalized = (spec - spec.mean()) / spec.std()
+         spec_padded = librosa.util.fix_length(
+             spec_normalized, size=sr * audio_duration, axis=1
+         )
+         batched_spec = np.expand_dims(spec_padded, axis=0)
+         for i in range(audio_duration // audio_window_duration):
+             spec_window = batched_spec[:, :, i * num_frames : (i + 1) * num_frames]
+             yield (spec_window, tempo)
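A quick way to sanity-check the hard-coded embedding_dimension = 16 * 6 * 8 is to push a dummy window through the same convolutional stack and read off the flattened size. The input shape below (1 channel, 128 mel bins, 6 s x 16 kHz = 96,000 columns) is inferred from features_from_path and librosa's default n_mels; treat the exact shape as an assumption rather than a value pinned down by the repo:

# Sketch: verify the flattened CNN output size that feeds the classifier head.
# Note: the dummy forward pass briefly allocates a few hundred MB.
import torch
from torch import nn

kernel_size = (3, 9)
cnn = nn.Sequential(  # same layer stack as DanceCNN in the diff above
    nn.Conv2d(1, 16, kernel_size=kernel_size),
    nn.ReLU(),
    nn.MaxPool2d((2, 10)),
    nn.Conv2d(16, 32, kernel_size=kernel_size),
    nn.ReLU(),
    nn.MaxPool2d((2, 10)),
    nn.Conv2d(32, 32, kernel_size=kernel_size),
    nn.ReLU(),
    nn.MaxPool2d((2, 10)),
    nn.Conv2d(32, 16, kernel_size=kernel_size),
    nn.ReLU(),
    nn.MaxPool2d((2, 10)),
)

with torch.no_grad():
    # Assumed window shape: (batch, channel, n_mels, frame columns).
    dummy = torch.zeros(1, 1, 128, 96000)
    flattened = cnn(dummy).flatten(1)

print(flattened.shape[1])  # 768 == 16 * 6 * 8 under these assumptions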
preprocessing/dataset.py CHANGED
@@ -12,19 +12,23 @@ from torch import nn
  from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


-
  class SongDataset(Dataset):
-     def __init__(self,
-                  audio_paths: list[str],
-                  dance_labels: list[np.ndarray],
-                  audio_duration=30, # seconds
-                  audio_window_duration=6, # seconds
-                  audio_window_jitter=0.0, # seconds
-                  audio_pipeline_kwargs={},
-                  resample_frequency=16000
+     def __init__(
+         self,
+         audio_paths: list[str],
+         dance_labels: list[np.ndarray],
+         audio_duration=30,  # seconds
+         audio_window_duration=6,  # seconds
+         audio_window_jitter=0.0,  # seconds
+         audio_pipeline_kwargs={},
+         resample_frequency=16000,
      ):
-         assert audio_duration % audio_window_duration == 0, "Audio window should divide duration evenly."
-         assert audio_window_duration > audio_window_jitter, "Jitter should be a small fraction of the audio window duration."
+         assert (
+             audio_duration % audio_window_duration == 0
+         ), "Audio window should divide duration evenly."
+         assert (
+             audio_window_duration > audio_window_jitter
+         ), "Jitter should be a small fraction of the audio window duration."

          self.audio_paths = audio_paths
          self.dance_labels = dance_labels
@@ -34,14 +38,21 @@ class SongDataset(Dataset):
          self.audio_window_jitter = audio_window_jitter
          self.audio_duration = int(audio_duration)

-         self.audio_pipeline = AudioTrainingPipeline(self.sample_rate, resample_frequency, audio_window_duration, **audio_pipeline_kwargs)
+         self.audio_pipeline = AudioTrainingPipeline(
+             self.sample_rate,
+             resample_frequency,
+             audio_window_duration,
+             **audio_pipeline_kwargs,
+         )

      def __len__(self):
-         return len(self.audio_paths) * self.audio_duration // self.audio_window_duration
+         return len(self.audio_paths) * self.audio_duration // self.audio_window_duration

-     def __getitem__(self, idx:int) -> tuple[torch.Tensor, torch.Tensor]:
+     def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
          waveform = self._waveform_from_index(idx)
-         assert waveform.shape[1] > 10, f"No data found: {self._backtrace_audio_path(idx)}"
+         assert (
+             waveform.shape[1] > 10
+         ), f"No data found: {self._backtrace_audio_path(idx)}"
          spectrogram = self.audio_pipeline(waveform)

          dance_labels = self._label_from_index(idx)
@@ -53,206 +64,256 @@ class SongDataset(Dataset):
          # Try the previous one
          # This happens when some of the audio recordings are really quiet
          # This WILL NOT leak into other data partitions because songs belong entirely to a partition
-         return self[idx-1]
+         return self[idx - 1]

-     def _convert_idx(self,idx:int) -> int:
+     def _convert_idx(self, idx: int) -> int:
          return idx * self.audio_window_duration // self.audio_duration

-     def _backtrace_audio_path(self, index:int) -> str:
+     def _backtrace_audio_path(self, index: int) -> str:
          return self.audio_paths[self._convert_idx(index)]

-     def _validate_output(self,x,y):
-         is_finite = not torch.any(torch.isinf(x))
+     def _validate_output(self, x, y):
+         is_finite = not torch.any(torch.isinf(x))
          is_numerical = not torch.any(torch.isnan(x))
          has_data = torch.any(x != 0.0)
          is_binary = len(torch.unique(y)) < 3
-         return all((is_finite,is_numerical, has_data, is_binary))
+         return all((is_finite, is_numerical, has_data, is_binary))

-     def _waveform_from_index(self, idx:int) -> torch.Tensor:
+     def _waveform_from_index(self, idx: int) -> torch.Tensor:
          audio_filepath = self.audio_paths[self._convert_idx(idx)]
          num_windows = self.audio_duration // self.audio_window_duration
          frame_index = idx % num_windows
          jitter_start = -self.audio_window_jitter if frame_index > 0 else 0.0
          jitter_end = self.audio_window_jitter if frame_index != num_windows - 1 else 0.0
-         jitter = int(torch.FloatTensor(1).uniform_(jitter_start, jitter_end) * self.sample_rate)
-         frame_offset = frame_index * self.audio_window_duration * self.sample_rate + jitter
+         jitter = int(
+             torch.FloatTensor(1).uniform_(jitter_start, jitter_end) * self.sample_rate
+         )
+         frame_offset = (
+             frame_index * self.audio_window_duration * self.sample_rate + jitter
+         )
          num_frames = self.sample_rate * self.audio_window_duration
-         waveform, sample_rate = ta.load(audio_filepath, frame_offset=frame_offset, num_frames=num_frames)
-         assert sample_rate == self.sample_rate, f"Expected sample rate of {self.sample_rate}. Found {sample_rate}"
+         waveform, sample_rate = ta.load(
+             audio_filepath, frame_offset=frame_offset, num_frames=num_frames
+         )
+         assert (
+             sample_rate == self.sample_rate
+         ), f"Expected sample rate of {self.sample_rate}. Found {sample_rate}"
          return waveform

-
-     def _label_from_index(self, idx:int) -> torch.Tensor:
+     def _label_from_index(self, idx: int) -> torch.Tensor:
          return torch.from_numpy(self.dance_labels[self._convert_idx(idx)])
-
+

  class WaveformSongDataset(SongDataset):
      """
      Outputs raw waveforms of the data instead of a spectrogram.
      """

-     def __init__(self, *args,resample_frequency=16000, **kwargs):
+     def __init__(self, *args, resample_frequency=16000, **kwargs):
          super().__init__(*args, **kwargs)
          self.resample_frequency = resample_frequency
          self.resampler = taT.Resample(self.sample_rate, self.resample_frequency)
          self.pipeline = []

-     def __getitem__(self, idx:int) -> dict[str, torch.Tensor]:
+     def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
          waveform = self._waveform_from_index(idx)
-         assert waveform.shape[1] > 10, f"No data found: {self._backtrace_audio_path(idx)}"
+         assert (
+             waveform.shape[1] > 10
+         ), f"No data found: {self._backtrace_audio_path(idx)}"
          # resample the waveform
          waveform = self.resampler(waveform)
-
+
          waveform = waveform.mean(0)

          dance_labels = self._label_from_index(idx)
          return waveform, dance_labels
-
-


  class HuggingFaceWaveformSongDataset(WaveformSongDataset):
-
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
          self.pipeline = []

-
-     def __getitem__(self, idx:int) -> dict[str, torch.Tensor]:
-         x,y = super().__getitem__(idx)
+     def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+         x, y = super().__getitem__(idx)
          if len(self.pipeline) > 0:
              for fn in self.pipeline:
                  x = fn(x)

          dance_labels = y.argmax()
-         return {"input_values": x["input_values"][0] if hasattr(x, "input_values") else x, "label": dance_labels}
+         return {
+             "input_values": x["input_values"][0] if hasattr(x, "input_values") else x,
+             "label": dance_labels,
+         }

-     def map(self,fn):
+     def map(self, fn):
          """
          NOTE this mutates the original, doesn't return a copy like normal maps.
          """
          self.pipeline.append(fn)

+
  class DanceDataModule(pl.LightningDataModule):
-     def __init__(self,
-                  song_data_path="data/songs_cleaned.csv",
-                  song_audio_path="data/samples",
-                  test_proportion=0.15,
-                  val_proportion=0.1,
-                  target_classes:list[str]=None,
-                  min_votes=1,
-                  batch_size:int=64,
-                  num_workers=10,
-                  dataset_cls = None,
-                  dataset_kwargs={}
+     def __init__(
+         self,
+         song_data_path="data/songs_cleaned.csv",
+         song_audio_path="data/samples",
+         test_proportion=0.15,
+         val_proportion=0.1,
+         target_classes: list[str] = None,
+         min_votes=1,
+         batch_size: int = 64,
+         num_workers=10,
+         dataset_cls=None,
+         dataset_kwargs={},
      ):
          super().__init__()
          self.song_data_path = song_data_path
          self.song_audio_path = song_audio_path
-         self.val_proportion=val_proportion
-         self.test_proportion=test_proportion
-         self.train_proportion= 1.-test_proportion-val_proportion
-         self.target_classes=target_classes
+         self.val_proportion = val_proportion
+         self.test_proportion = test_proportion
+         self.train_proportion = 1.0 - test_proportion - val_proportion
+         self.target_classes = target_classes
          self.batch_size = batch_size
          self.num_workers = num_workers
          self.dataset_kwargs = dataset_kwargs
          self.dataset_cls = dataset_cls if dataset_cls is not None else SongDataset

          df = pd.read_csv(song_data_path)
-         self.x,self.y = get_examples(df, self.song_audio_path,class_list=self.target_classes, multi_label=True, min_votes=min_votes)
+         self.x, self.y = get_examples(
+             df,
+             self.song_audio_path,
+             class_list=self.target_classes,
+             multi_label=True,
+             min_votes=min_votes,
+         )

      def setup(self, stage: str):
-         train_i, val_i, test_i = random_split(np.arange(len(self.x)), [self.train_proportion, self.val_proportion, self.test_proportion])
+         train_i, val_i, test_i = random_split(
+             np.arange(len(self.x)),
+             [self.train_proportion, self.val_proportion, self.test_proportion],
+         )
          self.train_ds = self._dataset_from_indices(train_i)
          self.val_ds = self._dataset_from_indices(val_i)
          self.test_ds = self._dataset_from_indices(test_i)
-
-     def _dataset_from_indices(self, idx:list[int]) -> SongDataset:
+
+     def _dataset_from_indices(self, idx: list[int]) -> SongDataset:
          return self.dataset_cls(self.x[idx], self.y[idx], **self.dataset_kwargs)
-
+
      def train_dataloader(self):
-         return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)
+         return DataLoader(
+             self.train_ds,
+             batch_size=self.batch_size,
+             num_workers=self.num_workers,
+             shuffle=True,
+         )

      def val_dataloader(self):
-         return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers)
+         return DataLoader(
+             self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers
+         )

      def test_dataloader(self):
-         return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers)
+         return DataLoader(
+             self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers
+         )

      def get_label_weights(self):
          n_examples, n_classes = self.y.shape
          return torch.from_numpy(n_examples / (n_classes * sum(self.y)))
-

- class WaveformTrainingEnvironment(pl.LightningModule):

-     def __init__(self, model: nn.Module, criterion: nn.Module, feature_extractor, config:dict, learning_rate=1e-4, *args, **kwargs):
+ class WaveformTrainingEnvironment(pl.LightningModule):
+     def __init__(
+         self,
+         model: nn.Module,
+         criterion: nn.Module,
+         feature_extractor,
+         config: dict,
+         learning_rate=1e-4,
+         *args,
+         **kwargs,
+     ):
          super().__init__(*args, **kwargs)
          self.model = model
          self.criterion = criterion
          self.learning_rate = learning_rate
-         self.config=config
-         self.feature_extractor=feature_extractor
-         self.save_hyperparameters({
-             "model": type(model).__name__,
-             "loss": type(criterion).__name__,
-             "config": config,
-             **kwargs
-         })
+         self.config = config
+         self.feature_extractor = feature_extractor
+         self.save_hyperparameters(
+             {
+                 "model": type(model).__name__,
+                 "loss": type(criterion).__name__,
+                 "config": config,
+                 **kwargs,
+             }
+         )

      def preprocess_inputs(self, x):
          device = x.device
-         x = x.squeeze(1).cpu().numpy()
-         x = self.feature_extractor(list(x),return_tensors='pt', sampling_rate=16000)
+         x = list(x.squeeze(1).cpu().numpy())
+         x = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000)
          return x["input_values"].to(device)

-     def training_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int) -> torch.Tensor:
+     def training_step(
+         self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
+     ) -> torch.Tensor:
          features, labels = batch
          features = self.preprocess_inputs(features)
          outputs = self.model(features).logits
-         outputs = nn.Sigmoid()(outputs) # good for multi label classification, should be softmax otherwise
+         outputs = nn.Sigmoid()(
+             outputs
+         )  # good for multi label classification, should be softmax otherwise
          loss = self.criterion(outputs, labels)
          metrics = calculate_metrics(outputs, labels, prefix="train/", multi_label=True)
          self.log_dict(metrics, prog_bar=True)
          return loss

-
-     def validation_step(self, batch:tuple[torch.Tensor, torch.TensorType], batch_index:int):
-         x,y = batch
+     def validation_step(
+         self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
+     ):
+         x, y = batch
          x = self.preprocess_inputs(x)
          preds = self.model(x).logits
-         preds = nn.Sigmoid()(preds)
+         preds = nn.Sigmoid()(preds)
          metrics = calculate_metrics(preds, y, prefix="val/", multi_label=True)
          metrics["val/loss"] = self.criterion(preds, y)
-         self.log_dict(metrics,prog_bar=True)
+         self.log_dict(metrics, prog_bar=True)

-     def test_step(self, batch:tuple[torch.Tensor, torch.TensorType], batch_index:int):
+     def test_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
          x, y = batch
          x = self.preprocess_inputs(x)
          preds = self.model(x).logits
-         preds = nn.Sigmoid()(preds)
-         self.log_dict(calculate_metrics(preds, y, prefix="test/", multi_label=True), prog_bar=True)
-
+         preds = nn.Sigmoid()(preds)
+         self.log_dict(
+             calculate_metrics(preds, y, prefix="test/", multi_label=True), prog_bar=True
+         )
+
      def configure_optimizers(self):
          optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
          # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min') {"scheduler": scheduler, "monitor": "val/loss"}
-         return [optimizer]
-
+         return [optimizer]


- def calculate_metrics(pred, target, threshold=0.5, prefix="", multi_label=True) -> dict[str, torch.Tensor]:
+ def calculate_metrics(
+     pred, target, threshold=0.5, prefix="", multi_label=True
+ ) -> dict[str, torch.Tensor]:
      target = target.detach().cpu().numpy()
      pred = pred.detach().cpu().numpy()
      params = {
-         "y_true": target if multi_label else target.argmax(1) ,
-         "y_pred": np.array(pred > threshold, dtype=float) if multi_label else pred.argmax(1),
-         "zero_division": 0,
-         "average":"macro"
-     }
-     metrics= {
-         'precision': precision_score(**params),
-         'recall': recall_score(**params),
-         'f1': f1_score(**params),
-         'accuracy': accuracy_score(y_true=params["y_true"], y_pred=params["y_pred"]),
-     }
-     return {prefix + k: torch.tensor(v,dtype=torch.float32) for k,v in metrics.items()}
+         "y_true": target if multi_label else target.argmax(1),
+         "y_pred": np.array(pred > threshold, dtype=float)
+         if multi_label
+         else pred.argmax(1),
+         "zero_division": 0,
+         "average": "macro",
+     }
+     metrics = {
+         "precision": precision_score(**params),
+         "recall": recall_score(**params),
+         "f1": f1_score(**params),
+         "accuracy": accuracy_score(y_true=params["y_true"], y_pred=params["y_pred"]),
+     }
+     return {
+         prefix + k: torch.tensor(v, dtype=torch.float32) for k, v in metrics.items()
+     }
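To make the indexing in SongDataset easier to follow: each song contributes audio_duration // audio_window_duration windows, and _convert_idx plus the modulo in _waveform_from_index recover which song and which slice a flat index refers to. A small standalone sketch of that bookkeeping (jitter omitted; the sample rate below is a placeholder, the real one is read from the audio files):

# Sketch of SongDataset's flat-index bookkeeping, ignoring jitter.
AUDIO_DURATION = 30   # seconds per clip (dataset default)
WINDOW_DURATION = 6   # seconds per training window (dataset default)
SAMPLE_RATE = 44100   # placeholder; SongDataset reads it from the files


def locate_window(idx: int) -> tuple[int, int]:
    """Map a flat dataset index to (song_index, frame_offset)."""
    windows_per_song = AUDIO_DURATION // WINDOW_DURATION   # 5 windows per clip
    song_index = idx * WINDOW_DURATION // AUDIO_DURATION   # same math as _convert_idx
    frame_index = idx % windows_per_song                   # which slice of that song
    frame_offset = frame_index * WINDOW_DURATION * SAMPLE_RATE
    return song_index, frame_offset


# Index 7 falls in the second song (index 1), third window, 12 s into the clip.
print(locate_window(7))  # (1, 529200) with the assumed 44.1 kHz sample rate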
train.py CHANGED
@@ -7,25 +7,32 @@ from sklearn.model_selection import KFold
  import pytorch_lightning as pl
  from pytorch_lightning import callbacks as cb
  from models.utils import LabelWeightedBCELoss
- from models.audio_spectrogram_transformer import train as train_audio_spectrogram_transformer, get_id_label_mapping
+ from models.audio_spectrogram_transformer import (
+     train as train_audio_spectrogram_transformer,
+     get_id_label_mapping,
+ )
  from preprocessing.dataset import SongDataset, WaveformTrainingEnvironment
  from preprocessing.preprocess import get_examples
  from models.residual import ResidualDancer, TrainingEnvironment
  from models.decision_tree import DanceTreeClassifier, features_from_path
  import yaml
- from preprocessing.dataset import DanceDataModule, WaveformSongDataset, HuggingFaceWaveformSongDataset
+ from preprocessing.dataset import (
+     DanceDataModule,
+     WaveformSongDataset,
+     HuggingFaceWaveformSongDataset,
+ )
  from torch.utils.data import random_split
  import numpy as np
  from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
  from argparse import ArgumentParser


-
  import torch
  from torch import nn
  from sklearn.utils.class_weight import compute_class_weight

- def get_training_fn(id:str) -> Callable:
+
+ def get_training_fn(id: str) -> Callable:
      match id:
          case "ast_ptl":
              return train_ast_lightning
@@ -38,7 +45,8 @@ def get_training_fn(id:str) -> Callable:
          case _:
              raise Exception(f"Couldn't find a training function for '{id}'.")

- def get_config(filepath:str) -> dict:
+
+ def get_config(filepath: str) -> dict:
      with open(filepath, "r") as f:
          config = yaml.safe_load(f)
      return config
@@ -48,14 +56,14 @@ def cross_validation(config, k=5):
      df = pd.read_csv("data/songs.csv")
      g_config = config["global"]
      batch_size = config["data_module"]["batch_size"]
-     x,y = get_examples(df, "data/samples",class_list=g_config["dance_ids"])
-     dataset = SongDataset(x,y)
-     splits=KFold(n_splits=k,shuffle=True,random_state=g_config["seed"])
+     x, y = get_examples(df, "data/samples", class_list=g_config["dance_ids"])
+     dataset = SongDataset(x, y)
+     splits = KFold(n_splits=k, shuffle=True, random_state=g_config["seed"])
      trainer = pl.Trainer(accelerator=g_config["device"])
-     for fold, (train_idx,val_idx) in enumerate(splits.split(x,y)):
+     for fold, (train_idx, val_idx) in enumerate(splits.split(x, y)):
          print(f"Fold {fold+1}")
          model = ResidualDancer(n_classes=len(g_config["dance_ids"]))
-         train_env = TrainingEnvironment(model,nn.BCELoss())
+         train_env = TrainingEnvironment(model, nn.BCELoss())
          train_sampler = SubsetRandomSampler(train_idx)
          test_sampler = SubsetRandomSampler(val_idx)
          train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
@@ -64,15 +72,17 @@
          trainer.test(train_env, test_loader)


- def train_model(config:dict):
+ def train_model(config: dict):
      TARGET_CLASSES = config["global"]["dance_ids"]
      DEVICE = config["global"]["device"]
      SEED = config["global"]["seed"]
      pl.seed_everything(SEED, workers=True)
-     data = DanceDataModule(target_classes=TARGET_CLASSES, **config['data_module'])
-     model = ResidualDancer(n_classes=len(TARGET_CLASSES), **config['model'])
+     data = DanceDataModule(target_classes=TARGET_CLASSES, **config["data_module"])
+     model = ResidualDancer(n_classes=len(TARGET_CLASSES), **config["model"])
      label_weights = data.get_label_weights().to(DEVICE)
-     criterion = LabelWeightedBCELoss(label_weights) #nn.CrossEntropyLoss(label_weights)
+     criterion = LabelWeightedBCELoss(
+         label_weights
+     )  # nn.CrossEntropyLoss(label_weights)
      train_env = TrainingEnvironment(model, criterion, config)
      callbacks = [
          # cb.LearningRateFinder(update_attr=True),
@@ -81,36 +91,41 @@ def train_model(config:dict):
          cb.RichProgressBar(),
          cb.DeviceStatsMonitor(),
      ]
-     trainer = pl.Trainer(
-         callbacks=callbacks,
-         **config["trainer"]
-     )
+     trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
      trainer.fit(train_env, datamodule=data)
      trainer.test(train_env, datamodule=data)


- def train_ast(
-     config:dict
- ):
+ def train_ast(config: dict):
      TARGET_CLASSES = config["global"]["dance_ids"]
      DEVICE = config["global"]["device"]
      SEED = config["global"]["seed"]
      dataset_kwargs = config["data_module"]["dataset_kwargs"]
      test_proportion = config["data_module"].get("test_proportion", 0.2)
-     train_proportion = 1. - test_proportion
-     song_data_path="data/songs_cleaned.csv"
+     train_proportion = 1.0 - test_proportion
+     song_data_path = "data/songs_cleaned.csv"
      song_audio_path = "data/samples"
      pl.seed_everything(SEED, workers=True)

      df = pd.read_csv(song_data_path)
-     x, y = get_examples(df, song_audio_path,class_list=TARGET_CLASSES, multi_label=True)
-     train_i, test_i = random_split(np.arange(len(x)), [train_proportion, test_proportion])
-     train_ds = HuggingFaceWaveformSongDataset(x[train_i], y[train_i], **dataset_kwargs, resample_frequency=16000)
-     test_ds = HuggingFaceWaveformSongDataset(x[test_i], y[test_i], **dataset_kwargs, resample_frequency=16000)
-     train_audio_spectrogram_transformer(TARGET_CLASSES, train_ds, test_ds, device=DEVICE)
-
-
- def train_ast_lightning(config:dict):
+     x, y = get_examples(
+         df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True
+     )
+     train_i, test_i = random_split(
+         np.arange(len(x)), [train_proportion, test_proportion]
+     )
+     train_ds = HuggingFaceWaveformSongDataset(
+         x[train_i], y[train_i], **dataset_kwargs, resample_frequency=16000
+     )
+     test_ds = HuggingFaceWaveformSongDataset(
+         x[test_i], y[test_i], **dataset_kwargs, resample_frequency=16000
+     )
+     train_audio_spectrogram_transformer(
+         TARGET_CLASSES, train_ds, test_ds, device=DEVICE
+     )
+
+
+ def train_ast_lightning(config: dict):
      """
      work on integration between waveform dataset and environment. Should work for both HF and PTL.
      """
@@ -118,45 +133,50 @@ def train_ast_lightning(config:dict):
      DEVICE = config["global"]["device"]
      SEED = config["global"]["seed"]
      pl.seed_everything(SEED, workers=True)
-     data = DanceDataModule(target_classes=TARGET_CLASSES, dataset_cls=WaveformSongDataset, **config['data_module'])
+     data = DanceDataModule(
+         target_classes=TARGET_CLASSES,
+         dataset_cls=WaveformSongDataset,
+         **config["data_module"],
+     )
      id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
      model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
      feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

      model = AutoModelForAudioClassification.from_pretrained(
-         model_checkpoint,
-         num_labels=len(label2id),
-         label2id=label2id,
-         id2label=id2label,
-         ignore_mismatched_sizes=True
-     ).to(DEVICE)
+         model_checkpoint,
+         num_labels=len(label2id),
+         label2id=label2id,
+         id2label=id2label,
+         ignore_mismatched_sizes=True,
+     ).to(DEVICE)
      label_weights = data.get_label_weights().to(DEVICE)
-     criterion = LabelWeightedBCELoss(label_weights) #nn.CrossEntropyLoss(label_weights)
-     train_env = WaveformTrainingEnvironment(model, criterion,feature_extractor, config)
+     criterion = LabelWeightedBCELoss(
+         label_weights
+     )  # nn.CrossEntropyLoss(label_weights)
+     train_env = WaveformTrainingEnvironment(model, criterion, feature_extractor, config)
      callbacks = [
          # cb.LearningRateFinder(update_attr=True),
          cb.EarlyStopping("val/loss", patience=5),
          cb.StochasticWeightAveraging(1e-2),
-         cb.RichProgressBar()
+         cb.RichProgressBar(),
      ]
-     trainer = pl.Trainer(
-         callbacks=callbacks,
-         **config["trainer"]
-     )
+     trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
      trainer.fit(train_env, datamodule=data)
      trainer.test(train_env, datamodule=data)


- def train_decision_tree(config:dict):
+ def train_decision_tree(config: dict):
      TARGET_CLASSES = config["global"]["dance_ids"]
      DEVICE = config["global"]["device"]
      SEED = config["global"]["seed"]
-     song_data_path=config['data_module']["song_data_path"]
-     song_audio_path = config['data_module']["song_audio_path"]
+     song_data_path = config["data_module"]["song_data_path"]
+     song_audio_path = config["data_module"]["song_audio_path"]
      pl.seed_everything(SEED, workers=True)

      df = pd.read_csv(song_data_path)
-     x, y = get_examples(df, song_audio_path,class_list=TARGET_CLASSES, multi_label=True)
+     x, y = get_examples(
+         df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True
+     )
      # Convert y back to string classes
      y = np.array(TARGET_CLASSES)[y.argmax(-1)]
      train_i, test_i = random_split(np.arange(len(x)), [0.8, 0.2])
@@ -166,13 +186,18 @@ def train_decision_tree(config:dict):
      model.fit(train_x, train_y)
      model.save()

+
  if __name__ == "__main__":
-     parser = ArgumentParser(description="Trains models on the dance dataset and saves weights.")
-     parser.add_argument("--config",
-         help="Path to the yaml file that defines the training configuration.",
-         default="models/config/train.yaml")
+     parser = ArgumentParser(
+         description="Trains models on the dance dataset and saves weights."
+     )
+     parser.add_argument(
+         "--config",
+         help="Path to the yaml file that defines the training configuration.",
+         default="models/config/train_local.yaml",
+     )
      args = parser.parse_args()
      config = get_config(args.config)
      training_id = config["global"]["id"]
      train = get_training_fn(training_id)
-     train(config)
+     train(config)
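Putting the pieces together: the id in the config's global block (now ast_ptl in train_local.yaml) selects the training routine through get_training_fn. A minimal sketch of driving this programmatically instead of through the CLI, assuming train.py is importable from the repository root:

# Sketch: run training from Python rather than `python train.py --config ...`.
from train import get_config, get_training_fn

config = get_config("models/config/train_local.yaml")  # yaml.safe_load under the hood
train_fn = get_training_fn(config["global"]["id"])      # "ast_ptl" -> train_ast_lightning
train_fn(config)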