waidhoferj committed
Commit 42c4703 • 1 Parent(s): 248f682
updated models

Files changed:
- app.py +106 -14
- models/audio_spectrogram_transformer.py +6 -2
- models/config/train_local.yaml +7 -6
- models/residual.py +0 -64
- models/training_environment.py +5 -3
- models/wav2vec2.py +1 -2
app.py
CHANGED
@@ -1,23 +1,113 @@
 from pathlib import Path
 import gradio as gr
 import numpy as np
-from models.residual import DancePredictor
 import os
 from functools import cache
 from pathlib import Path
-
+from models.audio_spectrogram_transformer import AST, ASTExtractorWrapper
+from models.training_environment import TrainingEnvironment
+import torch
+from torch import nn
+import yaml
+import torchaudio
+
+CONFIG_FILE = Path("models/config/train_local.yaml")
+MODEL_CLS = AST
+EXTRACTOR = ASTExtractorWrapper
+
+
+class DancePredictor:
+    def __init__(
+        self,
+        weight_path: str,
+        labels: list[str],
+        expected_duration=6,
+        threshold=0.5,
+        resample_frequency=16000,
+        device="cpu",
+    ):
+        super().__init__()
+
+        self.expected_duration = expected_duration
+        self.threshold = threshold
+        self.resample_frequency = resample_frequency
+
+        self.labels = np.array(labels)
+        self.device = device
+        self.model = self.get_model(weight_path)
+        self.extractor = ASTExtractorWrapper()
+
+    def get_model(self, weight_path: str) -> nn.Module:
+        weights = torch.load(weight_path, map_location=self.device)["state_dict"]
+        model = AST(self.labels).to(self.device)
+        for key in list(weights):
+            weights[
+                key.replace(
+                    "model.",
+                    "",
+                )
+            ] = weights.pop(key)
+        model.load_state_dict(weights, strict=False)
+        return model.to(self.device).eval()
+
+    @classmethod
+    def from_config(cls, config_path: str) -> "DancePredictor":
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        weight_path = config["checkpoint"]
+        labels = sorted(config["dance_ids"])
+        expected_duration = 6
+        threshold = 0.5
+        resample_frequency = 16000
+        device = "mps"
+        return DancePredictor(
+            weight_path,
+            labels,
+            expected_duration,
+            threshold,
+            resample_frequency,
+            device,
+        )
+
+    @torch.no_grad()
+    def __call__(self, waveform: np.ndarray, sample_rate: int) -> dict[str, float]:
+        if waveform.ndim == 1:
+            waveform = np.stack([waveform, waveform]).T
+        waveform = torch.from_numpy(waveform.T)
+        waveform = torchaudio.functional.apply_codec(
+            waveform, sample_rate, "wav", channels_first=True
+        )
+
+        waveform = torchaudio.functional.resample(
+            waveform, sample_rate, self.resample_frequency
+        )
+        waveform = waveform[
+            :, : self.resample_frequency * self.expected_duration
+        ]  # TODO PAD
+        features = self.extractor(waveform)
+        features = features.unsqueeze(0).to(self.device)
+        results = self.model(features)
+        results = nn.functional.softmax(results.squeeze(0), dim=0)
+        results = results.detach().cpu().numpy()
+
+        result_mask = results > self.threshold
+        probs = results[result_mask]
+        dances = self.labels[result_mask]
+
+        return {dance: float(prob) for dance, prob in zip(dances, probs)}
 
 
 @cache
-def get_model(config_path:str) -> DancePredictor:
+def get_model(config_path: str) -> DancePredictor:
     model = DancePredictor.from_config(config_path)
     return model
 
+
 def predict(audio: tuple[int, np.ndarray]) -> list[str]:
     sample_rate, waveform = audio
-
+
     model = get_model(CONFIG_FILE)
-    results = model(waveform,sample_rate)
+    results = model(waveform, sample_rate)
     return results if len(results) else "Dance Not Found"
 
 
@@ -25,34 +115,36 @@ def demo():
     title = "Dance Classifier"
     description = "What should I dance to this song? Pass some audio to the Dance Classifier find out!"
     song_samples = Path(os.path.dirname(__file__), "assets", "song-samples")
-    example_audio = [
+    example_audio = [
+        str(song) for song in song_samples.iterdir() if song.name[0] != "."
+    ]
     all_dances = get_model(CONFIG_FILE).labels
-
+
     recording_interface = gr.Interface(
         fn=predict,
         description="Record at least **6 seconds** of the song.",
         inputs=gr.Audio(source="microphone", label="Song Recording"),
         outputs=gr.Label(label="Dances"),
-        examples=example_audio
+        examples=example_audio,
     )
     uploading_interface = gr.Interface(
         fn=predict,
         inputs=gr.Audio(label="Song Audio File"),
         outputs=gr.Label(label="Dances"),
-        examples=example_audio
+        examples=example_audio,
     )
-
+
     with gr.Blocks() as app:
         gr.Markdown(f"# {title}")
         gr.Markdown(description)
-        gr.TabbedInterface(
+        gr.TabbedInterface(
+            [uploading_interface, recording_interface], ["Upload Song", "Record Song"]
+        )
         with gr.Accordion("See all dances", open=False):
             gr.Markdown("\n".join(f"- {dance}" for dance in all_dances))
 
-
-
     return app
 
 
 if __name__ == "__main__":
-    demo().launch()
+    demo().launch()
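Note: the `DancePredictor` previously imported from `models.residual` is now defined inline in `app.py` and wraps the AST model. A minimal sketch of exercising it outside the Gradio UI, assuming the checkpoint path in `models/config/train_local.yaml` exists locally; the sample file name below is a hypothetical placeholder, not part of the commit.

import torchaudio

from app import DancePredictor

# Builds the AST model from the checkpoint referenced in the YAML config.
model = DancePredictor.from_config("models/config/train_local.yaml")

# Hypothetical sample clip; any clip of roughly 6 seconds or more works.
waveform, sample_rate = torchaudio.load("assets/song-samples/example.wav")

# __call__ expects a (samples, channels) numpy array, so transpose torchaudio's
# (channels, samples) layout before passing it in.
scores = model(waveform.numpy().T, sample_rate)

# Only dances whose softmax probability clears the 0.5 threshold are returned.
print(scores or "Dance Not Found")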
models/audio_spectrogram_transformer.py
CHANGED
@@ -88,13 +88,17 @@ def train_lightning_ast(config: dict):
         target_classes=TARGET_CLASSES,
         **config["data_module"],
     )
-
     model = AST(TARGET_CLASSES).to(DEVICE)
     label_weights = data.get_label_weights().to(DEVICE)
     criterion = nn.CrossEntropyLoss(
         label_weights
     )  # LabelWeightedBCELoss(label_weights)
-
+    if "checkpoint" in config:
+        train_env = TrainingEnvironment.load_from_checkpoint(
+            config["checkpoint"], criterion=criterion, model=model, config=config
+        )
+    else:
+        train_env = TrainingEnvironment(model, criterion, config)
     callbacks = [
         # cb.LearningRateFinder(update_attr=True),
         cb.EarlyStopping("val/loss", patience=5),
models/config/train_local.yaml
CHANGED
@@ -1,4 +1,5 @@
-training_fn:
+training_fn: wav2vec2.train_huggingface
+checkpoint: lightning_logs/version_172/checkpoints/epoch=3-step=4572.ckpt
 device: mps
 seed: 42
 dance_ids: &dance_ids
@@ -23,10 +24,10 @@ data_module:
   test_proportion: 0.2
 
 datasets:
-  preprocessing.dataset.BestBallroomDataset:
-    audio_dir: data/ballroom-songs
-    class_list: *dance_ids
-    audio_window_jitter: 0.7
+  # preprocessing.dataset.BestBallroomDataset:
+  #   audio_dir: data/ballroom-songs
+  #   class_list: *dance_ids
+  #   audio_window_jitter: 0.7
 
   preprocessing.dataset.Music4DanceDataset:
     song_data_path: data/songs_cleaned.csv
@@ -49,7 +50,7 @@ trainer:
   log_every_n_steps: 15
   accelerator: gpu
   max_epochs: 50
-  min_epochs:
+  min_epochs: 2
   fast_dev_run: False
   # gradient_clip_val: 0.5
   # overfit_batches: 1
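The new `checkpoint` key is read both by `DancePredictor.from_config` in `app.py` and by the resume branch added in `models/audio_spectrogram_transformer.py`. A small sketch of that lookup, assuming the YAML above is on disk:

import yaml

with open("models/config/train_local.yaml") as f:
    config = yaml.safe_load(f)

if "checkpoint" in config:
    # Resume training (or serve inference) from the recorded Lightning checkpoint.
    print("Loading weights from", config["checkpoint"])
else:
    # No checkpoint key: start from randomly initialized weights.
    print("Training from scratch")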
models/residual.py
CHANGED
@@ -107,70 +107,6 @@ class ResBlock(nn.Module):
         return out
 
 
-class DancePredictor:
-    def __init__(
-        self,
-        weight_path: str,
-        labels: list[str],
-        expected_duration=6,
-        threshold=0.5,
-        resample_frequency=16000,
-        device="cpu",
-    ):
-        super().__init__()
-
-        self.expected_duration = expected_duration
-        self.threshold = threshold
-        self.resample_frequency = resample_frequency
-        self.preprocess_waveform = WaveformPreprocessing(
-            resample_frequency * expected_duration
-        )
-        self.audio_to_spectrogram = lambda x: x  # TODO: Fix
-        self.labels = np.array(labels)
-        self.device = device
-        self.model = self.get_model(weight_path)
-
-    def get_model(self, weight_path: str) -> nn.Module:
-        weights = torch.load(weight_path, map_location=self.device)["state_dict"]
-        model = ResidualDancer(n_classes=len(self.labels))
-        for key in list(weights):
-            weights[key.replace("model.", "")] = weights.pop(key)
-        model.load_state_dict(weights)
-        return model.to(self.device).eval()
-
-    @classmethod
-    def from_config(cls, config_path: str) -> "DancePredictor":
-        with open(config_path, "r") as f:
-            config = yaml.safe_load(f)
-        return DancePredictor(**config)
-
-    @torch.no_grad()
-    def __call__(self, waveform: np.ndarray, sample_rate: int) -> dict[str, float]:
-        if len(waveform.shape) > 1 and waveform.shape[1] < waveform.shape[0]:
-            waveform = waveform.transpose(1, 0)
-        elif len(waveform.shape) == 1:
-            waveform = np.expand_dims(waveform, 0)
-        waveform = torch.from_numpy(waveform.astype("int16"))
-        waveform = torchaudio.functional.apply_codec(
-            waveform, sample_rate, "wav", channels_first=True
-        )
-
-        waveform = torchaudio.functional.resample(
-            waveform, sample_rate, self.resample_frequency
-        )
-        waveform = self.preprocess_waveform(waveform)
-        spectrogram = self.audio_to_spectrogram(waveform)
-        spectrogram = spectrogram.unsqueeze(0).to(self.device)
-
-        results = self.model(spectrogram)
-        results = results.squeeze(0).detach().cpu().numpy()
-        result_mask = results > self.threshold
-        probs = results[result_mask]
-        dances = self.labels[result_mask]
-
-        return {dance: float(prob) for dance, prob in zip(dances, probs)}
-
-
 def train_residual_dancer(config: dict):
     TARGET_CLASSES = config["dance_ids"]
     DEVICE = config["device"]
models/training_environment.py
CHANGED
@@ -17,10 +17,12 @@ class TrainingEnvironment(pl.LightningModule):
         *args,
         **kwargs,
     ):
-        super().__init__(
+        super().__init__()
         self.model = model
         self.criterion = criterion
-        self.learning_rate =
+        self.learning_rate = config["training_environment"].get(
+            "learning_rate", learning_rate
+        )
         self.experiment_loggers = load_loggers(
             config["training_environment"].get("loggers", {})
         )
@@ -64,7 +66,7 @@ class TrainingEnvironment(pl.LightningModule):
             preds, y, prefix="val/", multi_label=self.has_multi_label_predictions
         )
         metrics["val/loss"] = self.criterion(preds, y)
-        self.log_dict(metrics, prog_bar=True)
+        self.log_dict(metrics, prog_bar=True, sync_dist=True)
 
     def test_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
         x, y = batch
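The `learning_rate` fix above uses a plain `dict.get` fallback, so a value in the YAML's `training_environment` section overrides the constructor default. Illustrative sketch with placeholder values:

# Placeholder config mirroring the training_environment section of the YAML.
config = {"training_environment": {"learning_rate": 3e-4}}
default_learning_rate = 1e-3

# The config value wins when present; otherwise the constructor default is used.
learning_rate = config["training_environment"].get("learning_rate", default_learning_rate)
assert learning_rate == 3e-4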
models/wav2vec2.py
CHANGED
@@ -7,14 +7,13 @@ from transformers import AutoModelForAudioClassification, TrainingArguments, Tra
 
 from preprocessing.dataset import (
     HuggingFaceDatasetWrapper,
-    BestBallroomDataset,
     get_datasets,
 )
 from preprocessing.pipelines import WaveformTrainingPipeline
 
 from .utils import get_id_label_mapping, compute_hf_metrics
 
-MODEL_CHECKPOINT = "
+MODEL_CHECKPOINT = "m3hrdadfi/wav2vec2-base-100k-voxpopuli-gtzan-music"
 
 
 class Wav2VecFeatureExtractor:
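`MODEL_CHECKPOINT` now points at a public wav2vec2 model fine-tuned on GTZAN music genres. A hedged sketch of how such a checkpoint is typically loaded for audio classification with `transformers` (the label list below is an illustrative placeholder; the repo derives its mappings via `get_id_label_mapping`):

from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

checkpoint = "m3hrdadfi/wav2vec2-base-100k-voxpopuli-gtzan-music"

# Placeholder dance labels; the real list comes from the config's dance_ids.
labels = ["Waltz", "Tango", "Salsa"]
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = AutoModelForAudioClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # the GTZAN genre head is replaced with a new one
)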