Spaces: Runtime error

waidhoferj committed · Commit 4b8361a · 1 Parent(s): dad3c09

lightning modules, spotify scraping and configs
Browse files

- .gitignore +9 -2
- app.py +32 -82
- models/config/dance-predictor.yaml +26 -0
- models/config/train.yaml +23 -0
- models/residual.py +107 -5
- models/utils.py +38 -0
- preprocessing/dataset.py +48 -2
- preprocessing/pipelines.py +63 -0
- preprocessing/preprocess.py +0 -20
- scrapers/music4dance.py +0 -6
- scrapers/spotify.py +87 -0
- scrapers/utils.py +9 -0
.gitignore CHANGED
@@ -1,8 +1,15 @@
 __pycache__
 .DS_Store
 data/samples
-data/samples
+data/spotify-samples
 data/samples-backup.zip
 data/songs.csv
+data/songs_original.csv
 logs
 gradio_cached_examples
+explore.ipynb
+scrapers/auth
+lightning_logs
+data/backup_1.csv
+data/backup.csv
+data/*.zip
app.py CHANGED
@@ -1,105 +1,55 @@
 from pathlib import Path
 import gradio as gr
 import numpy as np
-import torch
-from preprocessing.preprocess import AudioPipeline
-from models.residual import ResidualDancer
+from models.residual import DancePredictor
 import os
-import json
 from functools import cache
-
-
-DEVICE = "cpu"
-
-@cache
-def get_model(device) -> tuple[ResidualDancer, np.ndarray]:
-    model_path = "models/weights/ResidualDancer"
-    weights = os.path.join(model_path, "dancer_net.pt")
-    config_path = os.path.join(model_path, "config.json")
-
-    with open(config_path) as f:
-        config = json.load(f)
-    labels = np.array(sorted(config["classes"]))
-
-    model = ResidualDancer(n_classes=len(labels))
-    model.load_state_dict(torch.load(weights, map_location=DEVICE))
-    model = model.to(device).eval()
-    return model, labels
+from pathlib import Path
+CONFIG_FILE = Path("models/config/dance-predictor.yaml")
 
-@cache
-def get_pipeline(sample_rate:int) -> AudioPipeline:
-    return AudioPipeline(input_freq=sample_rate)
 
 @cache
-def
-
-    return
-
+def get_model(config_path:str) -> DancePredictor:
+    model = DancePredictor.from_config(config_path)
+    return model
 
 def predict(audio: tuple[int, np.ndarray]) -> list[str]:
     sample_rate, waveform = audio
 
-
-
-
-
-
-    audio_pipeline = get_pipeline(sample_rate)
-    model, labels = get_model(DEVICE)
-
-    if sample_len > len(waveform):
-        raise gr.Error("You must record for at least 6 seconds")
-    if len(waveform.shape) > 1 and waveform.shape[1] > 1:
-        waveform = waveform.transpose(1,0)
-        waveform = waveform.mean(axis=0, keepdims=True)
-    else:
-        waveform = np.expand_dims(waveform, 0)
-    waveform = waveform[: ,:sample_len]
-    waveform = (waveform - waveform.min()) / (waveform.max() - waveform.min()) * 2 - 1
-    waveform = waveform.astype("float32")
-    waveform = torch.from_numpy(waveform)
-    spectrogram = audio_pipeline(waveform)
-    spectrogram = spectrogram.unsqueeze(0).to(DEVICE)
-
-    with torch.no_grad():
-        results = model(spectrogram)
-    dance_mapping = get_dance_map()
-    results = results.squeeze(0).detach().cpu().numpy()
-    result_mask = results > threshold
-    probs = results[result_mask]
-    dances = labels[result_mask]
-
-    return {dance_mapping[dance_id]:float(prob) for dance_id, prob in zip(dances, probs)} if len(dances) else "Couldn't find a dance."
+    model = get_model(CONFIG_FILE)
+    results = model(waveform,sample_rate)
+    return results if len(results) else "Dance Not Found"
 
 
 def demo():
     title = "Dance Classifier"
-    description = "
+    description = "What should I dance to this song? Pass some audio to the Dance Classifier find out!"
+    song_samples = Path(os.path.dirname(__file__), "assets", "song-samples")
+    example_audio = [str(song) for song in song_samples.iterdir() if song.name[0] != '.']
+    all_dances = get_model(CONFIG_FILE).labels
+
+    recording_interface = gr.Interface(
+        fn=predict,
+        description="Record at least **6 seconds** of the song.",
+        inputs=gr.Audio(source="microphone", label="Song Recording"),
+        outputs=gr.Label(label="Dances"),
+        examples=example_audio
+    )
+    uploading_interface = gr.Interface(
+        fn=predict,
+        inputs=gr.Audio(label="Song Audio File"),
+        outputs=gr.Label(label="Dances"),
+        examples=example_audio
+    )
+
     with gr.Blocks() as app:
         gr.Markdown(f"# {title}")
         gr.Markdown(description)
-
-
-
-
-        with gr.Tab("Upload Song") as t:
-            audio_file = gr.Audio(label="Song Audio File")
-            audio_file_submit = gr.Button("Predict")
-            song_samples = Path(os.path.dirname(__file__), "assets", "song-samples")
-            example_audio = [str(song) for song in song_samples.iterdir() if song.name[0] != '.']
-
-            labels = gr.Label(label="Dances")
+        gr.TabbedInterface([uploading_interface, recording_interface], ["Upload Song", "Record Song"])
+        with gr.Accordion("See all dances", open=False):
+            gr.Markdown("\n".join(f"- {dance}" for dance in all_dances))
 
-
-            gr.Examples(
-                examples=example_audio,
-                inputs=audio_file,
-                outputs=labels,
-                fn=predict,
-            )
-
-            audio_file_submit.click(fn=predict, inputs=audio_file, outputs=labels)
-            mic_submit.click(fn=predict, inputs=mic_audio, outputs=labels)
+
 
     return app
 
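A minimal launch sketch (not part of this commit) showing how the demo() factory above would typically be served; the __main__ guard is an assumption, though launching a gr.Blocks app via launch() is standard Gradio.

if __name__ == "__main__":
    # Build the Blocks app defined above and serve it locally (Gradio's default port is 7860).
    demo().launch()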
models/config/dance-predictor.yaml ADDED
@@ -0,0 +1,26 @@
+weight_path: lightning_logs/version_0/checkpoints/epoch=5-step=870.ckpt
+expected_duration: 6
+threshold: 0.5
+resample_frequency: 16000
+device: cpu
+labels:
+- Argentine Tango
+- Balboa
+- Bachata
+- Blues
+- Cha Cha
+- Cumbia
+- Carolina Shag
+- East Coast Swing
+- Hustle
+- Jive
+- Lindy Hop
+- Quickstep
+- Rumba
+- Slow Foxtrot
+- Salsa
+- Samba
+- Slow Waltz
+- Tango (Ballroom)
+- Viennese Waltz
+- West Coast Swing
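For reference, a short sketch of how this config is consumed: DancePredictor.from_config (added in models/residual.py below) reads the YAML with yaml.safe_load and splats the keys into the constructor, so every key above must match a constructor argument.

from models.residual import DancePredictor

# Equivalent to DancePredictor(weight_path=..., labels=[...], expected_duration=6,
#                              threshold=0.5, resample_frequency=16000, device="cpu")
predictor = DancePredictor.from_config("models/config/dance-predictor.yaml")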
models/config/train.yaml ADDED
@@ -0,0 +1,23 @@
+device: mps
+seed: 42
+dance_ids:
+- ATN
+- BBA
+- BCH
+- BLU
+- CHA
+- CMB
+- CSG
+- ECS
+- HST
+- JIV
+- LHP
+- QST
+- RMB
+- SFT
+- SLS
+- SMB
+- SWZ
+- TGO
+- VWZ
+- WCS
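No training script ships in this commit, so the following is only an assumed sketch of how train.yaml might be read; the keys mirror the file above.

import yaml

with open("models/config/train.yaml") as f:
    train_config = yaml.safe_load(f)

seed = train_config["seed"]            # 42
dance_ids = train_config["dance_ids"]  # ["ATN", "BBA", ..., "WCS"]
device = train_config["device"]        # "mps"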
models/residual.py CHANGED
@@ -1,12 +1,18 @@
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+import pytorch_lightning as pl
+import numpy as np
+import torchaudio
+import yaml
+from .utils import calculate_metrics
+from preprocessing.pipelines import AudioPipeline
+
 # Architecture based on: https://github.com/minzwon/sota-music-tagging-models/blob/36aa13b7205ff156cf4dcab60fd69957da453151/training/model.py
 
 class ResidualDancer(nn.Module):
-    def __init__(self,
-                n_channels=128,
-                n_classes=50):
+    def __init__(self, n_channels=128, n_classes=50):
         super().__init__()
 
         # Spectrogram
@@ -50,7 +56,7 @@ class ResidualDancer(nn.Module):
         x = nn.Sigmoid()(x)
 
         return x
-
+
 
 class ResBlock(nn.Module):
     def __init__(self, input_channels, output_channels, shape=3, stride=2):
@@ -78,4 +84,100 @@ class ResBlock(nn.Module):
         x = self.bn_3(self.conv_3(x))
         out = x + out
         out = self.relu(out)
-        return out
+        return out
+
+
+class TrainingEnvironment(pl.LightningModule):
+
+    def __init__(self, model: nn.Module, criterion: nn.Module, learning_rate=1e-4, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model = model
+        self.criterion = criterion
+        self.learning_rate = learning_rate
+
+    def training_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int) -> torch.Tensor:
+        features, labels = batch
+        outputs = self.model(features)
+        loss = self.criterion(outputs, labels)
+        batch_metrics = calculate_metrics(outputs, labels)
+        self.log_dict(batch_metrics)
+        return loss
+
+    def validation_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
+        x, y = batch
+        preds = self.model(x)
+        metrics = calculate_metrics(preds, y, prefix="val_")
+        metrics["val_loss"] = self.criterion(preds, y)
+        self.log_dict(metrics)
+
+    def test_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
+        x, y = batch
+        preds = self.model(x)
+        self.log_dict(calculate_metrics(preds, y, prefix="test_"))
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
+
+
+class DancePredictor:
+    def __init__(
+        self,
+        weight_path: str,
+        labels: list[str],
+        expected_duration=6,
+        threshold=0.5,
+        resample_frequency=16000,
+        device="cpu"):
+
+        super().__init__()
+
+        self.expected_duration = expected_duration
+        self.threshold = threshold
+        self.resample_frequency = resample_frequency
+        self.audio_pipeline = AudioPipeline(input_freq=self.resample_frequency)
+        self.labels = np.array(labels)
+        self.device = device
+        self.model = self.get_model(weight_path)
+
+    def get_model(self, weight_path: str) -> nn.Module:
+        weights = torch.load(weight_path, map_location=self.device)["state_dict"]
+        model = ResidualDancer(n_classes=len(self.labels))
+        for key in list(weights):
+            weights[key.replace("model.", "")] = weights.pop(key)
+        model.load_state_dict(weights)
+        return model.to(self.device).eval()
+
+    @classmethod
+    def from_config(cls, config_path: str) -> "DancePredictor":
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        return DancePredictor(**config)
+
+    @torch.no_grad()
+    def __call__(self, waveform: np.ndarray, sample_rate: int) -> dict[str, float]:
+        min_sample_len = sample_rate * self.expected_duration
+        if min_sample_len > len(waveform):
+            raise Exception("You must record for at least 6 seconds")
+        if len(waveform.shape) > 1 and waveform.shape[1] > 1:
+            waveform = waveform.transpose(1, 0)
+            waveform = waveform.mean(axis=0, keepdims=True)
+        else:
+            waveform = np.expand_dims(waveform, 0)
+        waveform = waveform[:, :min_sample_len]
+        waveform = torch.from_numpy(waveform.astype("int16"))
+        waveform = torchaudio.functional.apply_codec(waveform, sample_rate, "wav", channels_first=True)
+
+        waveform = torchaudio.functional.resample(waveform, sample_rate, self.resample_frequency)
+        spectrogram = self.audio_pipeline(waveform)
+        spectrogram = spectrogram.unsqueeze(0).to(self.device)
+
+        results = self.model(spectrogram)
+        results = results.squeeze(0).detach().cpu().numpy()
+        result_mask = results > self.threshold
+        probs = results[result_mask]
+        dances = self.labels[result_mask]
+
+        return {dance: float(prob) for dance, prob in zip(dances, probs)}
+
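A hedged sketch of how the new Lightning pieces fit together (the commit includes no train.py, so the Trainer arguments and criterion choice here are assumptions): ResidualDancer stays a plain nn.Module, TrainingEnvironment wraps it for optimization and metric logging, and DancePredictor is the inference-only wrapper used by app.py.

import pytorch_lightning as pl
import torch.nn as nn
from models.residual import ResidualDancer, TrainingEnvironment

model = ResidualDancer(n_classes=20)                       # 20 dances, matching train.yaml
env = TrainingEnvironment(model, criterion=nn.BCELoss())   # criterion is an assumption
trainer = pl.Trainer(max_epochs=5)                         # accelerator/devices left unspecified
# trainer.fit(env, datamodule=...)  # a DanceDataModule is added in preprocessing/dataset.py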
models/utils.py ADDED
@@ -0,0 +1,38 @@
+import torch.nn as nn
+import torch
+import numpy as np
+from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
+
+class LabelWeightedBCELoss(nn.Module):
+    def __init__(self, label_weights: torch.Tensor, reduction="mean"):
+        super().__init__()
+        self.label_weights = label_weights
+
+        match reduction:
+            case "mean":
+                self.reduction = torch.mean
+            case "sum":
+                self.reduction = torch.sum
+
+    def _log(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.clamp_min(torch.log(x), -100)
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        losses = -self.label_weights * (target * self._log(input) + (1 - target) * self._log(1 - input))
+        return self.reduction(losses)
+
+
+def calculate_metrics(pred, target, threshold=0.5, prefix="") -> dict[str, torch.Tensor]:
+    target = target.detach().cpu().numpy()
+    pred = pred.detach().cpu().numpy()
+    pred = np.array(pred > threshold, dtype=float)
+    metrics = {
+        'precision': precision_score(y_true=target, y_pred=pred, average='macro', zero_division=0),
+        'recall': recall_score(y_true=target, y_pred=pred, average='macro', zero_division=0),
+        'f1': f1_score(y_true=target, y_pred=pred, average='macro', zero_division=0),
+        'accuracy': accuracy_score(y_true=target, y_pred=pred),
+    }
+    if prefix != "":
+        metrics = {prefix + k: v for k, v in metrics.items()}
+
+    return {k: torch.tensor(v, dtype=torch.float32) for k, v in metrics.items()}
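A quick illustration (shapes and values assumed) of the two helpers above: LabelWeightedBCELoss expects per-class weights and sigmoid probabilities, and calculate_metrics thresholds the predictions before computing macro-averaged scores.

import torch
from models.utils import LabelWeightedBCELoss, calculate_metrics

preds = torch.rand(8, 20)                       # sigmoid outputs: 8 windows x 20 dances
targets = torch.randint(0, 2, (8, 20)).float()  # multi-hot dance labels
weights = torch.ones(20)                        # e.g. from DanceDataModule.get_label_weights()

loss = LabelWeightedBCELoss(weights)(preds, targets)
metrics = calculate_metrics(preds, targets, prefix="demo_")  # demo_precision, demo_recall, ...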
preprocessing/dataset.py CHANGED
@@ -1,8 +1,12 @@
 import torch
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, DataLoader, random_split
 import numpy as np
+import pandas as pd
 import torchaudio as ta
-from .
+from .pipelines import AudioPipeline
+import pytorch_lightning as pl
+from .preprocess import get_examples
+
 
 
 class SongDataset(Dataset):
@@ -47,3 +51,45 @@ class SongDataset(Dataset):
     def _label_from_index(self, idx: int) -> torch.Tensor:
         label_idx = idx * self.audio_window_duration // self.audio_duration
         return torch.from_numpy(self.dance_labels[label_idx])
+
+
+class DanceDataModule(pl.LightningDataModule):
+    def __init__(self,
+                 song_data_path="data/songs.csv",
+                 song_audio_path="data/samples",
+                 test_proportion=0.15,
+                 val_proportion=0.1,
+                 target_classes: list[str] = None,
+                 batch_size: int = 64,
+                 num_workers=10
+                 ):
+        super().__init__()
+        self.song_data_path = song_data_path
+        self.song_audio_path = song_audio_path
+        self.val_proportion = val_proportion
+        self.test_proportion = test_proportion
+        self.train_proporition = 1. - test_proportion - val_proportion
+        self.target_classes = target_classes
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+        df = pd.read_csv("data/songs.csv")
+        self.x, self.y = get_examples(df, self.song_audio_path, class_list=self.target_classes)
+
+    def setup(self, stage: str):
+        dataset = SongDataset(self.x, self.y)
+        self.train_ds, self.val_ds, self.test_ds = random_split(dataset, [self.train_proporition, self.val_proportion, self.test_proportion])
+
+    def train_dataloader(self):
+        return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers=self.num_workers)
+
+    def val_dataloader(self):
+        return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers)
+
+    def test_dataloader(self):
+        return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers)
+
+    def get_label_weights(self):
+        return torch.from_numpy(len(self.y) / (len(self.y[0]) * sum(self.y)))
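A sketch of wiring the new DanceDataModule into training (the paths, class subset, and loss pairing are assumptions; get_label_weights and LabelWeightedBCELoss are both added in this commit).

from preprocessing.dataset import DanceDataModule
from models.utils import LabelWeightedBCELoss

data = DanceDataModule(
    song_data_path="data/songs.csv",
    song_audio_path="data/samples",
    target_classes=["ATN", "BBA", "CHA"],  # hypothetical subset of train.yaml dance_ids
    batch_size=64,
)
criterion = LabelWeightedBCELoss(data.get_label_weights())
# trainer.fit(TrainingEnvironment(model, criterion), datamodule=data)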
preprocessing/pipelines.py ADDED
@@ -0,0 +1,63 @@
+import torch
+from torchaudio import transforms as taT, functional as taF
+import torch.nn as nn
+
+class AudioPipeline(torch.nn.Module):
+    def __init__(
+        self,
+        input_freq=16000,
+        resample_freq=16000,
+    ):
+        super().__init__()
+        self.resample = taT.Resample(orig_freq=input_freq, new_freq=resample_freq)
+        self.spec = taT.MelSpectrogram(sample_rate=resample_freq, n_mels=64, n_fft=1024)
+        self.to_db = taT.AmplitudeToDB()
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(0, keepdim=True)
+
+        waveform = (waveform - waveform.mean()) / waveform.abs().max()
+
+        waveform = self.resample(waveform)
+        spectrogram = self.spec(waveform)
+        spectrogram = self.to_db(spectrogram)
+
+        return spectrogram
+
+
+class SpectrogramAugmentationPipeline(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.pipeline = nn.Sequential(
+            taT.FrequencyMasking(80),
+            taT.TimeMasking(80),
+            taT.TimeStretch(80)
+        )
+
+    def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
+        return self.pipeline(spectrogram)
+
+
+class WaveformAugmentationPipeline(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        taF.pitch_shift()
+
+
+class AudioTrainingPipeline(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.waveform_aug = WaveformAugmentationPipeline()
+        self.spec_aug = SpectrogramAugmentationPipeline()
+        self.audio_preprocessing = AudioPipeline()
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        x = self.audio_preprocessing(waveform)
+        x = self.spec_aug(x)
+        return x
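A minimal example (synthetic audio) of the AudioPipeline above: it averages multi-channel input to mono, normalizes, resamples, and returns a dB-scaled mel spectrogram.

import torch
from preprocessing.pipelines import AudioPipeline

pipeline = AudioPipeline(input_freq=16000, resample_freq=16000)
waveform = torch.randn(1, 16000 * 6)   # roughly 6 seconds of mono audio at 16 kHz
spectrogram = pipeline(waveform)       # shape: (1, 64 mel bins, time frames)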
preprocessing/preprocess.py CHANGED
@@ -82,23 +82,3 @@ def get_examples(df:pd.DataFrame, audio_dir:str, class_list=None) -> tuple[list[
 
     return audio_paths, list(labels)
 
-class AudioPipeline(torch.nn.Module):
-    def __init__(
-        self,
-        input_freq=16000,
-        resample_freq=16000,
-    ):
-        super().__init__()
-        self.resample = taT.Resample(orig_freq=input_freq, new_freq=resample_freq)
-        self.spec = taT.MelSpectrogram(sample_rate=resample_freq, n_mels=64, n_fft=1024)
-        self.to_db = taT.AmplitudeToDB()
-
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        if waveform.shape[0] > 1:
-            waveform = waveform.mean(0, keepdim=True)
-        waveform = self.resample(waveform)
-        spectrogram = self.spec(waveform)
-        spectrogram = self.to_db(spectrogram)
-
-        return spectrogram
-
scrapers/music4dance.py CHANGED
@@ -75,12 +75,6 @@ def get_songs(soup: bs) -> dict:
     return songs
 
 
-def download_song(url: str, out_dir: str):
-    response = requests.get(url)
-    filename = url.split("/")[-1]
-    out_file = Path(out_dir, f"{filename}.mp3")
-    with open(out_file, "wb") as f:
-        f.write(response.content)
 
 def scrape_dance_info() -> pd.DataFrame:
     js_obj = re.compile(r"{(.|\n)*}")
scrapers/spotify.py ADDED
@@ -0,0 +1,87 @@
+import spotipy
+from spotipy.oauth2 import SpotifyClientCredentials
+import os
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from utils import download_song
+import time
+
+def set_env():
+    here = os.path.dirname(__file__)
+    with open(os.path.join(here, "auth", "spotify.json"), "r") as f:
+        config = json.load(f)
+    os.environ["SPOTIPY_CLIENT_ID"] = config["client_id"]
+    os.environ["SPOTIPY_CLIENT_SECRET"] = config["client_secret"]
+    os.environ["SPOTIPY_REDIRECT_URI"] = "https://localhost:8080/callback"
+
+set_env()
+
+
+def get_song_preview_url(song_name: str, spotify: spotipy.Spotify, artist: str = None) -> str | None:
+    info = {
+        "track": song_name
+    }
+    if artist is not None:
+        info["artist"] = artist
+    query = " ".join(f"{k}: {v}" for k, v in info.items())
+    results = spotify.search(query, type="track", limit=1)["tracks"]["items"]
+    valid_results = len(results) > 0 and results[0] is not None and "preview_url" in results[0]
+    if not valid_results:
+        return None
+    song = results[0]
+    return song["preview_url"]
+
+def patch_missing_songs(
+    df: pd.DataFrame,
+) -> pd.DataFrame:
+    spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())
+    # find songs with missing previews
+    audio_urls = df["Sample"].replace(".", np.nan)
+    missing_audio = pd.isna(audio_urls)
+    missing_df = df[missing_audio]
+    def patch_preview(row: pd.Series):
+        song: str = row["Title"]
+        artist: str = row["Artist"]
+        preview_url = get_song_preview_url(song, spotify, artist)
+        if preview_url is not None:
+            row["Sample"] = preview_url
+        return row
+    backup_file = open("data/backup_1.csv", "a")
+    rows = []
+    indices = []
+    total_rows = len(missing_df)
+    for i, row in tqdm(missing_df.iloc[11121:].iterrows(), total=total_rows):
+        patched_row = patch_preview(row)
+        backup_file.write(f"{i}, {patched_row['Sample']}\n")
+        rows.append(patch_preview(row))
+        indices.append(i)
+
+    patched_df = pd.DataFrame(rows, index=indices)
+    df.update(patched_df)
+    return df
+
+
+def download_links():
+    start = 3180
+    with open("data/backup_2.csv") as f:
+        links = [x.split(",")[1].strip() for x in f.readlines()]
+    links = links[start:]
+    links = [l for l in links if "https" in l]
+    links = links[2680:]
+    for link in tqdm(links, "Songs Downloaded"):
+        download_song(link, "data/spotify-samples")
+        time.sleep(5e-3)  # hopefully wont be rate limited with delay 🤞
+
+
+if __name__ == "__main__":
+    df = pd.read_csv("data/songs.csv")
+    patched = patch_missing_songs(df)
+    patched.to_csv("data/last_part.csv")
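Running the scraper requires a git-ignored scrapers/auth/spotify.json holding the Spotify API credentials that set_env() exports; a sketch of that file's assumed shape (placeholder values, keys are the ones set_env() reads):

{
    "client_id": "YOUR_SPOTIFY_CLIENT_ID",
    "client_secret": "YOUR_SPOTIFY_CLIENT_SECRET"
}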
scrapers/utils.py ADDED
@@ -0,0 +1,9 @@
+import requests
+from pathlib import Path
+
+def download_song(url: str, out_dir: str, file_type="mp3"):
+    response = requests.get(url)
+    filename = url.split("/")[-1]
+    out_file = Path(out_dir, f"{filename}.{file_type}")
+    with open(out_file, "wb") as f:
+        f.write(response.content)
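A one-line usage sketch of the relocated helper (the URL is a placeholder): it saves the response body as <last-path-segment>.<file_type> inside out_dir.

from utils import download_song  # imported this way in scrapers/spotify.py

download_song("https://p.scdn.co/mp3-preview/EXAMPLE", "data/spotify-samples")
# writes data/spotify-samples/EXAMPLE.mp3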