Spaces:

waidhoferj
/

dance-classifier

Runtime error

File size: 5,797 Bytes

from typing import Any
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    ASTConfig,
    ASTFeatureExtractor,
    ASTForAudioClassification,
)
import torch
from torch import nn
from models.training_environment import TrainingEnvironment
from preprocessing.pipelines import WaveformTrainingPipeline

from preprocessing.dataset import (
    DanceDataModule,
    HuggingFaceDatasetWrapper,
    get_datasets,
)
from preprocessing.dataset import get_music4dance_examples
from .utils import get_id_label_mapping, compute_hf_metrics

import pytorch_lightning as pl
from pytorch_lightning import callbacks as cb

MODEL_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"


class AST(nn.Module):
    def __init__(self, labels, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        id2label, label2id = get_id_label_mapping(labels)
        config = ASTConfig(
            hidden_size=256,
            num_hidden_layers=6,
            num_attention_heads=4,
            id2label=id2label,
            label2id=label2id,
            num_labels=len(label2id),
            ignore_mismatched_sizes=True,
        )
        self.model = ASTForAudioClassification(config)

    def forward(self, x):
        return self.model(x).logits


# TODO: Remove waveform normalization from ASTFeatureExtractor.
# Find correct mean and std dev
# Find correct max length
class ASTExtractorWrapper:
    def __init__(self, sampling_rate=16000, return_tensors="pt") -> None:
        max_length = 1024
        self.extractor = ASTFeatureExtractor(do_normalize=False, max_length=max_length)
        self.sampling_rate = sampling_rate
        self.return_tensors = return_tensors
        self.waveform_pipeline = WaveformTrainingPipeline()  # TODO configure from yaml

    def __call__(self, x) -> Any:
        x = self.waveform_pipeline(x)
        device = x.device
        x = x.squeeze(0).numpy()
        x = self.extractor(
            x, return_tensors=self.return_tensors, sampling_rate=self.sampling_rate
        )

        x = x["input_values"].squeeze(0).to(device)
        # normalize
        x = (x - x.mean()) / x.std()
        return x


def train_lightning_ast(config: dict):
    """
    work on integration between waveform dataset and environment. Should work for both HF and PTL.
    """
    TARGET_CLASSES = config["dance_ids"]
    DEVICE = config["device"]
    SEED = config["seed"]
    pl.seed_everything(SEED, workers=True)
    feature_extractor = ASTExtractorWrapper()
    dataset = get_datasets(config["datasets"], feature_extractor)
    data = DanceDataModule(
        dataset,
        target_classes=TARGET_CLASSES,
        **config["data_module"],
    )
    model = AST(TARGET_CLASSES).to(DEVICE)
    label_weights = data.get_label_weights().to(DEVICE)
    criterion = nn.CrossEntropyLoss(
        label_weights
    )  # LabelWeightedBCELoss(label_weights)
    if "checkpoint" in config:
        train_env = TrainingEnvironment.load_from_checkpoint(
            config["checkpoint"], criterion=criterion, model=model, config=config
        )
    else:
        train_env = TrainingEnvironment(model, criterion, config)
    callbacks = [
        # cb.LearningRateFinder(update_attr=True),
        cb.EarlyStopping("val/loss", patience=5),
        cb.RichProgressBar(),
    ]
    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
    trainer.fit(train_env, datamodule=data)
    trainer.test(train_env, datamodule=data)


def train_huggingface_ast(config: dict):
    TARGET_CLASSES = config["dance_ids"]
    DEVICE = config["device"]
    SEED = config["seed"]
    OUTPUT_DIR = "models/weights/ast"
    batch_size = config["data_module"]["batch_size"]
    epochs = config["data_module"]["min_epochs"]
    test_proportion = config["data_module"].get("test_proportion", 0.2)
    pl.seed_everything(SEED, workers=True)
    dataset = get_datasets(config["datasets"])
    hf_dataset = HuggingFaceDatasetWrapper(dataset)
    id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
    model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
    preprocess_waveform = lambda wf: feature_extractor(
        wf,
        sampling_rate=train_ds.resample_frequency,
        # padding="max_length",
        # return_tensors="pt",
    )
    hf_dataset.append_to_pipeline(preprocess_waveform)
    test_proportion = config["data_module"]["test_proportion"]
    train_proporition = 1 - test_proportion
    train_ds, test_ds = torch.utils.data.random_split(
        hf_dataset, [train_proporition, test_proportion]
    )

    model = AutoModelForAudioClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(TARGET_CLASSES),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    ).to(DEVICE)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=5,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        warmup_ratio=0.1,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        use_mps_device=DEVICE == "mps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=feature_extractor,
        compute_metrics=compute_hf_metrics,
    )
    trainer.train()
    return model