# models/audio_spectrogram_transformer.py
from typing import Any
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    ASTConfig,
    ASTFeatureExtractor,
    ASTForAudioClassification,
)
import torch
from torch import nn
from models.training_environment import TrainingEnvironment
from preprocessing.pipelines import WaveformTrainingPipeline
from preprocessing.dataset import (
    DanceDataModule,
    HuggingFaceDatasetWrapper,
    get_datasets,
)
from .utils import get_id_label_mapping, compute_hf_metrics
import pytorch_lightning as pl
from pytorch_lightning import callbacks as cb

MODEL_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"


class AST(nn.Module):
    """Compact Audio Spectrogram Transformer classifier initialized from a reduced
    ASTConfig rather than from the pretrained checkpoint (contrast with
    train_huggingface_ast below, which fine-tunes MODEL_CHECKPOINT)."""

    def __init__(self, labels, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        id2label, label2id = get_id_label_mapping(labels)
        config = ASTConfig(
            hidden_size=256,
            num_hidden_layers=6,
            num_attention_heads=4,
            id2label=id2label,
            label2id=label2id,
            num_labels=len(label2id),
            ignore_mismatched_sizes=True,
        )
        self.model = ASTForAudioClassification(config)

    def forward(self, x):
        return self.model(x).logits
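
# A minimal usage sketch (assumed label names; not called anywhere in this module).
# ASTFeatureExtractor produces features of shape (batch, max_length, num_mel_bins),
# i.e. (batch, 1024, 128) with the ASTConfig defaults used above.
def _example_ast_forward() -> torch.Tensor:
    model = AST(["waltz", "tango", "foxtrot"])  # hypothetical dance labels
    features = torch.randn(2, 1024, 128)  # random stand-in for extracted spectrogram features
    return model(features)  # logits with shape (2, 3)
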
# TODO: Remove waveform normalization from ASTFeatureExtractor.
# Find correct mean and std dev
# Find correct max length
class ASTExtractorWrapper:
    """Runs a waveform through the training pipeline, extracts AST spectrogram
    features, and normalizes them to zero mean and unit variance."""

    def __init__(self, sampling_rate=16000, return_tensors="pt") -> None:
        max_length = 1024
        self.extractor = ASTFeatureExtractor(do_normalize=False, max_length=max_length)
        self.sampling_rate = sampling_rate
        self.return_tensors = return_tensors
        self.waveform_pipeline = WaveformTrainingPipeline()  # TODO: configure from yaml

    def __call__(self, x) -> Any:
        x = self.waveform_pipeline(x)
        device = x.device
        x = x.squeeze(0).numpy()
        x = self.extractor(
            x, return_tensors=self.return_tensors, sampling_rate=self.sampling_rate
        )
        x = x["input_values"].squeeze(0).to(device)
        # Normalize manually since do_normalize is disabled above.
        x = (x - x.mean()) / x.std()
        return x
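
# Illustrative sketch of the features ASTExtractorWrapper is built around (the
# waveform is a random stand-in for a real 16 kHz clip; nothing in the training
# code calls this).
def _example_ast_features() -> torch.Tensor:
    extractor = ASTFeatureExtractor(do_normalize=False, max_length=1024)
    waveform = torch.randn(16000 * 5).numpy()  # 5 seconds of fake mono audio
    features = extractor(waveform, sampling_rate=16000, return_tensors="pt")
    return features["input_values"]  # shape: (1, 1024, 128)
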

def train_lightning_ast(config: dict):
    """Train the scratch AST model inside the PyTorch Lightning TrainingEnvironment.

    TODO: Improve integration between the waveform dataset and the training
    environment so it works for both Hugging Face and PyTorch Lightning.
    """
    TARGET_CLASSES = config["dance_ids"]
    DEVICE = config["device"]
    SEED = config["seed"]
    pl.seed_everything(SEED, workers=True)
    feature_extractor = ASTExtractorWrapper()
    dataset = get_datasets(config["datasets"], feature_extractor)
    data = DanceDataModule(
        dataset,
        target_classes=TARGET_CLASSES,
        **config["data_module"],
    )
    model = AST(TARGET_CLASSES).to(DEVICE)
    label_weights = data.get_label_weights().to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=label_weights)  # alternative: LabelWeightedBCELoss(label_weights)
    if "checkpoint" in config:
        train_env = TrainingEnvironment.load_from_checkpoint(
            config["checkpoint"], criterion=criterion, model=model, config=config
        )
    else:
        train_env = TrainingEnvironment(model, criterion, config)
    callbacks = [
        # cb.LearningRateFinder(update_attr=True),
        cb.EarlyStopping("val/loss", patience=5),
        cb.RichProgressBar(),
    ]
    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
    trainer.fit(train_env, datamodule=data)
    trainer.test(train_env, datamodule=data)
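
# A hypothetical sketch of the keys train_lightning_ast reads from its config
# (placeholder values; the real config is assembled outside this module):
#
#   config = {
#       "dance_ids": ["waltz", "tango"],          # target classes
#       "device": "cuda",
#       "seed": 42,
#       "datasets": {...},                        # forwarded to get_datasets
#       "data_module": {"batch_size": 32},        # extra DanceDataModule kwargs
#       "trainer": {"max_epochs": 20},            # extra pl.Trainer kwargs
#       # "checkpoint": "path/to/env.ckpt",       # optional: resume a TrainingEnvironment
#   }
#   train_lightning_ast(config)
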

def train_huggingface_ast(config: dict):
    """Fine-tune the pretrained AST checkpoint with the Hugging Face Trainer."""
    TARGET_CLASSES = config["dance_ids"]
    DEVICE = config["device"]
    SEED = config["seed"]
    OUTPUT_DIR = "models/weights/ast"
    batch_size = config["data_module"]["batch_size"]
    epochs = config["data_module"]["min_epochs"]
    test_proportion = config["data_module"].get("test_proportion", 0.2)
    pl.seed_everything(SEED, workers=True)
    dataset = get_datasets(config["datasets"])
    hf_dataset = HuggingFaceDatasetWrapper(dataset)
    id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
    model_checkpoint = MODEL_CHECKPOINT
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
    def preprocess_waveform(wf):
        # The AST checkpoint's feature extractor expects 16 kHz audio, so use its
        # sampling_rate attribute as the resample target.
        return feature_extractor(
            wf,
            sampling_rate=feature_extractor.sampling_rate,
            # padding="max_length",
            # return_tensors="pt",
        )

    hf_dataset.append_to_pipeline(preprocess_waveform)
    train_proportion = 1 - test_proportion
    train_ds, test_ds = torch.utils.data.random_split(
        hf_dataset, [train_proportion, test_proportion]
    )
    model = AutoModelForAudioClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(TARGET_CLASSES),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    ).to(DEVICE)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=5,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        warmup_ratio=0.1,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        use_mps_device=DEVICE == "mps",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=feature_extractor,
        compute_metrics=compute_hf_metrics,
    )
    trainer.train()
    return model
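
# Hypothetical invocation sketch (placeholder values; mirrors the keys read above):
#
#   config = {
#       "dance_ids": ["waltz", "tango"],
#       "device": "cuda",
#       "seed": 42,
#       "datasets": {...},                                     # forwarded to get_datasets
#       "data_module": {"batch_size": 8, "min_epochs": 10, "test_proportion": 0.2},
#   }
#   model = train_huggingface_ast(config)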