Spaces:

karlopintaric
/

instrument-recognizer-api

Running

App Files Files Community

instrument-recognizer-api / src /modeling /learner.py

Karlo Pintaric

Upload 25 files

fdc1efd 11 months ago

raw

history blame contribute delete

No virus

11.1 kB

	from abc import ABC, abstractmethod
	from typing import Tuple

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.optim as optim
	import wandb
	from torch.utils.data import DataLoader
	from tqdm.autonotebook import tqdm

	import modeling.loss as loss_module
	import modeling.metrics as metrics_module
	from modeling.loss import HardDistillationLoss
	from modeling.models import freeze, layerwise_lr_decay
	from modeling.utils import init_obj


	class BaseLearner(ABC):
	    """
	    Abstract base class for a learner.

	    Stores the data loaders and configuration, selects the compute device
	    ("cuda" when available, otherwise "cpu") and moves the model onto it.
	    Subclasses must implement ``fit``, ``_train_epoch`` and ``_test_epoch``.

	    :param train_dl: DataLoader for training data
	    :type train_dl: DataLoader
	    :param valid_dl: DataLoader for validation data
	    :type valid_dl: DataLoader
	    :param model: Model to be trained
	    :type model: nn.Module
	    :param config: Configuration object
	    :type config: Any
	    """

	    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
	        # Pick the device first so the model can be placed on it right away.
	        if torch.cuda.is_available():
	            target_device = "cuda"
	        else:
	            target_device = "cpu"

	        self.train_dl, self.valid_dl = train_dl, valid_dl
	        self.device = target_device
	        self.model = model.to(target_device)
	        self.config = config

	    @abstractmethod
	    def fit(self):
	        """Abstract method for fitting the model."""

	    @abstractmethod
	    def _train_epoch(self):
	        """Abstract method for training the model for one epoch."""

	    @abstractmethod
	    def _test_epoch(self):
	        """Abstract method for testing the model for one epoch."""


	class Learner(BaseLearner):
	    """
	    Concrete learner that trains a model with gradient accumulation, a
	    gradient scaler and per-step learning-rate scheduling, logging to
	    Weights & Biases.
	    """

	    # Number of training batches between intermediate validation passes
	    # when ``_train_epoch`` runs in distillation mode.
	    DISTILL_VAL_INTERVAL = 2500

	    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
	        """
	        Build the loss, optimizer, scheduler, metric tracker and grad scaler.

	        :param train_dl: DataLoader for training data
	        :type train_dl: DataLoader
	        :param valid_dl: DataLoader for validation data
	        :type valid_dl: DataLoader
	        :param model: Model to be trained
	        :type model: nn.Module
	        :param config: Configuration object; this class reads ``num_gpus``, ``loss``,
	            ``optimizer``, ``scheduler``, ``epochs``, ``num_accum``, ``verbose``,
	            ``metrics`` and ``save_last_checkpoint`` from it
	        :type config: Any
	        """

	        super().__init__(train_dl, valid_dl, model, config)

	        self.model = torch.nn.DataParallel(module=self.model, device_ids=list(range(config.num_gpus)))
	        # NOTE(review): init_obj presumably instantiates the object described by the
	        # config entry from the given module -- confirm against modeling.utils.
	        self.loss_fn = init_obj(self.config.loss, loss_module)
	        params = layerwise_lr_decay(self.config, self.model)
	        self.optimizer = init_obj(self.config.optimizer, optim, params)
	        self.scheduler = init_obj(
	            self.config.scheduler,
	            optim.lr_scheduler,
	            self.optimizer,
	            max_lr=[param["lr"] for param in params],
	            epochs=self.config.epochs,
	            # The scheduler is stepped once per optimizer step, i.e. once per
	            # accumulation group, not once per batch.
	            steps_per_epoch=int(np.ceil(len(train_dl) / self.config.num_accum)),
	        )

	        self.verbose = self.config.verbose
	        self.metrics = MetricTracker(self.config.metrics, self.verbose)
	        self.scaler = torch.cuda.amp.GradScaler()

	        # Global batch counters used as x-axes for the per-batch wandb logs.
	        self.train_step = 0
	        self.test_step = 0

	    def fit(self, model_name: str = "model"):
	        """
	        Method to train the model.

	        Runs ``config.epochs`` rounds of training followed by validation, logs
	        the epoch losses to wandb and, when ``config.save_last_checkpoint`` is
	        set, saves the unwrapped model's state dict to ``{model_name}.pth``.

	        :param model_name: Name of the model to be saved, defaults to "model"
	        :type model_name: str, optional
	        """

	        loop = tqdm(range(self.config.epochs), leave=False)

	        for epoch in loop:
	            train_loss = self._train_epoch()
	            val_loss = self._test_epoch()

	            wandb.log({"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch + 1})

	            if self.verbose:
	                # Plain "|" separators: the previous "\|" was an invalid escape
	                # sequence and printed literal backslashes.
	                print(f"| EPOCH: {epoch+1} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f} |\n")
	                self.metrics.display()

	        if self.config.save_last_checkpoint:
	            # self.model is a DataParallel wrapper; save the wrapped module's weights.
	            torch.save(self.model.module.state_dict(), f"{model_name}.pth")

	    def _train_epoch(self, distill: bool = False):
	        """
	        Method to perform one epoch of training.

	        Gradients are accumulated over ``config.num_accum`` batches before each
	        optimizer/scheduler step. The loss used for backpropagation is divided
	        by ``num_accum``; the loss that is displayed, logged and averaged is
	        the undivided per-batch value so that it is comparable with the
	        validation loss.

	        :param distill: Flag to indicate if knowledge distillation is used, defaults to False
	        :type distill: bool, optional
	        :return: Average training loss for the epoch
	        :rtype: float
	        """

	        if distill:
	            print("Distilling knowledge...", flush=True)

	        loop = tqdm(self.train_dl, leave=False)
	        self.model.train()

	        num_batches = len(self.train_dl)
	        num_accum = self.config.num_accum
	        train_loss = 0

	        for idx, (xb, yb) in enumerate(loop):
	            xb = xb.to(self.device)
	            yb = yb.to(self.device)

	            # forward (autocast is switched off when distilling)
	            with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=not distill):
	                predictions = self.model(xb)

	                if distill:
	                    # KDloss_fn is provided by the distillation subclass.
	                    loss = self.KDloss_fn(xb, predictions, yb)
	                else:
	                    loss = self.loss_fn(predictions, yb)

	            # Remember the true batch loss for reporting before scaling it
	            # down for gradient accumulation.
	            batch_loss = loss.item()
	            loss = loss / num_accum

	            # backward
	            self.scaler.scale(loss).backward()
	            wandb.log({f"lr_param_group_{i}": lr for i, lr in enumerate(self.scheduler.get_last_lr())})

	            # Step at the end of every accumulation group and on the final batch,
	            # so a trailing partial group is not dropped.
	            if ((idx + 1) % num_accum == 0) or (idx + 1 == num_batches):
	                self.scaler.step(self.optimizer)
	                self.scaler.update()
	                self.scheduler.step()
	                self.optimizer.zero_grad()

	            # update loop
	            loop.set_postfix(loss=batch_loss)
	            self.train_step += 1
	            wandb.log({"train_loss_per_batch": batch_loss, "train_step": self.train_step})
	            train_loss += batch_loss

	            if distill:
	                # Periodic validation inside the epoch, skipped on the last batch
	                # because fit() validates right after the epoch anyway.
	                if ((idx + 1) % self.DISTILL_VAL_INTERVAL == 0) and not (idx + 1 == num_batches):
	                    val_loss = self._test_epoch()
	                    wandb.log({"val_loss": val_loss})
	                    # _test_epoch() put the model in eval mode; switch back.
	                    self.model.train()

	        train_loss /= num_batches

	        return train_loss

	    def _test_epoch(self):
	        """
	        Method to perform one epoch of validation/testing.

	        Runs the model over ``valid_dl`` without gradients, logs the loss of
	        every batch, and passes the sigmoid of all predictions together with
	        all targets (as NumPy arrays) to the metric tracker.

	        :return: Average validation/test loss for the epoch
	        :rtype: float
	        """

	        loop = tqdm(self.valid_dl, leave=False)
	        self.model.eval()

	        num_batches = len(self.valid_dl)
	        preds = []
	        targets = []
	        test_loss = 0

	        with torch.no_grad():
	            for xb, yb in loop:
	                xb, yb = xb.to(self.device), yb.to(self.device)
	                pred = self.model(xb)
	                loss = self.loss_fn(pred, yb).item()
	                self.test_step += 1
	                wandb.log({"valid_loss_per_batch": loss, "test_step": self.test_step})
	                test_loss += loss

	                # The loss consumes raw model outputs; metrics get their sigmoid.
	                pred = torch.sigmoid(pred)
	                preds.extend(pred.cpu().numpy())
	                targets.extend(yb.cpu().numpy())

	        preds, targets = np.array(preds), np.array(targets)
	        self.metrics.update(preds, targets)
	        test_loss /= num_batches

	        return test_loss


	class KDLearner(Learner):
	    """
	    Learner variant that trains a student model with knowledge distillation.

	    The teacher is frozen, moved to the learner's device and wrapped in
	    ``nn.DataParallel``; a ``HardDistillationLoss`` is built from the teacher,
	    the regular loss function, the thresholds and the device. The gradient
	    scaler is replaced by a disabled one.

	    :param train_dl: Train data loader
	    :type train_dl: DataLoader
	    :param valid_dl: Validation data loader
	    :type valid_dl: DataLoader
	    :param student_model: Student model to be trained
	    :type student_model: nn.Module
	    :param teacher: Teacher model for knowledge distillation
	    :type teacher: nn.Module
	    :param thresholds: Thresholds for HardDistillationLoss
	    :type thresholds: List[float]
	    :param config: Configuration object for training
	    :type config: Config
	    """

	    def __init__(self, train_dl, valid_dl, student_model, teacher, thresholds, config):
	        super().__init__(train_dl, valid_dl, student_model, config)

	        # Freeze first, then place on the device, then wrap for multi-GPU use.
	        frozen_teacher = freeze(teacher)
	        frozen_teacher = frozen_teacher.to(self.device)
	        self.teacher = nn.DataParallel(frozen_teacher)

	        self.KDloss_fn = HardDistillationLoss(
	            self.teacher,
	            self.loss_fn,
	            thresholds,
	            self.device,
	        )

	        # Overrides the scaler created by the parent with a disabled one.
	        self.scaler = torch.cuda.amp.GradScaler(enabled=False)

	    def _train_epoch(self):
	        """
	        Method to perform one epoch of training with knowledge distillation.

	        :return: Average training loss for the epoch
	        :rtype: float
	        """

	        epoch_loss = super()._train_epoch(distill=True)
	        return epoch_loss


	class MetricTracker:
	    """
	    Metric Tracker class for tracking evaluation metrics during model validation.

	    Each metric is looked up by name on ``modeling.metrics``. ``update()``
	    evaluates every metric on the given predictions and targets, stores the
	    results and logs them to Weights & Biases using wandb.log(). ``display()``
	    prints the most recently stored results; it does not consult ``verbose``
	    itself, the caller decides whether to invoke it.

	    :param metrics: Names of the metric functions (attributes of ``modeling.metrics``) to track
	    :type metrics: List[str]
	    :param verbose: Flag stored for callers to decide whether to print results, defaults to True
	    :type verbose: bool, optional
	    """

	    def __init__(self, metrics, verbose: bool = True):
	        self.metrics_fn = [getattr(metrics_module, metric) for metric in metrics]
	        self.verbose = verbose
	        # Filled by update(); stays None until the first validation pass.
	        self.result = None

	    def update(self, preds, targets):
	        """
	        Update the metric tracker with the latest predictions and targets.

	        :param preds: Model predictions
	        :type preds: numpy.ndarray
	        :param targets: Ground truth targets
	        :type targets: numpy.ndarray
	        """

	        self.result = {metric.__name__: metric(preds, targets) for metric in self.metrics_fn}
	        wandb.log(self.result)

	    def display(self):
	        """
	        Display the tracked metric results.

	        Prints nothing when no results have been recorded yet, instead of
	        failing on the initial ``None`` value.
	        """

	        if not self.result:
	            return

	        for k, v in self.result.items():
	            print(f"{k}: {v:.2f}")


	def get_preds(data: DataLoader, model: nn.Module, device: str = "cpu") -> Tuple[np.ndarray, np.ndarray]:
	    """
	    Run a model over a data loader and collect its predictions and the targets.

	    The model is moved to ``device`` and put in eval mode; inference runs
	    without gradient tracking and the sigmoid of the model output is stored.

	    :param data: A PyTorch DataLoader containing the data to predict on.
	    :type data: torch.utils.data.DataLoader
	    :param model: A PyTorch model to use for predictions.
	    :type model: torch.nn.Module
	    :param device: The device to use for predictions (default is "cpu").
	    :type device: str
	    :raises TypeError: If any of the input arguments is of an incorrect type.
	    :return: A tuple containing two NumPy arrays: the predictions and the targets.
	    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
	    """

	    # Validate arguments in declaration order; the first mismatch wins.
	    expectations = (
	        (data, DataLoader, "The 'data' argument must be a PyTorch DataLoader."),
	        (model, nn.Module, "The 'model' argument must be a PyTorch model."),
	        (device, str, "The 'device' argument must be a string."),
	    )
	    for value, expected_type, message in expectations:
	        if not isinstance(value, expected_type):
	            raise TypeError(message)

	    progress = tqdm(data, leave=False)
	    model = model.to(device)
	    model.eval()

	    collected_preds, collected_targets = [], []

	    with torch.no_grad():
	        for inputs, labels in progress:
	            inputs = inputs.to(device)
	            labels = labels.to(device)
	            probabilities = torch.sigmoid(model(inputs))
	            # extend() appends one entry per sample of the batch.
	            collected_preds.extend(probabilities.cpu().numpy())
	            collected_targets.extend(labels.cpu().numpy())

	    return np.array(collected_preds), np.array(collected_targets)