from abc import ABC, abstractmethod
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm

import modeling.loss as loss_module
import modeling.metrics as metrics_module
from modeling.loss import HardDistillationLoss
from modeling.models import freeze, layerwise_lr_decay
from modeling.utils import init_obj


class BaseLearner(ABC):
    """
    Abstract base class for a learner.

    :param train_dl: DataLoader for the training data
    :type train_dl: DataLoader
    :param valid_dl: DataLoader for the validation data
    :type valid_dl: DataLoader
    :param model: Model to be trained
    :type model: nn.Module
    :param config: Configuration object
    :type config: Any
    """

    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = model.to(self.device)
        self.config = config

    @abstractmethod
    def fit(self):
        """Abstract method for fitting the model."""

    @abstractmethod
    def _train_epoch(self):
        """Abstract method for training the model for one epoch."""

    @abstractmethod
    def _test_epoch(self):
        """Abstract method for testing the model for one epoch."""


class Learner(BaseLearner):
    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
        """
        Concrete learner: wraps the model in DataParallel, builds the loss,
        optimizer, and scheduler from the config, and trains with mixed
        precision and gradient accumulation.

        :param train_dl: DataLoader for training data
        :type train_dl: DataLoader
        :param valid_dl: DataLoader for validation data
        :type valid_dl: DataLoader
        :param model: Model to be trained
        :type model: nn.Module
        :param config: Configuration object
        :type config: Any
        """

        super().__init__(train_dl, valid_dl, model, config)

        self.model = torch.nn.DataParallel(module=self.model, device_ids=list(range(config.num_gpus)))
        self.loss_fn = init_obj(self.config.loss, loss_module)
        params = layerwise_lr_decay(self.config, self.model)
        self.optimizer = init_obj(self.config.optimizer, optim, params)
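        # NOTE: the max_lr/epochs/steps_per_epoch kwargs below assume a
        # OneCycleLR-style scheduler in the config; steps_per_epoch counts
        # optimizer steps, which is fewer than batches when gradients are
        # accumulated over num_accum batches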
        self.scheduler = init_obj(
            self.config.scheduler,
            optim.lr_scheduler,
            self.optimizer,
            max_lr=[param["lr"] for param in params],
            epochs=self.config.epochs,
            steps_per_epoch=int(np.ceil(len(train_dl) / self.config.num_accum)),
        )

        self.verbose = self.config.verbose
        self.metrics = MetricTracker(self.config.metrics, self.verbose)
        self.scaler = torch.cuda.amp.GradScaler()

        self.train_step = 0
        self.test_step = 0

    def fit(self, model_name: str = "model"):
        """
        Method to train the model.

        :param model_name: Name of the model to be saved, defaults to "model"
        :type model_name: str, optional
        """

        loop = tqdm(range(self.config.epochs), leave=False)

        for epoch in loop:
            train_loss = self._train_epoch()
            val_loss = self._test_epoch()

            wandb.log({"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch + 1})

            if self.verbose:
                print(f"| EPOCH: {epoch+1} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f} |\n")
                self.metrics.display()

        if self.config.save_last_checkpoint:
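            # unwrap DataParallel (.module) so the checkpoint can later be
            # loaded into a plain nn.Module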
            torch.save(self.model.module.state_dict(), f"{model_name}.pth")

    def _train_epoch(self, distill: bool = False):
        """
        Method to perform one epoch of training.

        :param distill: Flag to indicate if knowledge distillation is used, defaults to False
        :type distill: bool, optional
        :return: Average training loss for the epoch
        :rtype: float
        """

        if distill:
            print("Distilling knowledge...", flush=True)

        loop = tqdm(self.train_dl, leave=False)
        self.model.train()

        num_batches = len(self.train_dl)
        train_loss = 0

        for idx, (xb, yb) in enumerate(loop):
            xb = xb.to(self.device)
            yb = yb.to(self.device)

            # forward
            with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=not distill):
                predictions = self.model(xb)

                if distill:
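                    # KDloss_fn is defined on KDLearner, the only subclass
                    # that calls this method with distill=True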
                    loss = self.KDloss_fn(xb, predictions, yb)
                else:
                    loss = self.loss_fn(predictions, yb)

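                # average the loss over the accumulation window so the summed
                # gradients match a single large-batch update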
                loss /= self.config.num_accum

            # backward
            self.scaler.scale(loss).backward()
            wandb.log({f"lr_param_group_{i}": lr for i, lr in enumerate(self.scheduler.get_last_lr())})

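            # step the optimizer every num_accum batches (or on the final
            # batch); scaler.step skips the update if the scaled gradients
            # contain inf/NaN, and scaler.update adjusts the loss scale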
            if ((idx + 1) % self.config.num_accum == 0) or (idx + 1 == num_batches):
                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.scheduler.step()
                self.optimizer.zero_grad()

            # update loop
            loop.set_postfix(loss=loss.item())
            self.train_step += 1
            wandb.log({"train_loss_per_batch": loss.item(), "train_step": self.train_step})
            train_loss += loss.item()

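            # during distillation, run a mid-epoch validation pass every 2500
            # batches (the epoch-level pass in fit() covers the final batch)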
            if distill:
                if ((idx + 1) % 2500 == 0) and not (idx + 1 == num_batches):
                    val_loss = self._test_epoch()
                    wandb.log({"val_loss": val_loss})
                    self.model.train()

        train_loss /= num_batches

        return train_loss

    def _test_epoch(self):
        """
        Method to perform one epoch of validation/testing.

        :return: Average validation/test loss for the epoch
        :rtype: float
        """

        loop = tqdm(self.valid_dl, leave=False)
        self.model.eval()

        num_batches = len(self.valid_dl)
        preds = []
        targets = []
        test_loss = 0

        with torch.no_grad():
            for xb, yb in loop:
                xb, yb = xb.to(self.device), yb.to(self.device)
                pred = self.model(xb)
                loss = self.loss_fn(pred, yb).item()
                self.test_step += 1
                wandb.log({"valid_loss_per_batch": loss, "test_step": self.test_step})
                test_loss += loss

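                # logits -> probabilities; assumes a binary/multi-label head
                # trained with a logit-based loss such as BCEWithLogitsLoss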
                pred = torch.sigmoid(pred)
                preds.extend(pred.cpu().numpy())
                targets.extend(yb.cpu().numpy())

        preds, targets = np.array(preds), np.array(targets)
        self.metrics.update(preds, targets)
        test_loss /= num_batches

        return test_loss
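

# Example usage (illustrative sketch): `cfg` stands for a configuration object
# with the fields referenced above (loss, optimizer, scheduler, metrics,
# epochs, num_accum, num_gpus, verbose, save_last_checkpoint); the DataLoaders
# and model are assumed to be built elsewhere.
#
#     learner = Learner(train_dl, valid_dl, model, cfg)
#     learner.fit(model_name="baseline")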


class KDLearner(Learner):
    """
    Knowledge Distillation Learner class for training a student model with knowledge distillation.

    :param train_dl: Train data loader
    :type train_dl: DataLoader
    :param valid_dl: Validation data loader
    :type valid_dl: DataLoader
    :param student_model: Student model to be trained
    :type student_model: nn.Module
    :param teacher: Teacher model for knowledge distillation
    :type teacher: nn.Module
    :param thresholds: Thresholds for HardDistillationLoss
    :type thresholds: List[float]
    :param config: Configuration object for training
    :type config: Config
    """

    def __init__(self, train_dl, valid_dl, student_model, teacher, thresholds, config):
        super().__init__(train_dl, valid_dl, student_model, config)

        self.teacher = nn.DataParallel(freeze(teacher).to(self.device))
        self.KDloss_fn = HardDistillationLoss(self.teacher, self.loss_fn, thresholds, self.device)
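        # AMP is effectively off during distillation: autocast is disabled in
        # _train_epoch via enabled=not distill, and this scaler is a no-op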
        self.scaler = torch.cuda.amp.GradScaler(enabled=False)

    def _train_epoch(self):
        """
        Method to perform one epoch of training with knowledge distillation.

        :return: Average training loss for the epoch
        :rtype: float
        """

        return super()._train_epoch(distill=True)
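

# Example usage (illustrative; the `thresholds` value and `num_labels` are
# assumptions here, passed straight through to HardDistillationLoss):
#
#     kd_learner = KDLearner(train_dl, valid_dl, student, teacher,
#                            thresholds=[0.5] * num_labels, config=cfg)
#     kd_learner.fit(model_name="student")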


class MetricTracker:
    """
    Metric Tracker class for tracking evaluation metrics during model validation.
    This class is used to track and display evaluation metrics during model validation.
    It keeps track of the results of the provided metric functions for each validation batch,
    and logs them to Weights & Biases using wandb.log(). The display() method can be used
    to print the tracked metric results, if verbose is set to True during initialization.

    :param metrics: List of metric functions to track
    :type metrics: List[Callable]
    :param verbose: Flag to indicate whether to print the results or not, defaults to True
    :type verbose: bool, optional
    """

    def __init__(self, metrics, verbose: bool = True):
        self.metrics_fn = [getattr(metrics_module, metric) for metric in metrics]
        self.verbose = verbose
        self.result = None

    def update(self, preds, targets):
        """
        Update the metric tracker with the latest predictions and targets.

        :param preds: Model predictions
        :type preds: torch.Tensor
        :param targets: Ground truth targets
        :type targets: torch.Tensor
        """

        self.result = {metric.__name__: metric(preds, targets) for metric in self.metrics_fn}
        wandb.log(self.result)

    def display(self):
        """Display the tracked metric results, if any have been recorded."""

        if self.result is None:
            return

        for k, v in self.result.items():
            print(f"{k}: {v:.2f}")


def get_preds(data: DataLoader, model: nn.Module, device: str = "cpu") -> Tuple[np.ndarray, np.ndarray]:
    """
    Get predictions and targets from a data loader and a PyTorch model.

    :param data: A PyTorch DataLoader containing the data to predict on.
    :type data: torch.utils.data.DataLoader
    :param model: A PyTorch model to use for predictions.
    :type model: torch.nn.Module
    :param device: The device to use for predictions (default is "cpu").
    :type device: str
    :raises TypeError: If any of the input arguments is of an incorrect type.
    :return: A tuple containing two NumPy arrays: the predictions and the targets.
    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
    """

    if not isinstance(data, DataLoader):
        raise TypeError("The 'data' argument must be a PyTorch DataLoader.")
    if not isinstance(model, nn.Module):
        raise TypeError("The 'model' argument must be a PyTorch model.")
    if not isinstance(device, str):
        raise TypeError("The 'device' argument must be a string.")

    loop = tqdm(data, leave=False)
    model = model.to(device)
    model.eval()

    preds = []
    targets = []

    with torch.no_grad():
        for xb, yb in loop:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            pred = torch.sigmoid(pred)
            preds.extend(pred.cpu().numpy())
            targets.extend(yb.cpu().numpy())

    preds, targets = np.array(preds), np.array(targets)

    return preds, targets
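

# Example usage (illustrative; the 0.5 decision threshold is an assumption
# for a sigmoid binary/multi-label head):
#
#     preds, targets = get_preds(valid_dl, model, device="cuda")
#     accuracy = ((preds > 0.5) == targets).mean()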