from abc import ABC, abstractmethod
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm

import modeling.loss as loss_module
import modeling.metrics as metrics_module
from modeling.loss import HardDistillationLoss
from modeling.models import freeze, layerwise_lr_decay
from modeling.utils import init_obj


class BaseLearner(ABC):
    """
    Abstract base class for a learner.

    :param train_dl: DataLoader for the training data
    :type train_dl: DataLoader
    :param valid_dl: DataLoader for the validation data
    :type valid_dl: DataLoader
    :param model: Model to be trained
    :type model: nn.Module
    :param config: Configuration object
    :type config: Any
    """

    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = model.to(self.device)
        self.config = config

    @abstractmethod
    def fit(self):
        """Abstract method for fitting the model."""

    @abstractmethod
    def _train_epoch(self):
        """Abstract method for training the model for one epoch."""

    @abstractmethod
    def _test_epoch(self):
        """Abstract method for testing the model for one epoch."""


class Learner(BaseLearner):
    def __init__(self, train_dl: DataLoader, valid_dl: DataLoader, model: nn.Module, config):
        """
        Concrete learner: wraps the model in DataParallel, builds the loss,
        optimizer, and scheduler from the config, and trains with mixed
        precision and gradient accumulation.

        :param train_dl: DataLoader for training data
        :type train_dl: DataLoader
        :param valid_dl: DataLoader for validation data
        :type valid_dl: DataLoader
        :param model: Model to be trained
        :type model: nn.Module
        :param config: Configuration object
        :type config: Any
        """

        super().__init__(train_dl, valid_dl, model, config)

        self.model = torch.nn.DataParallel(module=self.model, device_ids=list(range(config.num_gpus)))
        self.loss_fn = init_obj(self.config.loss, loss_module)
        params = layerwise_lr_decay(self.config, self.model)
        self.optimizer = init_obj(self.config.optimizer, optim, params)
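        # NOTE: the max_lr/epochs/steps_per_epoch kwargs below assume a
        # OneCycleLR-style scheduler in the config; steps_per_epoch counts
        # optimizer steps, which is fewer than batches when gradients are
        # accumulated over num_accum batches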
        self.scheduler = init_obj(
            self.config.scheduler,
            optim.lr_scheduler,
            self.optimizer,
            max_lr=[param["lr"] for param in params],
            epochs=self.config.epochs,
            steps_per_epoch=int(np.ceil(len(train_dl) / self.config.num_accum)),
        )

        self.verbose = self.config.verbose
        self.metrics = MetricTracker(self.config.metrics, self.verbose)
        self.scaler = torch.cuda.amp.GradScaler()

        self.train_step = 0
        self.test_step = 0

    def fit(self, model_name: str = "model"):
        """
        Method to train the model.

        :param model_name: Name of the model to be saved, defaults to "model"
        :type model_name: str, optional
        """

        loop = tqdm(range(self.config.epochs), leave=False)

        for epoch in loop:
            train_loss = self._train_epoch()
            val_loss = self._test_epoch()

            wandb.log({"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch + 1})

            if self.verbose:
                print(f"| EPOCH: {epoch+1} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f} |\n")
                self.metrics.display()

        if self.config.save_last_checkpoint:
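            # unwrap DataParallel (.module) so the checkpoint can later be
            # loaded into a plain nn.Module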
            torch.save(self.model.module.state_dict(), f"{model_name}.pth")

    def _train_epoch(self, distill: bool = False):
        """
        Method to perform one epoch of training.

        :param distill: Flag to indicate if knowledge distillation is used, defaults to False
        :type distill: bool, optional
        :return: Average training loss for the epoch
        :rtype: float
        """

        if distill:
            print("Distilling knowledge...", flush=True)

        loop = tqdm(self.train_dl, leave=False)
        self.model.train()

        num_batches = len(self.train_dl)
        train_loss = 0

        for idx, (xb, yb) in enumerate(loop):
            xb = xb.to(self.device)
            yb = yb.to(self.device)

            # forward
            with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=not distill):
                predictions = self.model(xb)

                if distill:
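                    # KDloss_fn is defined on KDLearner, the only subclass
                    # that calls this method with distill=True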
                    loss = self.KDloss_fn(xb, predictions, yb)
                else:
                    loss = self.loss_fn(predictions, yb)

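                # average the loss over the accumulation window so the summed
                # gradients match a single large-batch update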
                loss /= self.config.num_accum

            # backward
            self.scaler.scale(loss).backward()
            wandb.log({f"lr_param_group_{i}": lr for i, lr in enumerate(self.scheduler.get_last_lr())})

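            # step the optimizer every num_accum batches (or on the final
            # batch); scaler.step skips the update if the scaled gradients
            # contain inf/NaN, and scaler.update adjusts the loss scale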
            if ((idx + 1) % self.config.num_accum == 0) or (idx + 1 == num_batches):
                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.scheduler.step()
                self.optimizer.zero_grad()

            # update loop
            loop.set_postfix(loss=loss.item())
            self.train_step += 1
            wandb.log({"train_loss_per_batch": loss.item(), "train_step": self.train_step})
            train_loss += loss.item()

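            # during distillation, run a mid-epoch validation pass every 2500
            # batches (the epoch-level pass in fit() covers the final batch)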
            if distill:
                if ((idx + 1) % 2500 == 0) and not (idx + 1 == num_batches):
                    val_loss = self._test_epoch()
                    wandb.log({"val_loss": val_loss})
                    self.model.train()

        train_loss /= num_batches

        return train_loss

    def _test_epoch(self):
        """
        Method to perform one epoch of validation/testing.

        :return: Average validation/test loss for the epoch
        :rtype: float
        """

        loop = tqdm(self.valid_dl, leave=False)
        self.model.eval()

        num_batches = len(self.valid_dl)
        preds = []
        targets = []
        test_loss = 0

        with torch.no_grad():
            for xb, yb in loop:
                xb, yb = xb.to(self.device), yb.to(self.device)
                pred = self.model(xb)
                loss = self.loss_fn(pred, yb).item()
                self.test_step += 1
                wandb.log({"valid_loss_per_batch": loss, "test_step": self.test_step})
                test_loss += loss

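                # logits -> probabilities; assumes a binary/multi-label head
                # trained with a logit-based loss such as BCEWithLogitsLoss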
                pred = torch.sigmoid(pred)
                preds.extend(pred.cpu().numpy())
                targets.extend(yb.cpu().numpy())

        preds, targets = np.array(preds), np.array(targets)
        self.metrics.update(preds, targets)
        test_loss /= num_batches

        return test_loss
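

# Example usage (illustrative sketch): `cfg` stands for a configuration object
# with the fields referenced above (loss, optimizer, scheduler, metrics,
# epochs, num_accum, num_gpus, verbose, save_last_checkpoint); the DataLoaders
# and model are assumed to be built elsewhere.
#
#     learner = Learner(train_dl, valid_dl, model, cfg)
#     learner.fit(model_name="baseline")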


class KDLearner(Learner):
    """
    Knowledge Distillation Learner class for training a student model with knowledge distillation.

    :param train_dl: Train data loader
    :type train_dl: DataLoader
    :param valid_dl: Validation data loader
    :type valid_dl: DataLoader
    :param student_model: Student model to be trained
    :type student_model: nn.Module
    :param teacher: Teacher model for knowledge distillation
    :type teacher: nn.Module
    :param thresholds: Thresholds for HardDistillationLoss
    :type thresholds: List[float]
    :param config: Configuration object for training
    :type config: Config
    """

    def __init__(self, train_dl, valid_dl, student_model, teacher, thresholds, config):
        super().__init__(train_dl, valid_dl, student_model, config)

        self.teacher = nn.DataParallel(freeze(teacher).to(self.device))
        self.KDloss_fn = HardDistillationLoss(self.teacher, self.loss_fn, thresholds, self.device)
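        # AMP is effectively off during distillation: autocast is disabled in
        # _train_epoch via enabled=not distill, and this scaler is a no-op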
        self.scaler = torch.cuda.amp.GradScaler(enabled=False)

    def _train_epoch(self):
        """
        Method to perform one epoch of training with knowledge distillation.

        :return: Average training loss for the epoch
        :rtype: float
        """

        return super()._train_epoch(distill=True)
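

# Example usage (illustrative; the `thresholds` value and `num_labels` are
# assumptions here, passed straight through to HardDistillationLoss):
#
#     kd_learner = KDLearner(train_dl, valid_dl, student, teacher,
#                            thresholds=[0.5] * num_labels, config=cfg)
#     kd_learner.fit(model_name="student")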


class MetricTracker:
    """
    Metric Tracker class for tracking evaluation metrics during model validation.
    This class is used to track and display evaluation metrics during model validation.
    It keeps track of the results of the provided metric functions for each validation batch,
    and logs them to Weights & Biases using wandb.log(). The display() method can be used
    to print the tracked metric results, if verbose is set to True during initialization.

    :param metrics: List of metric functions to track
    :type metrics: List[Callable]
    :param verbose: Flag to indicate whether to print the results or not, defaults to True
    :type verbose: bool, optional
    """

    def __init__(self, metrics, verbose: bool = True):
        self.metrics_fn = [getattr(metrics_module, metric) for metric in metrics]
        self.verbose = verbose
        self.result = None

    def update(self, preds, targets):
        """
        Update the metric tracker with the latest predictions and targets.

        :param preds: Model predictions
        :type preds: torch.Tensor
        :param targets: Ground truth targets
        :type targets: torch.Tensor
        """

        self.result = {metric.__name__: metric(preds, targets) for metric in self.metrics_fn}
        wandb.log(self.result)

    def display(self):
        """Display the tracked metric results, if any have been recorded."""

        if self.result is None:
            return

        for k, v in self.result.items():
            print(f"{k}: {v:.2f}")


def get_preds(data: DataLoader, model: nn.Module, device: str = "cpu") -> Tuple[np.ndarray, np.ndarray]:
    """
    Get predictions and targets from a data loader and a PyTorch model.

    :param data: A PyTorch DataLoader containing the data to predict on.
    :type data: torch.utils.data.DataLoader
    :param model: A PyTorch model to use for predictions.
    :type model: torch.nn.Module
    :param device: The device to use for predictions (default is "cpu").
    :type device: str
    :raises TypeError: If any of the input arguments is of an incorrect type.
    :return: A tuple containing two NumPy arrays: the predictions and the targets.
    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
    """

    if not isinstance(data, DataLoader):
        raise TypeError("The 'data' argument must be a PyTorch DataLoader.")
    if not isinstance(model, nn.Module):
        raise TypeError("The 'model' argument must be a PyTorch model.")
    if not isinstance(device, str):
        raise TypeError("The 'device' argument must be a string.")

    loop = tqdm(data, leave=False)
    model = model.to(device)
    model.eval()

    preds = []
    targets = []

    with torch.no_grad():
        for xb, yb in loop:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            pred = torch.sigmoid(pred)
            preds.extend(pred.cpu().numpy())
            targets.extend(yb.cpu().numpy())

    preds, targets = np.array(preds), np.array(targets)

    return preds, targets
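

# Example usage (illustrative; the 0.5 decision threshold is an assumption
# for a sigmoid binary/multi-label head):
#
#     preds, targets = get_preds(valid_dl, model, device="cuda")
#     accuracy = ((preds > 0.5) == targets).mean()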