darshanjani committed
Commit 3a0062c
1 Parent(s): 2beb6cf

utils function for inference

Utilities/callbacks.py ADDED
@@ -0,0 +1,99 @@
1
+ import pytorch_lightning as pl
2
+
3
+ from . import config
4
+ from .utils import (
5
+ check_class_accuracy,
6
+ get_evaluation_bboxes,
7
+ mean_average_precision,
8
+ plot_couple_examples,
9
+ )
10
+
11
+ class PlotTestExamplesCallback(pl.Callback):
12
+ def __init__(self, every_n_epochs: int = 1) -> None:
13
+ super().__init__()
14
+ self.every_n_epochs = every_n_epochs
15
+
16
+ def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
17
+ if (trainer.current_epoch + 1) % self.every_n_epochs == 0:
18
+ plot_couple_examples(
19
+ model=pl_module,
20
+ loader=trainer.datamodule.test_dataloader(),
21
+ thresh=0.6,
22
+ iou_thresh=0.5,
23
+ anchors=pl_module.scaled_anchors
24
+ )
25
+
26
+ class CheckClassAccuracyCallback(pl.Callback):
27
+ def __init__(
28
+ self, train_every_n_epochs: int = 1, test_every_n_epochs: int = 3
29
+ ) -> None:
30
+ super().__init__()
31
+ self.train_every_n_epochs = train_every_n_epochs
32
+ self.test_every_n_epochs = test_every_n_epochs
33
+
34
+ def on_train_epoch_end(
35
+ self, trainer: pl.Trainer, pl_module: pl.LightningModule
36
+ ) -> None:
37
+ if (trainer.current_epoch + 1) % self.train_every_n_epochs == 0:
38
+ print("+++ TRAIN ACCURACIES")
39
+ class_acc, no_obj_acc, obj_acc = check_class_accuracy(
40
+ model=pl_module,
41
+ loader=trainer.datamodule.train_dataloader(),
42
+ threshold=config.CONF_THRESHOLD,
43
+ )
44
+ pl_module.log_dict(
45
+ {
46
+ "train_class_acc": class_acc,
47
+ "train_no_obj_acc": no_obj_acc,
48
+ "train_obj_acc": obj_acc,
49
+ },
50
+ logger=True,
51
+ )
52
+
53
+ if (trainer.current_epoch + 1) % self.test_every_n_epochs == 0:
54
+ print("+++ TEST ACCURACIES")
55
+ class_acc, no_obj_acc, obj_acc = check_class_accuracy(
56
+ model=pl_module,
57
+ loader=trainer.datamodule.test_dataloader(),
58
+ threshold=config.CONF_THRESHOLD,
59
+ )
60
+ pl_module.log_dict(
61
+ {
62
+ "test_class_acc": class_acc,
63
+ "test_no_obj_acc": no_obj_acc,
64
+ "test_obj_acc": obj_acc,
65
+ },
66
+ logger=True,
67
+ )
68
+ class MAPCallback(pl.Callback):
69
+ def __init__(self, every_n_epochs: int = 3) -> None:
70
+ super().__init__()
71
+ self.every_n_epochs = every_n_epochs
72
+
73
+ def on_train_epoch_end(
74
+ self, trainer: pl.Trainer, pl_module: pl.LightningModule
75
+ ) -> None:
76
+ if (trainer.current_epoch + 1) % self.every_n_epochs == 0:
77
+ pred_boxes, true_boxes = get_evaluation_bboxes(
78
+ loader=trainer.datamodule.test_dataloader(),
79
+ model=pl_module,
80
+ iou_threshold=config.NMS_IOU_THRESH,
81
+ anchors=config.ANCHORS,
82
+ threshold=config.CONF_THRESHOLD,
83
+ device=config.DEVICE,
84
+ )
85
+
86
+ map_val = mean_average_precision(
87
+ pred_boxes=pred_boxes,
88
+ true_boxes=true_boxes,
89
+ iou_threshold=config.MAP_IOU_THRESH,
90
+ box_format="midpoint",
91
+ num_classes=config.NUM_CLASSES,
92
+ )
93
+ print("+++ MAP: ", map_val.item())
94
+ pl_module.log(
95
+ "MAP",
96
+ map_val.item(),
97
+ logger=True,
98
+ )
99
+ pl_module.train()
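
All three callbacks hook on_train_epoch_end and read trainer.datamodule, so they only do something when Trainer.fit is called with a LightningDataModule. A minimal sketch of assembling them (the intervals mirror the constructor defaults above; nothing here is prescribed by the commit itself):

from Utilities.callbacks import (
    CheckClassAccuracyCallback,
    MAPCallback,
    PlotTestExamplesCallback,
)

# Intervals mirror the defaults defined above; tune as needed.
callbacks = [
    PlotTestExamplesCallback(every_n_epochs=1),
    CheckClassAccuracyCallback(train_every_n_epochs=1, test_every_n_epochs=3),
    MAPCallback(every_n_epochs=3),
]
# Pass callbacks=callbacks to pl.Trainer(...) together with a datamodule,
# since each callback calls trainer.datamodule.*_dataloader().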
Utilities/config.py ADDED
@@ -0,0 +1,148 @@
1
+ import os
2
+
3
+ import torch
4
+
5
+ MAIN_DIR = "/kaggle/working/"
6
+ # DATASET = os.path.join(MAIN_DIR, "../data/PASCAL_VOC")
7
+ DATASET = "/kaggle/input/pascal-voc-dataset-used-in-yolov3-video/PASCAL_VOC"
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+ # DEVICE = "mps"
10
+ # seed_everything() # If you want deterministic behavior
11
+ NUM_WORKERS = 2
12
+ BATCH_SIZE = 40
13
+ IMAGE_SIZE = 416
14
+ INPUT_RESOLUTIONS = [416, 544]
15
+ INPUT_RESOLUTIONS_CUM_PROBS = [50, 100]
16
+ NUM_CLASSES = 20
17
+ LEARNING_RATE = 1e-5
18
+ WEIGHT_DECAY = 1e-4
19
+ NUM_EPOCHS = 40
20
+ CONF_THRESHOLD = 0.05
21
+ MAP_IOU_THRESH = 0.5
22
+ NMS_IOU_THRESH = 0.45
23
+ S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
24
+ PIN_MEMORY = True
25
+ LOAD_MODEL = False
26
+ SAVE_MODEL = True
27
+ CHECKPOINT_PATH = os.path.join(MAIN_DIR, "Store/checkpoints/")
28
+ IMG_DIR = DATASET + "/images/"
29
+ LABEL_DIR = DATASET + "/labels/"
30
+ TRAIN_MOSAIC_PERCENTAGE = 0.5
31
+ TEST_MOSAIC_PERCENTAGE = 0.00
32
+ MODEL_STATE_DICT_PATH = os.path.join(MAIN_DIR, "Store/checkpoints/yolov3.pth")
33
+
34
+ ANCHORS = [
35
+ [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
36
+ [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
37
+ [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
38
+ ] # Note these have been rescaled to be between [0, 1]
39
+
40
+ means = [0.485, 0.456, 0.406]
41
+
42
+ scale = 1.1
43
+
44
+ PASCAL_CLASSES = [
45
+ "aeroplane",
46
+ "bicycle",
47
+ "bird",
48
+ "boat",
49
+ "bottle",
50
+ "bus",
51
+ "car",
52
+ "cat",
53
+ "chair",
54
+ "cow",
55
+ "diningtable",
56
+ "dog",
57
+ "horse",
58
+ "motorbike",
59
+ "person",
60
+ "pottedplant",
61
+ "sheep",
62
+ "sofa",
63
+ "train",
64
+ "tvmonitor",
65
+ ]
66
+
67
+ COCO_LABELS = [
68
+ "person",
69
+ "bicycle",
70
+ "car",
71
+ "motorcycle",
72
+ "airplane",
73
+ "bus",
74
+ "train",
75
+ "truck",
76
+ "boat",
77
+ "traffic light",
78
+ "fire hydrant",
79
+ "stop sign",
80
+ "parking meter",
81
+ "bench",
82
+ "bird",
83
+ "cat",
84
+ "dog",
85
+ "horse",
86
+ "sheep",
87
+ "cow",
88
+ "elephant",
89
+ "bear",
90
+ "zebra",
91
+ "giraffe",
92
+ "backpack",
93
+ "umbrella",
94
+ "handbag",
95
+ "tie",
96
+ "suitcase",
97
+ "frisbee",
98
+ "skis",
99
+ "snowboard",
100
+ "sports ball",
101
+ "kite",
102
+ "baseball bat",
103
+ "baseball glove",
104
+ "skateboard",
105
+ "surfboard",
106
+ "tennis racket",
107
+ "bottle",
108
+ "wine glass",
109
+ "cup",
110
+ "fork",
111
+ "knife",
112
+ "spoon",
113
+ "bowl",
114
+ "banana",
115
+ "apple",
116
+ "sandwich",
117
+ "orange",
118
+ "broccoli",
119
+ "carrot",
120
+ "hot dog",
121
+ "pizza",
122
+ "donut",
123
+ "cake",
124
+ "chair",
125
+ "couch",
126
+ "potted plant",
127
+ "bed",
128
+ "dining table",
129
+ "toilet",
130
+ "tv",
131
+ "laptop",
132
+ "mouse",
133
+ "remote",
134
+ "keyboard",
135
+ "cell phone",
136
+ "microwave",
137
+ "oven",
138
+ "toaster",
139
+ "sink",
140
+ "refrigerator",
141
+ "book",
142
+ "clock",
143
+ "vase",
144
+ "scissors",
145
+ "teddy bear",
146
+ "hair drier",
147
+ "toothbrush",
148
+ ]
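
The ANCHORS above are normalized to [0, 1] relative to the image; before use they are multiplied by the grid sizes in S (13, 26, 52 for a 416 input) to get anchors in grid-cell units, exactly as model.py does for scaled_anchors. A small illustrative sketch:

import torch

from Utilities import config

# (3 scales, 3 anchors, 2) * broadcast grid sizes -> anchors in grid-cell units
scaled_anchors = torch.tensor(config.ANCHORS) * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
print(scaled_anchors.shape)  # torch.Size([3, 3, 2])
# e.g. first anchor of the 13x13 scale: (0.28, 0.22) * 13 -> (3.64, 2.86)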
Utilities/dataset.py ADDED
@@ -0,0 +1,298 @@
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytorch_lightning as pl
7
+ import torch
8
+ from PIL import Image, ImageFile
9
+ from torch.utils.data import DataLoader, Dataset
10
+ from torchvision.transforms import Resize
11
+
12
+ from . import config, transforms
13
+ from .utils import cells_to_bboxes
14
+ from .utils import iou_width_height as iou
15
+ from .utils import non_max_suppression as nms
16
+ from .utils import plot_image, xyxy2xywhn, xywhn2xyxy
17
+
18
+
19
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
20
+
21
+ class YOLODataset(Dataset):
22
+ def __init__(
23
+ self,
24
+ csv_file,
25
+ img_dir,
26
+ label_dir,
27
+ anchors,
28
+ image_size=416,
29
+ S=[13, 26, 52],
30
+ C=20,
31
+ transform=None,
32
+ mosaic_percentage=0.67,
33
+ ):
34
+ self.annotations = pd.read_csv(csv_file)
35
+ self.img_dir = img_dir
36
+ self.label_dir = label_dir
37
+ self.image_size = image_size
38
+ self.mosaic_border = [image_size // 2, image_size // 2]
39
+ self.transform = transform
40
+ self.S = S
41
+ self.anchors = torch.tensor(
42
+ anchors[0] + anchors[1] + anchors[2]
43
+ ) # for all 3 scales
44
+ self.num_anchors = self.anchors.shape[0]
45
+ self.num_anchors_per_scale = self.num_anchors // 3
46
+ self.C = C
47
+ self.ignore_iou_thresh = 0.5
48
+ self.mosaic_percentage = mosaic_percentage
49
+
50
+ def __len__(self):
51
+ return len(self.annotations)
52
+
53
+ def load_mosaic(self, index):
54
+ # YOLOv5 4-mosaic loader. Loads 1 image + 3 random images into a 4-image mosaic
55
+ labels4 = []
56
+ s = self.image_size
57
+ yc, xc = (
58
+ int(random.uniform(x, 2 * s - x)) for x in self.mosaic_border
59
+ ) # mosaic center x, y
60
+ indices = [index] + random.choices(
61
+ range(len(self)), k=3
62
+ ) # 3 additional image indices
63
+ random.shuffle(indices)
64
+ for i, index in enumerate(indices):
65
+ # Load image
66
+ label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
67
+ bboxes = np.roll(
68
+ np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1
69
+ ).tolist()
70
+ img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
71
+ img = np.array(Image.open(img_path).convert("RGB"))
72
+
73
+ h, w = img.shape[0], img.shape[1]
74
+ labels = np.array(bboxes)
75
+
76
+ # place img in img4
77
+ if i == 0: # top left
78
+ img4 = np.full(
79
+ (s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8
80
+ ) # base image with 4 tiles
81
+ x1a, y1a, x2a, y2a = (
82
+ max(xc - w, 0),
83
+ max(yc - h, 0),
84
+ xc,
85
+ yc,
86
+ ) # xmin, ymin, xmax, ymax (large image)
87
+ x1b, y1b, x2b, y2b = (
88
+ w - (x2a - x1a),
89
+ h - (y2a - y1a),
90
+ w,
91
+ h,
92
+ ) # xmin, ymin, xmax, ymax (small image)
93
+ elif i == 1: # top right
94
+ x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
95
+ x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
96
+ elif i == 2: # bottom left
97
+ x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
98
+ x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
99
+ elif i == 3: # bottom right
100
+ x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
101
+ x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
102
+
103
+ img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax]
104
+ padw = x1a - x1b
105
+ padh = y1a - y1b
106
+
107
+ # Labels
108
+ if labels.size:
109
+ labels[:, :-1] = xywhn2xyxy(
110
+ labels[:, :-1], w, h, padw, padh
111
+ ) # normalized xywh to pixel xyxy format
112
+ labels4.append(labels)
113
+
114
+ # Concat/clip labels
115
+ labels4 = np.concatenate(labels4, 0)
116
+ for x in (labels4[:, :-1],):
117
+ np.clip(x, 0, 2 * s, out=x) # clip when using random_perspective()
118
+ # img4, labels4 = replicate(img4, labels4) # replicate
119
+ labels4[:, :-1] = xyxy2xywhn(labels4[:, :-1], 2 * s, 2 * s)
120
+ labels4[:, :-1] = np.clip(labels4[:, :-1], 0, 1)
121
+ labels4 = labels4[labels4[:, 2] > 0]
122
+ labels4 = labels4[labels4[:, 3] > 0]
123
+ return img4, labels4
124
+
125
+ def load_single_img(self, index):
126
+ label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
127
+ bboxes = np.roll(
128
+ np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1
129
+ ).tolist()
130
+ img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
131
+ image = np.array(Image.open(img_path).convert("RGB"))
132
+ return image, bboxes
133
+
134
+ def __getitem__(self, index):
135
+ if random.random() < self.mosaic_percentage:
136
+ image, bboxes = self.load_mosaic(index)
137
+ else:
138
+ image, bboxes = self.load_single_img(index)
139
+
140
+ if self.transform:
141
+ augmentations = self.transform(image=image, bboxes=bboxes)
142
+ image = augmentations["image"]
143
+ bboxes = augmentations["bboxes"]
144
+
145
+ # e.g. = (3, 13, 13, 6), (3, 26, 26, 6), (3, 52, 52, 6) || 6 = [x, y, w, h, obj, class] for each anchor box
146
+ targets = [torch.zeros((self.num_anchors // 3, S, S, 6)) for S in self.S]
147
+ for box in bboxes:
148
+ iou_anchors = iou(torch.tensor(box[2:4]), self.anchors)
149
+ anchor_indices = iou_anchors.argsort(descending=True, dim=0)
150
+ x, y, width, height, class_label = box
151
+ has_anchor = [False] * 3 # each scale should have one anchor
152
+ for anchor_idx in anchor_indices:
153
+ scale_idx = anchor_idx // self.num_anchors_per_scale
154
+ anchor_on_scale = anchor_idx % self.num_anchors_per_scale
155
+ S = self.S[scale_idx]
156
+ i, j = int(S * y), int(S * x) # which cell
157
+ anchor_taken = targets[scale_idx][anchor_on_scale, i, j, 0]
158
+ if not anchor_taken and not has_anchor[scale_idx]:
159
+ targets[scale_idx][anchor_on_scale, i, j, 0] = 1
160
+ x_cell, y_cell = S * x - j, S * y - i # both between [0,1]
161
+ width_cell, height_cell = (
162
+ width * S,
163
+ height * S,
164
+ ) # can be greater than 1 since it's relative to cell
165
+ box_coordinates = torch.tensor(
166
+ [x_cell, y_cell, width_cell, height_cell]
167
+ )
168
+ targets[scale_idx][anchor_on_scale, i, j, 1:5] = box_coordinates
169
+ targets[scale_idx][anchor_on_scale, i, j, 5] = int(class_label)
170
+ has_anchor[scale_idx] = True
171
+
172
+ elif (
173
+ not anchor_taken
174
+ and iou_anchors[anchor_idx] > self.ignore_iou_thresh
175
+ ):
176
+ targets[scale_idx][
177
+ anchor_on_scale, i, j, 0
178
+ ] = -1 # ignore prediction
179
+
180
+ return image, tuple(targets)
181
+
182
+ class YOLODataModule(pl.LightningDataModule):
183
+ def __init__(self, train_csv_path, test_csv_path):
184
+ super().__init__()
185
+ self.train_csv_path = train_csv_path
186
+ self.test_csv_path = test_csv_path
187
+ self.train_dataset = None
188
+ self.eval_dataset = None
189
+ self.test_dataset = None
190
+
191
+ def setup(self, stage=None):
192
+ self.train_dataset = YOLODataset(
193
+ self.train_csv_path,
194
+ transform=transforms.train_transforms,
195
+ S=[
196
+ config.IMAGE_SIZE // 32,
197
+ config.IMAGE_SIZE // 16,
198
+ config.IMAGE_SIZE // 8
199
+ ],
200
+ img_dir=config.IMG_DIR,
201
+ label_dir=config.LABEL_DIR,
202
+ anchors=config.ANCHORS,
203
+ mosaic_percentage=config.TRAIN_MOSAIC_PERCENTAGE
204
+ )
205
+
206
+ self.eval_dataset = YOLODataset(
207
+ self.train_csv_path,
208
+ transform=transforms.test_transforms,
209
+ S=[
210
+ config.IMAGE_SIZE // 32,
211
+ config.IMAGE_SIZE // 16,
212
+ config.IMAGE_SIZE // 8
213
+ ],
214
+ img_dir=config.IMG_DIR,
215
+ label_dir=config.LABEL_DIR,
216
+ anchors=config.ANCHORS,
217
+ mosaic_percentage=config.TRAIN_MOSAIC_PERCENTAGE # should be 0?
218
+ )
219
+
220
+ self.test_dataset = YOLODataset(
221
+ self.test_csv_path,
222
+ transform=transforms.test_transforms,
223
+ S=[
224
+ config.IMAGE_SIZE // 32,
225
+ config.IMAGE_SIZE // 16,
226
+ config.IMAGE_SIZE // 8
227
+ ],
228
+ img_dir=config.IMG_DIR,
229
+ label_dir=config.LABEL_DIR,
230
+ anchors=config.ANCHORS,
231
+ mosaic_percentage=config.TEST_MOSAIC_PERCENTAGE
232
+ )
233
+
234
+ def train_dataloader(self):
235
+ return DataLoader(
236
+ dataset=self.train_dataset,
237
+ batch_size=config.BATCH_SIZE,
238
+ shuffle=True,
239
+ num_workers=config.NUM_WORKERS,
240
+ pin_memory=config.PIN_MEMORY,
241
+ drop_last=False
242
+ )
243
+ def val_dataloader(self):
244
+ return DataLoader(
245
+ dataset=self.eval_dataset,
246
+ batch_size=config.BATCH_SIZE,
247
+ shuffle=False,
248
+ num_workers=config.NUM_WORKERS,
249
+ pin_memory=config.PIN_MEMORY,
250
+ drop_last=False
251
+ )
252
+ def test_dataloader(self):
253
+ return DataLoader(
254
+ dataset=self.test_dataset,
255
+ batch_size=config.BATCH_SIZE,
256
+ shuffle=False,
257
+ num_workers=config.NUM_WORKERS,
258
+ pin_memory=config.PIN_MEMORY,
259
+ drop_last=False
260
+ )
261
+
262
+ def test():
263
+
264
+ anchors = config.ANCHORS
265
+
266
+ transform = transforms.test_transforms
267
+
268
+ dataset = YOLODataset(
269
+ "../data/PASCAL_VOC/2examples.csv",
270
+ "../data/PASCAL_VOC/images",
271
+ "../data/PASCAL_VOC/labels",
272
+ S=[13, 26, 52],
273
+ anchors=anchors,
274
+ transform=transform
275
+ )
276
+
277
+ S = [13, 26, 52]
278
+ scaled_anchors = torch.tensor(anchors) / (
279
+ 1 / torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
280
+ )
281
+
282
+ loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
283
+ for x, y in loader:
284
+ boxes = []
285
+
286
+ for i in range(y[0].shape[1]):
287
+ anchor = scaled_anchors[i]
288
+ print(anchor.shape)
289
+ print(y[i].shape)
290
+ boxes += cells_to_bboxes(
291
+ y[i], is_preds=False, S=y[i].shape[2], anchors=anchor
292
+ )[0]
293
+ boxes = nms(boxes, iou_threshold=1, threshold=0.7, box_format="midpoint")
294
+ print(boxes)
295
+ plot_image(x[0].permute(1, 2, 0).to("cpu"), boxes)
296
+
297
+ if __name__ == "__main__":
298
+ test()
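
A minimal sketch of driving the datamodule by hand; the CSV file names are assumptions (they are not fixed anywhere in this commit), and it assumes transforms.train_transforms is defined (see the note after transforms.py below):

from Utilities import config
from Utilities.dataset import YOLODataModule

# "train.csv" / "test.csv" are assumed names for the annotation index files
datamodule = YOLODataModule(
    train_csv_path=config.DATASET + "/train.csv",
    test_csv_path=config.DATASET + "/test.csv",
)
datamodule.setup()

x, (y0, y1, y2) = next(iter(datamodule.train_dataloader()))
print(x.shape)   # (BATCH_SIZE, 3, 416, 416)
print(y0.shape)  # (BATCH_SIZE, 3, 13, 13, 6) -> [obj, x, y, w, h, class] per anchor
print(y1.shape)  # (BATCH_SIZE, 3, 26, 26, 6)
print(y2.shape)  # (BATCH_SIZE, 3, 52, 52, 6)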
Utilities/loss.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ Implementation of the YOLO loss function, similar to the one in the YOLOv3 paper;
3
+ the main difference, as far as I can tell, is that CrossEntropy is used for the
4
+ classes instead of BinaryCrossEntropy.
5
+ """
6
+
7
+ import random
8
+
9
+ import pytorch_lightning as pl
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+ from .utils import intersection_over_union
14
+
15
+ class YoloLoss(pl.LightningModule):
16
+ def __init__(self):
17
+ super().__init__()
18
+ self.mse = nn.MSELoss()
19
+ self.bce = nn.BCEWithLogitsLoss()
20
+ self.entropy = nn.CrossEntropyLoss()
21
+ self.sigmoid = nn.Sigmoid()
22
+
23
+ # constants for the loss function
24
+ self.lambda_class = 1
25
+ self.lambda_noobj = 5
26
+ self.lambda_obj = 1
27
+ self.lambda_box = 1
28
+
29
+ def forward(self, predictions, target, anchors):
30
+ # Check where obj and noobj (we ignore if target == -1)
31
+ obj = target[..., 0] == 1
32
+ noobj = target[..., 0] == 0
33
+
34
+ # ======================= #
35
+ # FOR NO OBJECT LOSS #
36
+ # ======================= #
37
+
38
+ no_object_loss = self.bce(
39
+ (predictions[..., 0:1][noobj]),
40
+ (target[..., 0:1][noobj])
41
+ )
42
+
43
+ # ==================== #
44
+ # FOR OBJECT LOSS #
45
+ # ==================== #
46
+
47
+ anchors = anchors.reshape(1, 3, 1, 1, 2)
48
+
49
+ box_preds = torch.cat(
50
+ [
51
+ self.sigmoid(predictions[..., 1:3]),
52
+ torch.exp(predictions[..., 3:5]) * anchors,
53
+ ],
54
+ dim=-1,
55
+ )
56
+
57
+ ious = intersection_over_union(box_preds[obj], target[..., 1:5][obj]).detach()
58
+
59
+ object_loss = self.mse(
60
+ self.sigmoid(predictions[..., 0:1][obj]), ious * target[..., 0:1][obj]
61
+ )
62
+
63
+ # ======================== #
64
+ # FOR BOX COORDINATES #
65
+ # ======================== #
66
+
67
+ predictions[..., 1:3] = self.sigmoid(predictions[..., 1:3]) # x,y coordinates
68
+ target[..., 3:5] = torch.log(
69
+ (1e-16 + target[..., 3:5] / anchors)
70
+ ) # width, height coordinates
71
+ box_loss = self.mse(predictions[..., 1:5][obj], target[..., 1:5][obj])
72
+
73
+ # ================== #
74
+ # FOR CLASS LOSS #
75
+ # ================== #
76
+
77
+ class_loss = self.entropy(
78
+ (predictions[..., 5:][obj]),
79
+ (target[..., 5][obj].long()),
80
+ )
81
+
82
+ # print("__________________________________")
83
+ # print(self.lambda_box * box_loss)
84
+ # print(self.lambda_obj * object_loss)
85
+ # print(self.lambda_noobj * no_object_loss)
86
+ # print(self.lambda_class * class_loss)
87
+ # print("\n")
88
+
89
+ return (
90
+ self.lambda_box * box_loss
91
+ + self.lambda_obj * object_loss
92
+ + self.lambda_noobj * no_object_loss
93
+ + self.lambda_class * class_loss
94
+ )
95
+
96
+
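
A shape-level sketch of calling the loss for a single scale with dummy tensors (the values are random and only illustrate the expected layouts: predictions are raw network outputs, targets follow the [obj, x, y, w, h, class] layout built in dataset.py, anchors are in cell units):

import torch

from Utilities import config
from Utilities.loss import YoloLoss

loss_fn = YoloLoss()

N, A, S = 2, 3, 13
preds = torch.randn(N, A, S, S, config.NUM_CLASSES + 5)        # raw outputs for one scale
targets = torch.zeros(N, A, S, S, 6)                            # [obj, x, y, w, h, class]
targets[0, 0, 6, 6, 0] = 1.0                                    # one object in cell (6, 6)
targets[0, 0, 6, 6, 1:5] = torch.tensor([0.5, 0.5, 1.2, 1.5])   # offsets and w, h in cell units
targets[0, 0, 6, 6, 5] = 3.0                                    # class index

scaled_anchors = torch.tensor(config.ANCHORS[0]) * S            # anchors for this scale
loss = loss_fn(preds, targets, scaled_anchors)
print(loss.item())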
Utilities/model.py ADDED
@@ -0,0 +1,270 @@
1
+ """
2
+ Implementation of YOLOv3 architecture
3
+ """
4
+
5
+ import pytorch_lightning as pl
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.optim as optim
9
+ from torch.optim.lr_scheduler import OneCycleLR
10
+
11
+
12
+ from . import config
13
+ from .loss import YoloLoss
14
+
15
+ model_config = [
16
+ (32, 3, 1),
17
+ (64, 3, 2),
18
+ ["B", 1],
19
+ (128, 3, 2),
20
+ ["B", 2],
21
+ (256, 3, 2),
22
+ ["B", 8],
23
+ (512, 3, 2),
24
+ ["B", 8],
25
+ (1024, 3, 2),
26
+ ["B", 4], # darknet 53 ends here
27
+
28
+ (512, 1, 1),
29
+ (1024, 3, 1),
30
+ "S",
31
+
32
+ (256, 1, 1),
33
+ "U",
34
+ (256, 1, 1),
35
+ (512, 3, 1),
36
+ "S",
37
+
38
+ (128, 1, 1),
39
+ "U",
40
+ (128, 1, 1),
41
+ (256, 3, 1),
42
+ "S"
43
+ ]
44
+
45
+ class CNNBlock(pl.LightningModule):
46
+ def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
47
+ super().__init__()
48
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
49
+ self.bn = nn.BatchNorm2d(out_channels)
50
+ self.leaky = nn.LeakyReLU(0.1)
51
+ self.use_bn_act = bn_act
52
+
53
+ def forward(self, x):
54
+ if self.use_bn_act:
55
+ return self.leaky(self.bn((self.conv(x))))
56
+ else:
57
+ return self.conv(x)
58
+
59
+ class ResidualBlock(pl.LightningModule):
60
+ def __init__(self, channels, use_residual=True, num_repeats=1):
61
+ super().__init__()
62
+ self.layers = nn.ModuleList()
63
+ for repeat in range(num_repeats):
64
+ self.layers += [
65
+ nn.Sequential(
66
+ CNNBlock(channels, channels//2, kernel_size=1),
67
+ CNNBlock(channels//2, channels, kernel_size=3, padding=1)
68
+ )
69
+ ]
70
+ self.use_residual = use_residual
71
+ self.num_repeats = num_repeats
72
+
73
+ def forward(self, x):
74
+ for layer in self.layers:
75
+ if self.use_residual:
76
+ x = x + layer(x)
77
+ else:
78
+ x = layer(x)
79
+
80
+ return x
81
+
82
+ class ScalePrediction(pl.LightningModule):
83
+ def __init__(self, in_channels, num_classes):
84
+ super().__init__()
85
+ self.pred = nn.Sequential(
86
+ CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
87
+ CNNBlock(2 * in_channels, (num_classes + 5) * 3, kernel_size=1, bn_act=False)
88
+ )
89
+ self.num_classes = num_classes
90
+
91
+ def forward(self, x):
92
+ return (
93
+ self.pred(x).
94
+ reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3]).
95
+ permute(0, 1, 3, 4, 2)
96
+ )
97
+
98
+ class YOLOv3(pl.LightningModule):
99
+ def __init__(self, in_channels=3, num_classes=20):
100
+ super().__init__()
101
+ self.num_classes = num_classes
102
+ self.in_channels = in_channels
103
+ self.layers = self._create_conv_layers()
104
+
105
+ self.scaled_anchors = (
106
+ torch.tensor(config.ANCHORS) * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2) # ?
107
+ ).to(config.DEVICE)
108
+
109
+ self.learning_rate = config.LEARNING_RATE
110
+ self.weight_decay = config.WEIGHT_DECAY
111
+ self.best_lr = 1e-3 ## ?
112
+
113
+ def forward(self, x): # ?
114
+ outputs = [] # for each scale
115
+ route_connections = []
116
+ for layer in self.layers:
117
+ if isinstance(layer, ScalePrediction):
118
+ outputs.append(layer(x))
119
+ continue
120
+
121
+ x = layer(x)
122
+
123
+ if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
124
+ route_connections.append(x)
125
+ elif isinstance(layer, nn.Upsample):
126
+ x = torch.cat([x, route_connections[-1]], dim=1)
127
+ route_connections.pop()
128
+
129
+ return outputs
130
+
131
+ def _create_conv_layers(self):
132
+ layers = nn.ModuleList()
133
+ in_channels = self.in_channels
134
+
135
+ for module in model_config:
136
+ if isinstance(module, tuple):
137
+ out_channels, kernel_size, stride = module
138
+ layers.append(
139
+ CNNBlock(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=1 if kernel_size==3 else 0)
140
+ )
141
+ in_channels = out_channels
142
+
143
+ elif isinstance(module, list):
144
+ num_repeats = module[1]
145
+ layers.append(
146
+ ResidualBlock(in_channels, num_repeats=num_repeats)
147
+ )
148
+ elif isinstance(module, str):
149
+ if module == "S":
150
+ layers += [
151
+ ResidualBlock(in_channels, use_residual=False, num_repeats=1),
152
+ CNNBlock(in_channels, in_channels//2, kernel_size=1),
153
+ ScalePrediction(in_channels//2, num_classes=self.num_classes)
154
+ ]
155
+ in_channels = in_channels // 2
156
+
157
+ elif module == "U":
158
+ layers.append(nn.Upsample(scale_factor=2))
159
+ in_channels = in_channels * 3
160
+
161
+ return layers
162
+
163
+
164
+ def yololoss(self):
165
+ return YoloLoss()
166
+
167
+ def training_step(self, batch, batch_idx):
168
+ x, y = batch
169
+ y0, y1, y2 = y[0], y[1], y[2]
170
+ out = self.forward(x)
171
+ # print(out[0].shape, y0.shape)
172
+
173
+ loss = ( # ?
174
+ self.yololoss()(out[0], y0, self.scaled_anchors[0])
175
+ + self.yololoss()(out[1], y1, self.scaled_anchors[1])
176
+ + self.yololoss()(out[2], y2, self.scaled_anchors[2])
177
+ )
178
+
179
+ self.log(
180
+ "train_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True
181
+ )
182
+ return loss
183
+
184
+ def test_step(self, batch, batch_idx):
185
+ x, y = batch
186
+ y0, y1, y2 = y[0], y[1], y[2]
187
+ out = self.forward(x)
188
+
189
+ loss = (
190
+ self.yololoss()(out[0], y0, self.scaled_anchors[0])
191
+ + self.yololoss()(out[1], y1, self.scaled_anchors[1])
192
+ + self.yololoss()(out[2], y2, self.scaled_anchors[2])
193
+ )
194
+
195
+ self.log(
196
+ "test_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True
197
+ )
198
+
199
+ return loss
200
+
201
+ def on_train_epoch_end(self) -> None:
202
+ print(
203
+ f"Epoch: {self.current_epoch}, Loss: {self.trainer.callback_metrics['train_loss_epoch']}"
204
+ )
205
+
206
+ def on_test_epoch_end(self) -> None:
207
+ print(
208
+ f"Epoch: {self.current_epoch}, Loss: {self.trainer.callback_metrics['test_loss_epoch']}"
209
+ )
210
+
211
+ def configure_optimizers(self):
212
+ optimizer = optim.Adam(
213
+ self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
214
+ )
215
+
216
+ scheduler = OneCycleLR(
217
+ optimizer,
218
+ max_lr=self.best_lr,
219
+ steps_per_epoch=len(self.trainer.datamodule.train_dataloader()),
220
+ epochs=config.NUM_EPOCHS,
221
+ pct_start=8 / config.NUM_EPOCHS,
222
+ div_factor=100,
223
+ three_phase=False,
224
+ final_div_factor=100,
225
+ anneal_strategy="linear"
226
+ )
227
+
228
+ return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]
229
+
230
+ def on_train_end(self) -> None:
231
+ torch.save(self.state_dict(), config.MODEL_STATE_DICT_PATH)
232
+
233
+ if __name__ == "__main__":
234
+ num_classes = 20
235
+ IMAGE_SIZE = 416
236
+ model = YOLOv3(num_classes=num_classes)
237
+ x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
238
+ out = model(x)
239
+ assert model(x)[0].shape == (
240
+ 2,
241
+ 3,
242
+ IMAGE_SIZE // 32,
243
+ IMAGE_SIZE // 32,
244
+ num_classes + 5
245
+ )
246
+ assert model(x)[1].shape == (
247
+ 2,
248
+ 3,
249
+ IMAGE_SIZE // 16,
250
+ IMAGE_SIZE // 16,
251
+ num_classes + 5
252
+ )
253
+ assert model(x)[2].shape == (
254
+ 2,
255
+ 3,
256
+ IMAGE_SIZE // 8,
257
+ IMAGE_SIZE // 8,
258
+ num_classes + 5
259
+ )
260
+ print("Image size compatibility check passed!")
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
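
A minimal end-to-end training sketch, assuming the datamodule and callbacks from the other files in this commit; the CSV names are placeholders. Note that configure_optimizers reads the datamodule's train loader length, so fit must be given a datamodule:

import pytorch_lightning as pl

from Utilities import config
from Utilities.callbacks import (
    CheckClassAccuracyCallback,
    MAPCallback,
    PlotTestExamplesCallback,
)
from Utilities.dataset import YOLODataModule
from Utilities.model import YOLOv3

model = YOLOv3(num_classes=config.NUM_CLASSES)
datamodule = YOLODataModule(
    train_csv_path=config.DATASET + "/train.csv",  # assumed file name
    test_csv_path=config.DATASET + "/test.csv",    # assumed file name
)

trainer = pl.Trainer(
    max_epochs=config.NUM_EPOCHS,
    accelerator="gpu" if config.DEVICE == "cuda" else "cpu",
    callbacks=[
        PlotTestExamplesCallback(),
        CheckClassAccuracyCallback(),
        MAPCallback(),
    ],
)
trainer.fit(model, datamodule=datamodule)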
Utilities/runtime_utils.py ADDED
@@ -0,0 +1,89 @@
1
+ import numpy as np
2
+ import pytorch_lightning as pl
3
+ import torch
4
+ from pytorch_grad_cam import GradCAM
5
+ from pytorch_grad_cam.utils.image import show_cam_on_image
6
+
7
+ from Utilities.transforms import test_transforms
8
+ # from Utilities.config import S
9
+ from Utilities.utils import cells_to_bboxes, non_max_suppression, plot_image
10
+
11
+
12
+ def plot_bboxes(
13
+ input_img,
14
+ model,
15
+ thresh=0.6,
16
+ iou_thresh=0.5,
17
+ anchors=None,
18
+ ):
19
+ input_img = test_transforms(image=input_img)["image"]
20
+ input_img = input_img.unsqueeze(0)
21
+ model.eval()
22
+ with torch.no_grad():
23
+ out = model(input_img)
24
+
25
+ for i in range(3):
26
+ batch_size, A, S, _, _ = out[i].shape
27
+ anchor = anchors[i]
28
+ boxes_scale_i = cells_to_bboxes(out[i], anchor, S=S, is_preds=True)
29
+ bboxes = boxes_scale_i[0]
30
+
31
+ nms_boxes = non_max_suppression(
32
+ bboxes,
33
+ iou_threshold=iou_thresh,
34
+ threshold=thresh,
35
+ box_format="midpoint",
36
+ )
37
+ fig = plot_image(input_img[0].permute(1, 2, 0).detach().cpu(), nms_boxes)
38
+ return fig, input_img
39
+
40
+ def return_top_objectness_class_preds(model, input_img, gradcam_output_stream):
41
+ out = model(input_img)[gradcam_output_stream]
42
+
43
+ # 1. get objectness score
44
+ objectness_scores = out[..., 0]
45
+
46
+ # 2. get index of highest objectness score
47
+ max_obj_arg = torch.argmax(objectness_scores)
48
+
49
+ max_obj_arg_onehot = torch.zeros(objectness_scores.flatten().shape[0])
50
+ max_obj_arg_onehot[max_obj_arg] = 1
51
+
52
+ max_obj_arg_onehot = max_obj_arg_onehot.reshape_as(objectness_scores).int()
53
+
54
+ selected_elements = out[max_obj_arg_onehot == 1]
55
+ selected_elements = selected_elements[:, 5:]
56
+
57
+ return selected_elements
58
+
59
+ class TopObjectnessClassPreds(pl.LightningModule):
60
+ def __init__(self, model, gradcam_output_stream):
61
+ super().__init__()
62
+ self.model = model
63
+ self.gradcam_output_stream = gradcam_output_stream
64
+
65
+ def forward(self, x):
66
+ return return_top_objectness_class_preds(self.model, x, self.gradcam_output_stream)
67
+
68
+ def generate_gradcam_output(org_img, model, input_img, gradcam_output_stream: int = 0):
69
+ TopObjectnessClassPredsObj = TopObjectnessClassPreds(model, gradcam_output_stream)
70
+ gradcam_model_layer = [15, 22, 29]
71
+ cam = GradCAM(
72
+ model=TopObjectnessClassPredsObj,
73
+ target_layers=[
74
+ TopObjectnessClassPredsObj.model.layers[
75
+ gradcam_model_layer[gradcam_output_stream]
76
+ ]
77
+ ],
78
+ )
79
+ grayscale_cam = cam(input_tensor=input_img, targets=None)
80
+ grayscale_cam = np.sum(grayscale_cam, axis=-1)
81
+ grayscale_cam = grayscale_cam[0, :]
82
+
83
+ visualization = show_cam_on_image(
84
+ org_img / 255,
85
+ grayscale_cam,
86
+ use_rgb=True,
87
+ image_weight=0.5,
88
+ )
89
+ return visualization
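
A rough single-image inference sketch using these helpers; the image path is a placeholder, the checkpoint path comes from config, and the anchors passed to plot_bboxes are the grid-scaled ones from the model (matching how the callbacks call plot_couple_examples):

import numpy as np
import torch
from PIL import Image

from Utilities import config
from Utilities.model import YOLOv3
from Utilities.runtime_utils import generate_gradcam_output, plot_bboxes
from Utilities.transforms import resize_transforms

model = YOLOv3(num_classes=config.NUM_CLASSES)
model.load_state_dict(torch.load(config.MODEL_STATE_DICT_PATH, map_location="cpu"))
model.eval()

# Resize/pad to 416x416 so the original image lines up with the CAM overlay
org_img = np.array(Image.open("example.jpg").convert("RGB"))  # placeholder path
org_img = resize_transforms(image=org_img)["image"]

fig, input_img = plot_bboxes(
    org_img,
    model,
    thresh=0.6,
    iou_thresh=0.5,
    anchors=model.scaled_anchors.cpu(),
)

# Grad-CAM overlay on the coarsest prediction stream (index 0)
cam_img = generate_gradcam_output(org_img, model, input_img, gradcam_output_stream=0)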
Utilities/transforms.py ADDED
@@ -0,0 +1,70 @@
1
+ import albumentations as A
2
+ import cv2
3
+ from albumentations.pytorch import ToTensorV2
4
+
5
+ from .config import IMAGE_SIZE, scale
6
+
7
+ # train_transforms = A.Compose(
8
+ # [
9
+ # A.LongestMaxSize(max_size=int(IMAGE_SIZE * scale)),
10
+ # A.PadIfNeeded(
11
+ # min_height=int(IMAGE_SIZE * scale),
12
+ # min_width=int(IMAGE_SIZE * scale),
13
+ # border_mode=cv2.BORDER_CONSTANT,
14
+ # ),
15
+ # A.Rotate(limit=10, interpolation=1, border_mode=4),
16
+ # A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
17
+ # A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
18
+ # A.OneOf(
19
+ # [
20
+ # A.ShiftScaleRotate(
21
+ # rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
22
+ # ),
23
+ # # A.Affine(shear=15, p=0.5, mode="constant"),
24
+ # ],
25
+ # p=1.0,
26
+ # ),
27
+ # A.HorizontalFlip(p=0.5),
28
+ # A.Blur(p=0.1),
29
+ # A.CLAHE(p=0.1),
30
+ # A.Posterize(p=0.1),
31
+ # A.ToGray(p=0.1),
32
+ # A.ChannelShuffle(p=0.05),
33
+ # A.Normalize(
34
+ # mean=[0, 0, 0],
35
+ # std=[1, 1, 1],
36
+ # max_pixel_value=255,
37
+ # ),
38
+ # ToTensorV2(),
39
+ # ],
40
+ # bbox_params=A.BboxParams(
41
+ # format="yolo",
42
+ # min_visibility=0.4,
43
+ # label_fields=[],
44
+ # ),
45
+ # )
46
+
47
+ test_transforms = A.Compose(
48
+ [
49
+ A.LongestMaxSize(max_size=IMAGE_SIZE),
50
+ A.PadIfNeeded(
51
+ min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
52
+ ),
53
+ A.Normalize(
54
+ mean=[0, 0, 0],
55
+ std=[1, 1, 1],
56
+ max_pixel_value=255,
57
+ ),
58
+ ToTensorV2(),
59
+ ],
60
+ bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
61
+ )
62
+
63
+ resize_transforms = A.Compose(
64
+ [
65
+ A.LongestMaxSize(max_size=IMAGE_SIZE),
66
+ A.PadIfNeeded(
67
+ min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
68
+ ),
69
+ ]
70
+ )
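
Note that dataset.py imports transforms.train_transforms, which is commented out above, so the datamodule would fail as committed. A minimal placeholder definition that mirrors test_transforms (letterbox resize, normalization, tensor conversion, no augmentation) could look like this; re-enabling the commented block above is the other option:

import albumentations as A
import cv2
from albumentations.pytorch import ToTensorV2

from Utilities.config import IMAGE_SIZE

# Placeholder: identical to test_transforms, i.e. no actual augmentation
train_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(
            min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
        ),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)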
Utilities/utils.py ADDED
@@ -0,0 +1,524 @@
1
+ import os
2
+ import random
3
+ from collections import Counter
4
+
5
+ import matplotlib.patches as patches
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ from . import config
12
+
13
+ def iou_width_height(boxes1, boxes2):
14
+ """
15
+ Parameters:
16
+ boxes1 (tensor): width and height of the first bounding boxes
17
+ boxes2 (tensor): width and height of the second bounding boxes
18
+ Returns:
19
+ tensor: Intersection over union of the corresponding boxes
20
+ """
21
+ intersection = torch.min(boxes1[..., 0], boxes2[..., 0]) * torch.min(
22
+ boxes1[..., 1], boxes2[..., 1]
23
+ )
24
+ union = (
25
+ boxes1[..., 0] * boxes1[..., 1] + boxes2[..., 0] * boxes2[..., 1] - intersection
26
+ )
27
+ return intersection / union
28
+
29
+ def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
30
+ """
31
+ Video explanation of this function:
32
+ https://youtu.be/XXYG5ZWtjj0
33
+
34
+ This function calculates intersection over union (iou) given pred boxes
35
+ and target boxes.
36
+
37
+ Parameters:
38
+ boxes_preds (tensor): Predictions of Bounding Boxes (BATCH_SIZE, 4)
39
+ boxes_labels (tensor): Correct labels of Bounding Boxes (BATCH_SIZE, 4)
40
+ box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)
41
+
42
+ Returns:
43
+ tensor: Intersection over union for all examples
44
+ """
45
+
46
+ if box_format == "midpoint":
47
+ box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
48
+ box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
49
+ box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
50
+ box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
51
+ box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
52
+ box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
53
+ box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
54
+ box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
55
+
56
+ if box_format == "corners":
57
+ box1_x1 = boxes_preds[..., 0:1]
58
+ box1_y1 = boxes_preds[..., 1:2]
59
+ box1_x2 = boxes_preds[..., 2:3]
60
+ box1_y2 = boxes_preds[..., 3:4]
61
+ box2_x1 = boxes_labels[..., 0:1]
62
+ box2_y1 = boxes_labels[..., 1:2]
63
+ box2_x2 = boxes_labels[..., 2:3]
64
+ box2_y2 = boxes_labels[..., 3:4]
65
+
66
+ x1 = torch.max(box1_x1, box2_x1)
67
+ y1 = torch.max(box1_y1, box2_y1)
68
+ x2 = torch.min(box1_x2, box2_x2)
69
+ y2 = torch.min(box1_y2, box2_y2)
70
+
71
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
72
+ box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
73
+ box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
74
+
75
+ return intersection / (box1_area + box2_area - intersection + 1e-6)
76
+
77
+ def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
78
+ """
79
+ Video explanation of this function:
80
+ https://youtu.be/YDkjWEN8jNA
81
+
82
+ Does Non Max Suppression given bboxes
83
+
84
+ Parameters:
85
+ bboxes (list): list of lists containing all bboxes with each bboxes
86
+ specified as [class_pred, prob_score, x1, y1, x2, y2]
87
+ iou_threshold (float): threshold where predicted bboxes is correct
88
+ threshold (float): threshold to remove predicted bboxes (independent of IoU)
89
+ box_format (str): "midpoint" or "corners" used to specify bboxes
90
+
91
+ Returns:
92
+ list: bboxes after performing NMS given a specific IoU threshold
93
+ """
94
+
95
+ assert type(bboxes) == list
96
+
97
+ bboxes = [box for box in bboxes if box[1] > threshold]
98
+ bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
99
+ bboxes_after_nms = []
100
+
101
+ while bboxes:
102
+ chosen_box = bboxes.pop(0)
103
+
104
+ bboxes = [
105
+ box
106
+ for box in bboxes
107
+ if box[0] != chosen_box[0]
108
+ or intersection_over_union(
109
+ torch.tensor(chosen_box[2:]),
110
+ torch.tensor(box[2:]),
111
+ box_format=box_format,
112
+ )
113
+ < iou_threshold
114
+ ]
115
+
116
+ bboxes_after_nms.append(chosen_box)
117
+
118
+ return bboxes_after_nms
119
+
120
+ def mean_average_precision(
121
+ pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
122
+ ):
123
+ """
124
+ Video explanation of this function:
125
+ https://youtu.be/FppOzcDvaDI
126
+
127
+ This function calculates mean average precision (mAP)
128
+
129
+ Parameters:
130
+ pred_boxes (list): list of lists containing all bboxes with each bboxes
131
+ specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
132
+ true_boxes (list): Similar as pred_boxes except all the correct ones
133
+ iou_threshold (float): threshold where predicted bboxes is correct
134
+ box_format (str): "midpoint" or "corners" used to specify bboxes
135
+ num_classes (int): number of classes
136
+
137
+ Returns:
138
+ float: mAP value across all classes given a specific IoU threshold
139
+ """
140
+
141
+ # list storing all AP for respective classes
142
+ average_precisions = []
143
+
144
+ # used for numerical stability later on
145
+ epsilon = 1e-6
146
+
147
+ for c in range(num_classes):
148
+ detections = []
149
+ ground_truths = []
150
+
151
+ # Go through all predictions and targets,
152
+ # and only add the ones that belong to the
153
+ # current class c
154
+ for detection in pred_boxes:
155
+ if detection[1] == c:
156
+ detections.append(detection)
157
+
158
+ for true_box in true_boxes:
159
+ if true_box[1] == c:
160
+ ground_truths.append(true_box)
161
+
162
+ # find the amount of bboxes for each training example
163
+ # Counter here finds how many ground truth bboxes we get
164
+ # for each training example, so let's say img 0 has 3,
165
+ # img 1 has 5 then we will obtain a dictionary with:
166
+ # amount_bboxes = {0:3, 1:5}
167
+ amount_bboxes = Counter([gt[0] for gt in ground_truths])
168
+
169
+ # We then go through each key, val in this dictionary
170
+ # and convert to the following (w.r.t same example):
171
+ # ammount_bboxes = {0:torch.tensor[0,0,0], 1:torch.tensor[0,0,0,0,0]}
172
+ for key, val in amount_bboxes.items():
173
+ amount_bboxes[key] = torch.zeros(val)
174
+
175
+ # sort by box probabilities which is index 2
176
+ detections.sort(key=lambda x: x[2], reverse=True)
177
+ TP = torch.zeros((len(detections)))
178
+ FP = torch.zeros((len(detections)))
179
+ total_true_bboxes = len(ground_truths)
180
+
181
+ # If none exists for this class then we can safely skip
182
+ if total_true_bboxes == 0:
183
+ continue
184
+
185
+ for detection_idx, detection in enumerate(detections):
186
+ # Only take out the ground_truths that have the same
187
+ # training idx as detection
188
+ ground_truth_img = [
189
+ bbox for bbox in ground_truths if bbox[0] == detection[0]
190
+ ]
191
+
192
+ num_gts = len(ground_truth_img)
193
+ best_iou = 0
194
+
195
+ for idx, gt in enumerate(ground_truth_img):
196
+ iou = intersection_over_union(
197
+ torch.tensor(detection[3:]),
198
+ torch.tensor(gt[3:]),
199
+ box_format=box_format,
200
+ )
201
+
202
+ if iou > best_iou:
203
+ best_iou = iou
204
+ best_gt_idx = idx
205
+
206
+ if best_iou > iou_threshold:
207
+ # only detect ground truth detection once
208
+ if amount_bboxes[detection[0]][best_gt_idx] == 0:
209
+ # true positive and add this bounding box to seen
210
+ TP[detection_idx] = 1
211
+ amount_bboxes[detection[0]][best_gt_idx] = 1
212
+ else:
213
+ FP[detection_idx] = 1
214
+
215
+ # if IOU is lower then the detection is a false positive
216
+ else:
217
+ FP[detection_idx] = 1
218
+
219
+ TP_cumsum = torch.cumsum(TP, dim=0)
220
+ FP_cumsum = torch.cumsum(FP, dim=0)
221
+ recalls = TP_cumsum / (total_true_bboxes + epsilon)
222
+ precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
223
+ precisions = torch.cat((torch.tensor([1]), precisions))
224
+ recalls = torch.cat((torch.tensor([0]), recalls))
225
+ # torch.trapz for numerical integration
226
+ average_precisions.append(torch.trapz(precisions, recalls))
227
+
228
+ return sum(average_precisions) / len(average_precisions)
229
+
230
+ def plot_image(image, boxes):
231
+ """Plots predicted bounding boxes on the image"""
232
+ cmap = plt.get_cmap("tab20b")
233
+ class_labels = (
234
+ config.COCO_LABELS if config.DATASET == "COCO" else config.PASCAL_CLASSES
235
+ )
236
+ colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
237
+ im = np.array(image)
238
+ height, width, _ = im.shape
239
+
240
+ # Create figure and axes
241
+ fig, ax = plt.subplots(1)
242
+ # Display the image
243
+ ax.imshow(im)
244
+
245
+ # box[0] is x midpoint, box[2] is width
246
+ # box[1] is y midpoint, box[3] is height
247
+
248
+ # Create a Rectangle patch
249
+ for box in boxes:
250
+ assert (
251
+ len(box) == 6
252
+ ), "box should contain class pred, confidence, x, y, width, height"
253
+ class_pred = box[0]
254
+ box = box[2:]
255
+ upper_left_x = box[0] - box[2] / 2
256
+ upper_left_y = box[1] - box[3] / 2
257
+ rect = patches.Rectangle(
258
+ (upper_left_x * width, upper_left_y * height),
259
+ box[2] * width,
260
+ box[3] * height,
261
+ linewidth=2,
262
+ edgecolor=colors[int(class_pred)],
263
+ facecolor="none",
264
+ )
265
+ # Add the patch to the Axes
266
+ ax.add_patch(rect)
267
+ plt.text(
268
+ upper_left_x * width,
269
+ upper_left_y * height,
270
+ s=class_labels[int(class_pred)],
271
+ color="white",
272
+ verticalalignment="top",
273
+ bbox={"color": colors[int(class_pred)], "pad": 0},
274
+ )
275
+
276
+ plt.show()
277
+
278
+ def get_evaluation_bboxes(
279
+ loader,
280
+ model,
281
+ iou_threshold,
282
+ anchors,
283
+ threshold,
284
+ box_format="midpoint",
285
+ device="cuda",
286
+ ):
287
+ # make sure model is in eval before get bboxes
288
+ model.eval()
289
+ train_idx = 0
290
+ all_pred_boxes = []
291
+ all_true_boxes = []
292
+ for batch_idx, (x, labels) in enumerate(tqdm(loader)):
293
+ x = x.to(device)
294
+
295
+ with torch.no_grad():
296
+ predictions = model(x)
297
+
298
+ batch_size = x.shape[0]
299
+ bboxes = [[] for _ in range(batch_size)]
300
+ for i in range(3):
301
+ S = predictions[i].shape[2]
302
+ anchor = torch.tensor([*anchors[i]]).to(device) * S
303
+ boxes_scale_i = cells_to_bboxes(predictions[i], anchor, S=S, is_preds=True)
304
+ for idx, (box) in enumerate(boxes_scale_i):
305
+ bboxes[idx] += box
306
+
307
+ # we just want one bbox for each label, not one for each scale
308
+ true_bboxes = cells_to_bboxes(labels[2], anchor, S=S, is_preds=False)
309
+
310
+ for idx in range(batch_size):
311
+ nms_boxes = non_max_suppression(
312
+ bboxes[idx],
313
+ iou_threshold=iou_threshold,
314
+ threshold=threshold,
315
+ box_format=box_format,
316
+ )
317
+
318
+ for nms_box in nms_boxes:
319
+ all_pred_boxes.append([train_idx] + nms_box)
320
+
321
+ for box in true_bboxes[idx]:
322
+ if box[1] > threshold:
323
+ all_true_boxes.append([train_idx] + box)
324
+
325
+ train_idx += 1
326
+
327
+ model.train()
328
+ return all_pred_boxes, all_true_boxes
329
+
330
+ def cells_to_bboxes(predictions, anchors, S, is_preds=True):
331
+ """
332
+ Scales the predictions coming from the model to
333
+ be relative to the entire image such that they for example later
334
+ can be plotted or evaluated.
335
+ INPUT:
336
+ predictions: tensor of size (N, 3, S, S, num_classes+5)
337
+ anchors: the anchors used for the predictions
338
+ S: the number of cells the image is divided in on the width (and height)
339
+ is_preds: whether the input is predictions or the true bounding boxes
340
+ OUTPUT:
341
+ converted_bboxes: the converted boxes of sizes (N, num_anchors, S, S, 1+5) with class index,
342
+ object score, bounding box coordinates
343
+ """
344
+ BATCH_SIZE = predictions.shape[0]
345
+ num_anchors = len(anchors)
346
+ box_predictions = predictions[..., 1:5]
347
+ if is_preds:
348
+ anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
349
+ box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
350
+ box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors
351
+ scores = torch.sigmoid(predictions[..., 0:1])
352
+ best_class = torch.argmax(predictions[..., 5:], dim=-1).unsqueeze(-1)
353
+ else:
354
+ scores = predictions[..., 0:1]
355
+ best_class = predictions[..., 5:6]
356
+
357
+ cell_indices = (
358
+ torch.arange(S)
359
+ .repeat(predictions.shape[0], 3, S, 1)
360
+ .unsqueeze(-1)
361
+ .to(predictions.device)
362
+ )
363
+ x = 1 / S * (box_predictions[..., 0:1] + cell_indices)
364
+ y = 1 / S * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
365
+ w_h = 1 / S * box_predictions[..., 2:4]
366
+ converted_bboxes = torch.cat((best_class, scores, x, y, w_h), dim=-1).reshape(
367
+ BATCH_SIZE, num_anchors * S * S, 6
368
+ )
369
+ return converted_bboxes.tolist()
370
+
371
+ def check_class_accuracy(model, loader, threshold):
372
+ model.eval()
373
+ tot_class_preds, correct_class = 0, 0
374
+ tot_noobj, correct_noobj = 0, 0
375
+ tot_obj, correct_obj = 0, 0
376
+
377
+ for idx, (x, y) in enumerate(tqdm(loader)):
378
+ x = x.to(config.DEVICE)
379
+ with torch.no_grad():
380
+ out = model(x)
381
+
382
+ for i in range(3):
383
+ y[i] = y[i].to(config.DEVICE)
384
+ obj = y[i][..., 0] == 1 # in paper this is Iobj_i
385
+ noobj = y[i][..., 0] == 0 # in paper this is Inoobj_i
386
+
387
+ correct_class += torch.sum(
388
+ torch.argmax(out[i][..., 5:][obj], dim=-1) == y[i][..., 5][obj]
389
+ )
390
+ tot_class_preds += torch.sum(obj)
391
+
392
+ obj_preds = torch.sigmoid(out[i][..., 0]) > threshold
393
+ correct_obj += torch.sum(obj_preds[obj] == y[i][..., 0][obj])
394
+ tot_obj += torch.sum(obj)
395
+ correct_noobj += torch.sum(obj_preds[noobj] == y[i][..., 0][noobj])
396
+ tot_noobj += torch.sum(noobj)
397
+
398
+ class_acc = (correct_class / (tot_class_preds + 1e-16)) * 100
399
+ no_obj_acc = (correct_noobj / (tot_noobj + 1e-16)) * 100
400
+ obj_acc = (correct_obj / (tot_obj + 1e-16)) * 100
401
+
402
+ print(f"Class accuracy is: {class_acc:.2f}%")
403
+ print(f"No obj accuracy is: {no_obj_acc:.2f}%")
404
+ print(f"Obj accuracy is: {obj_acc:.2f}%")
405
+ model.train()
406
+ return class_acc, no_obj_acc, obj_acc
407
+
408
+ def get_mean_std(loader):
409
+ # var[X] = E[X**2] - E[X]**2
410
+ channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0
411
+
412
+ for data, _ in tqdm(loader):
413
+ channels_sum += torch.mean(data, dim=[0, 2, 3])
414
+ channels_sqrd_sum += torch.mean(data**2, dim=[0, 2, 3])
415
+ num_batches += 1
416
+
417
+ mean = channels_sum / num_batches
418
+ std = (channels_sqrd_sum / num_batches - mean**2) ** 0.5
419
+
420
+ return mean, std
421
+
422
+ def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
423
+ print("=> Saving checkpoint")
424
+ checkpoint = {
425
+ "state_dict": model.state_dict(),
426
+ "optimizer": optimizer.state_dict(),
427
+ }
428
+ torch.save(checkpoint, filename)
429
+
430
+ def load_checkpoint(checkpoint_file, model, optimizer, lr):
431
+ print("=> Loading checkpoint")
432
+ checkpoint = torch.load(checkpoint_file, map_location=config.DEVICE)
433
+ model.load_state_dict(checkpoint["state_dict"])
434
+ optimizer.load_state_dict(checkpoint["optimizer"])
435
+
436
+ # If we don't do this then it will just have learning rate of old checkpoint
437
+ # and it will lead to many hours of debugging \:
438
+ for param_group in optimizer.param_groups:
439
+ param_group["lr"] = lr
440
+
441
+ def plot_couple_examples(model, loader, thresh, iou_thresh, anchors):
442
+ model.eval()
443
+ x, y = next(iter(loader))
444
+ x = x.to(config.DEVICE)
445
+
446
+ with torch.no_grad():
447
+ out = model(x)
448
+ bboxes = [[] for _ in range(x.shape[0])]
449
+ for i in range(3): # should not be hard coded
450
+ batch_size, A, S, _, _ = out[i].shape
451
+ anchor = anchors[i]
452
+ boxes_scale_i = cells_to_bboxes(out[i], anchor, S=S, is_preds=True)
453
+ for idx, (box) in enumerate(boxes_scale_i):
454
+ bboxes[idx] += box
455
+
456
+ model.train()  # correct indentation?
457
+
458
+ for i in range(batch_size // 4):
459
+ nms_boxes = non_max_suppression(
460
+ bboxes[i],
461
+ iou_threshold=iou_thresh,
462
+ threshold=thresh,
463
+ box_format="midpoint",
464
+ )
465
+ plot_image(x[i].permute(1, 2, 0).detach().cpu(), nms_boxes)
466
+
467
+ def seed_everything(seed=42):
468
+ os.environ["PYTHONHASHSEED"] = str(seed)
469
+ random.seed(seed)
470
+ np.random.seed(seed)
471
+ torch.manual_seed(seed)
472
+ torch.cuda.manual_seed(seed)
473
+ torch.cuda.manual_seed_all(seed)
474
+ torch.backends.cudnn.deterministic = True
475
+ torch.backends.cudnn.benchmark = False
476
+
477
+
478
+ def clip_coords(boxes, img_shape):
479
+ # Clip bounding xyxy bounding boxes to image shape (height, width)
480
+ boxes[:, 0].clamp_(0, img_shape[1]) # x1
481
+ boxes[:, 1].clamp_(0, img_shape[0]) # y1
482
+ boxes[:, 2].clamp_(0, img_shape[1]) # x2
483
+ boxes[:, 3].clamp_(0, img_shape[0]) # y2
484
+
485
+
486
+ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
487
+ # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
488
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
489
+ y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw # top left x
490
+ y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh # top left y
491
+ y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw # bottom right x
492
+ y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh # bottom right y
493
+ return y
494
+
495
+
496
+ def xyn2xy(x, w=640, h=640, padw=0, padh=0):
497
+ # Convert normalized segments into pixel segments, shape (n,2)
498
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
499
+ y[..., 0] = w * x[..., 0] + padw # top left x
500
+ y[..., 1] = h * x[..., 1] + padh # top left y
501
+ return y
502
+
503
+
504
+ def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
505
+ # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right
506
+ if clip:
507
+ clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip
508
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
509
+ y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w # x center
510
+ y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h # y center
511
+ y[..., 2] = (x[..., 2] - x[..., 0]) / w # width
512
+ y[..., 3] = (x[..., 3] - x[..., 1]) / h # height
513
+ return y
514
+
515
+ def clip_boxes(boxes, shape):
516
+ # Clip boxes (xyxy) to image shape (height, width)
517
+ if isinstance(boxes, torch.Tensor): # faster individually
518
+ boxes[..., 0].clamp_(0, shape[1]) # x1
519
+ boxes[..., 1].clamp_(0, shape[0]) # y1
520
+ boxes[..., 2].clamp_(0, shape[1]) # x2
521
+ boxes[..., 3].clamp_(0, shape[0]) # y2
522
+ else: # np.array (faster grouped)
523
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
524
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
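
A tiny worked example of the box utilities above, using midpoint-format boxes laid out as [class, score, x, y, w, h] with coordinates normalized to [0, 1]:

import torch

from Utilities.utils import intersection_over_union, non_max_suppression

box_a = [0, 0.9, 0.50, 0.50, 0.40, 0.40]  # class 0, high confidence
box_b = [0, 0.6, 0.52, 0.52, 0.40, 0.40]  # heavily overlapping duplicate of box_a
box_c = [1, 0.8, 0.20, 0.20, 0.10, 0.10]  # different class, kept regardless of overlap

iou = intersection_over_union(
    torch.tensor(box_a[2:]), torch.tensor(box_b[2:]), box_format="midpoint"
)
print(iou)  # ~0.82

kept = non_max_suppression(
    [box_a, box_b, box_c], iou_threshold=0.45, threshold=0.5, box_format="midpoint"
)
print(len(kept))  # 2 -- box_b is suppressed by box_a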