# Re-implementation of the LeNet model that learns the MNIST dataset and saves the model parameters.
# This is coursework for Week 3 of the Deep Learning and Computer Vision course.
# We implement the LeNet-5 architecture to learn the MNIST dataset.
from torchvision.transforms import ToTensor | |
# from torchvision.transforms import v2 | |
from torchvision import transforms | |
from torch.utils.data import DataLoader | |
from torch.utils.data import Dataset | |
from torchvision import datasets | |
import matplotlib.pyplot as plt | |
from PIL import Image | |
from time import time | |
from torch import nn | |
import pandas as pd | |
import numpy as np | |
import torch, os | |
from utils import ApplyEnhancementFilter | |
# Choose the compute device once, at import time. Preference order:
# CUDA GPU, then the MPS backend, then plain CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device for training/inference.")
if device == "cuda":
    # Report the name of CUDA device 0 so the log shows which GPU is in use.
    print(f"GPU being used: {torch.cuda.get_device_name(0)}")
# Mean / standard deviation used to z-score the single grey channel
# (presumably measured on the MNIST training images -- confirm if the data changes).
_MNIST_MEAN = (0.13066047430038452,)
_MNIST_STD = (0.30810782313346863,)

# Training pipeline: random geometric augmentation on the PIL image first,
# then tensor conversion, normalisation and 2-pixel zero padding on every side
# so the image reaches the 32x32 size the network is fed with.
train_transform = transforms.Compose([
    # Optional pre-filter, currently disabled:
    # ApplyEnhancementFilter(out_channels=1, kernel_size=3, stride=1, padding=1),
    transforms.RandomAffine(degrees=35, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomRotation(degrees=35),
    transforms.ToTensor(),
    transforms.Normalize(mean=_MNIST_MEAN, std=_MNIST_STD),
    transforms.Pad(padding=2, fill=0, padding_mode="constant"),
])

# Evaluation pipeline: identical preprocessing but no random augmentation,
# so test accuracy is measured on unmodified images.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=_MNIST_MEAN, std=_MNIST_STD),
    transforms.Pad(padding=2, fill=0, padding_mode="constant"),
])
# Load the MNIST dataset, which consists of 28x28x1 images (black and white ~ 1 channel); the transforms above pad them to 32x32.
# http://yann.lecun.com/exdb/mnist/ | |
# datasets.MNIST | |
# Loading from Dataset and DataLoader, https://pytorch.org/tutorials/beginner/basics/data_tutorial.html | |
# Load using known datasets, but what if we have our own dataset? | |
# training_data = datasets.MNIST( | |
# root="data", | |
# train=True, | |
# download=True, | |
# transform=ToTensor() | |
# ) | |
# | |
# test_data = datasets.MNIST( | |
# root="data", | |
# train=False, | |
# download=True, | |
# transform=ToTensor() | |
# ) | |
# Loading from a custom dataset | |
import idx2numpy | |
class CustomImageDataset(Dataset):
    """Map-style dataset backed by a pair of IDX files (images and labels).

    The class inherits from ``torch.utils.data.Dataset`` and therefore
    provides ``__init__``, ``__len__`` and ``__getitem__``.

    Args:
        annotations_file: Path to the IDX file holding the labels.
        image_file: Path to the IDX file holding the images.
        transform: Optional callable applied to each image (as a PIL image).
        target_transform: Optional callable applied to each label.
    """

    def __init__(self, annotations_file, image_file, transform=None, target_transform=None):
        # Both files are decoded fully into NumPy arrays up front.
        self.img_labels = idx2numpy.convert_from_file(annotations_file)
        self.images = idx2numpy.convert_from_file(image_file)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        label = self.img_labels[idx]
        # A NumPy scalar label would be collated into a tensor of the same
        # narrow dtype (e.g. uint8); a plain Python int collates to int64,
        # which is the class-index dtype loss functions expect.
        if isinstance(label, np.generic):
            label = label.item()

        # The transforms operate on PIL images, so wrap the raw array first.
        img = Image.fromarray(self.images[idx])
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return img, label
# Make the LeNet-5 model | |
class LeNet5Model(nn.Module):
    """LeNet-5 style classifier: tanh convolutions with average pooling, then a small MLP.

    The convolutional stack uses three unpadded 5x5 convolutions with two 2x2
    average-pooling steps in between; for a 1x32x32 input this leaves 120
    feature maps of size 1x1, which the fully connected stack maps to 10 logits.
    """

    def __init__(self):
        super().__init__()
        # One Tanh module is reused at every activation site; it has no
        # parameters, so sharing it does not tie any weights together.
        self.tanh = nn.Tanh()

        # Feature extractor (Conv2d / AvgPool2d), see
        # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        # https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
        feature_layers = [
            nn.Conv2d(1, 6, kernel_size=5, stride=1),
            self.tanh,
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5, stride=1),
            self.tanh,
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 120, kernel_size=5, stride=1),
            self.tanh,
        ]
        self.le_stack = nn.Sequential(*feature_layers)

        # Classifier head (Linear), see
        # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        classifier_layers = [
            nn.Linear(120, 84),
            self.tanh,
            nn.Linear(84, 10),
        ]
        self.fc_stack = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Run a batch through the network and return the raw class logits."""
        features = self.le_stack(x)
        # Collapse everything except the batch dimension before the linear layers.
        flattened = features.reshape(features.shape[0], -1)
        return self.fc_stack(flattened)
def _read_best_accuracy(path):
    """Return the accuracy stored in ``path``, or 0.0 if it is missing or unparsable."""
    try:
        with open(path, "r") as file:
            return float(file.read())
    except (FileNotFoundError, ValueError):
        return 0.0


def _evaluate_accuracy(model, data_loader, device):
    """Return the fraction of samples in ``data_loader`` that ``model`` classifies correctly."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            predicted = model(x).argmax(dim=1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    # An empty loader yields no samples; report 0.0 instead of dividing by zero.
    return correct / total if total else 0.0


def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, saved_model=None,
                model_path="lenet_mnist_model.pth", best_accuracy_path="best_model.txt"):
    """Train ``model``, evaluate it after every epoch and checkpoint the best weights.

    Args:
        model: The ``nn.Module`` to train; batches are moved to the device
            its parameters live on.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` evaluation batches.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the AdamW optimizer.
        saved_model: Optional path to a ``state_dict`` file to resume from.
            The file is unpickled by ``torch.load``; only pass trusted files.
        model_path: Where the best ``state_dict`` is written.
        best_accuracy_path: Text file holding the best accuracy seen so far;
            it is read at start-up and rewritten whenever the accuracy improves.

    Returns:
        The best accuracy known at the end of training (including the value
        read from ``best_accuracy_path``).
    """
    # Follow the model's own device so inputs and weights can never disagree.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Loss function, https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    loss_fn = nn.CrossEntropyLoss()  # multi-class classification over raw logits
    # Optimizer, https://pytorch.org/docs/stable/optim.html
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-6)

    # Start from the previously recorded best accuracy, if any.
    best_accuracy = _read_best_accuracy(best_accuracy_path)

    if saved_model is not None:
        # map_location lets a checkpoint saved on one device be resumed on another.
        model.load_state_dict(torch.load(saved_model, map_location=device))

    for i in range(epochs):
        model.train()
        print("Epoch ", i)
        for batch, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            # Zero gradients, backward pass, and update weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 250 == 0:
                print(f"Epoch {i} batch {batch} loss: {loss.item()}")

        accuracy = _evaluate_accuracy(model, test_loader, device)
        print(f"Epoch {i} accuracy: {accuracy}")

        # Checkpoint only on a strict improvement over the best seen so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), model_path)
            with open(best_accuracy_path, "w") as file:
                file.write(f"{best_accuracy}")

    print("Training complete.")
    return best_accuracy
def init_weights(m):
    """Apply Xavier-uniform weights and a constant 0.01 bias to conv / linear layers.

    Intended for ``model.apply(init_weights)``; modules that are neither
    ``nn.Conv2d`` nor ``nn.Linear`` are left untouched.

    Args:
        m: The module visited by ``nn.Module.apply``.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have no bias tensor to initialise.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.01)
if __name__ == "__main__":
    # The IDX files decode to NumPy arrays, e.g. the test images have shape
    # (10000, 28, 28): 10000 images of 28x28 pixels.
    test_data = CustomImageDataset(
        "mnist_dataset/t10k-labels.idx1-ubyte",
        "mnist_dataset/t10k-images.idx3-ubyte",
        transform=test_transform,
    )
    # Sanity check: show the shape and label of the first test sample.
    sample_image, sample_label = test_data[0]
    print(sample_image.shape, "label value", sample_label)

    train_data = CustomImageDataset(
        "mnist_dataset/train-labels.idx1-ubyte",
        "mnist_dataset/train-images.idx3-ubyte",
        transform=train_transform,
    )

    # Create DataLoaders so we can iterate through the datasets in batches.
    # Only the training data needs shuffling; accuracy does not depend on
    # the order in which test samples are visited.
    test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

    model = LeNet5Model().to(device)
    model.apply(init_weights)  # Apply Xavier initialisation to the model.
    print(model)

    # Training the model; the best checkpoint is written to
    # "lenet_mnist_model.pth" by train_model itself.
    train_model(model, train_loader, test_loader, epochs=1000, learning_rate=0.001)

    # Save the final-epoch parameters under a separate name so they do not
    # overwrite the best checkpoint that "best_model.txt" refers to.
    torch.save(model.state_dict(), "lenet_mnist_model_last.pth")
# Errors encountered so far:
# - RuntimeError: Input type (unsigned char) and bias type (float) should be the same
# - I solved this by converting the images from the custom loader to float32 values.
# - RuntimeError: Calculated padded input size per channel: (4 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
# - I solved this by padding the images to 32x32, as the model expects this size and the dataset is 28x28.
# - The model also had problems when evaluating; it is important that the dims are batch x channels x height x width, and that the labels are ints.
# Ways to improve accuracy:
# We normalise the dataset via z-score, so that brighter values are not given more importance. [98.99% accuracy]
# We can apply rotations and affine transformations to potentially improve the model, making it learn more abstract patterns rather than one exact orientation.
# Xavier initialisation of the CNN and FC layers, to prevent vanishing gradients.
# Increase the angle of the rotation and affine transformations to see if it improves the model.
# We could potentially help the model by applying an enhancement filter (negative Laplacian, i.e. inverse Laplacian) from computer vision to the image.
# We do not know whether the model is overfitting, as we do not have a graph of the training and validation loss.