makiisthebes
/

LeNet5-experimental

Model card Files Files and versions Community

makiisthebes committed on Feb 6

Commit

61f0100

•

1 Parent(s): 35d4777

Upload 9 files

Browse files

Files changed (10) hide show

.gitattributes +2 -0
best_model.txt +1 -0
le_net_learning_mnist.py +266 -0
lenet_mnist_model.pth +3 -0
let_net_arch.png +0 -0
mnist_dataset/t10k-images.idx3-ubyte +3 -0
mnist_dataset/t10k-labels.idx1-ubyte +0 -0
mnist_dataset/train-images.idx3-ubyte +3 -0
mnist_dataset/train-labels.idx1-ubyte +0 -0
utils.py +62 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+mnist_dataset/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
+mnist_dataset/train-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text

best_model.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 0.9906

le_net_learning_mnist.py ADDED Viewed

	@@ -0,0 +1,266 @@

+# Rewriting the LeNet model to learn the MNIST dataset and save the model parameters,
+# This is considered something we should do in Week 3 of the Deep Learning and Computer Vision course.
+# We will implement LeNet-5 architecture to learn the MNIST dataset.
+from torchvision.transforms import ToTensor
+# from torchvision.transforms import v2
+from torchvision import transforms
+from torch.utils.data import DataLoader
+from torch.utils.data import Dataset
+from torchvision import datasets
+import matplotlib.pyplot as plt
+from PIL import Image
+from time import time
+from torch import nn
+import pandas as pd
+import numpy as np
+import torch, os
+from utils import ApplyEnhancementFilter
+# Load device first (GPU or CPU)
+device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+print(f"Using {device} device for training/inference.")
+if device == "cuda":
+    print(f"GPU being used: {torch.cuda.get_device_name(0)}")
+train_transform = transforms.Compose([
+    # Data augmentation transformations
+    # ApplyEnhancementFilter(out_channels=1, kernel_size=3, stride=1, padding=1),
+    transforms.RandomAffine(degrees=35, translate=(0.1, 0.1), scale=(0.9, 1.1)),
+    transforms.RandomRotation(degrees=35),
+    # Convert images to tensors and normalize
+    transforms.ToTensor(),
+    transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
+    # Pad the image to make it 32x32
+    transforms.Pad(2, fill=0, padding_mode='constant'),
+])
+# For the test dataset, you should not apply these augmentations
+test_transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
+    transforms.Pad(2, fill=0, padding_mode='constant'),
+])
+# Load the MNIST dataset, which consists of 28x28x1 images (grayscale ~ 1 channel); the transforms above pad them to 32x32.
+# http://yann.lecun.com/exdb/mnist/
+# datasets.MNIST
+# Loading from Dataset and DataLoader, https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
+# Load using known datasets, but what if we have our own dataset?
+# training_data = datasets.MNIST(
+#     root="data",
+#     train=True,
+#     download=True,
+#     transform=ToTensor()
+# )
+#
+# test_data = datasets.MNIST(
+#     root="data",
+#     train=False,
+#     download=True,
+#     transform=ToTensor()
+# )
+# Loading from a custom dataset
+import idx2numpy
+class CustomImageDataset(Dataset):
+    """
+        This class must inherit from the torch.utils.data.Dataset class.
+        And contina functions __init__, __len__, and __getitem__.
+    """
+    def __init__(self, annotations_file, image_file, transform=None, target_transform=None):
+        self.img_labels = idx2numpy.convert_from_file(annotations_file)
+        self.images = idx2numpy.convert_from_file(image_file)
+        self.transform = transform
+        self.target_transform = target_transform
+    def __len__(self):
+        return len(self.img_labels)
+    def __getitem__(self, idx):
+        """Get the image and label at the index idx."""
+        label = self.img_labels[idx]
+        img = self.images[idx]
+        img = Image.fromarray(img)
+        if self.transform:
+            img = self.transform(img)
+        if self.target_transform:
+            label = self.target_transform(label)
+        # Adding 0 padding to make it 32x32, as the model expects this.
+        # img = img.unsqueeze(0)  # Add channel dimension, as model expects this.
+        return img, label  # Return as float32, and label as int., should solve issue.
+# Make the LeNet-5 model
+class LeNet5Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Define activation, and sequential layers, then make forward pass.
+        self.tanh = nn.Tanh()
+        # Convolutional layers, https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+        # Avg Pooling, https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
+        self.le_stack = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1),
+            self.tanh,
+            nn.AvgPool2d(kernel_size=2, stride=2),
+            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
+            self.tanh,
+            nn.AvgPool2d(kernel_size=2, stride=2),
+            nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
+            self.tanh
+        )
+        # Fully connected layers, https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+        self.fc_stack = nn.Sequential(
+            nn.Linear(in_features=120, out_features=84),
+            self.tanh,
+            nn.Linear(in_features=84, out_features=10)
+        )
+    def forward(self, x):
+        """Forward pass of the model."""
+        x = self.le_stack(x)
+        x = x.reshape(x.shape[0], -1)
+        x = self.fc_stack(x)
+        return x
+def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, saved_model=None):
+    """
+        Given a model, train the model using the train_loader and test_loader, and show metrics,
+        saving the best model parameters currently.
+    """
+    # When we have model, we need the loss function and optimizer we will use.
+    # Loss function, https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
+    loss_fn = nn.CrossEntropyLoss()  # because we calculating probabilities and this is a classification problem.
+    # Optimizer, https://pytorch.org/docs/stable/optim.html
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-6)  # learning rate of 0.001
+    best_accuracy = 0.0
+    # See if best accuracy is saved, if so, get current best accuracy.
+    if os.path.exists("best_model.txt"):
+        with open("best_model.txt", "r") as file:
+            best_accuracy = float(file.read())
+    if saved_model is not None:  # Load the model parameters if they exist.
+        model.load_state_dict(torch.load(saved_model))
+    # Training loop
+    for i in range(epochs):
+        model.train()
+        print("Epoch ", i)
+        for batch, (x, y) in enumerate(train_loader):
+            x, y = x.to(device), y.to(device)
+            # Forward pass
+            # print(x.shape, y.shape)
+            # Shape of x is [64, 28, 28] and y is [64,]
+            # But x needs to include the channels, so shape should be [64, 1, 28, 28]
+            # x = x.view(-1, 1, 32, 32)
+            y_pred = model(x)
+            # Compute loss
+            loss = loss_fn(y_pred, y)
+            # Zero gradients, backward pass, and update weights
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            # Print loss
+            if batch % 250 == 0:
+                print(f"Epoch {i} batch {batch} loss: {loss.item()}")
+        # Evaluate the model
+        model.eval()
+        correct, total = 0, 0
+        with torch.no_grad():
+            for x, y in test_loader:
+                x, y = x.to(device), y.to(device)
+                #x = x.view(-1, 1, 32, 32)
+                y_pred = model(x)
+                _, predicted = torch.max(y_pred, 1)
+                total += y.size(0)
+                correct += (predicted == y).sum().item()
+        print(f"Epoch {i} accuracy: {correct/total}")
+        if correct/total > best_accuracy:
+            best_accuracy = correct/total
+            torch.save(model.state_dict(), "lenet_mnist_model.pth")
+            with open("best_model.txt", "w") as file:
+                file.write(f"{best_accuracy}")
+    print("Training complete.")
+def init_weights(m):
+    if isinstance(m, nn.Conv2d):
+        nn.init.xavier_uniform_(m.weight)
+        if m.bias is not None:
+            m.bias.data.fill_(0.01)
+    elif isinstance(m, nn.Linear):
+        nn.init.xavier_uniform_(m.weight)
+        m.bias.data.fill_(0.01)
+if __name__ == "__main__":
+    # Testing conversion from ubyte idx to numpy array
+    # file_name = "t10k-images.idx3-ubyte"
+    # label_file = "t10k-labels.idx1-ubyte"
+    # file_path = os.path.join("mnist_dataset", label_file)
+    # image_array = idx2numpy.convert_from_file(file_path)
+    # print(image_array.shape)  # (10000, 28, 28)  # 10000 images of 28x28 pixels
+    test_data = CustomImageDataset("mnist_dataset/t10k-labels.idx1-ubyte", "mnist_dataset/t10k-images.idx3-ubyte", transform=test_transform)
+    print((test_data[0])[0].shape, "label value", test_data[0][1]) # Getting image from dataset.
+    train_data = CustomImageDataset("mnist_dataset/train-labels.idx1-ubyte", "mnist_dataset/train-images.idx3-ubyte", transform=train_transform)
+    # Create a DataLoader, so we can iterate through the dataset in batches.
+    test_loader = DataLoader(test_data, batch_size=64, shuffle=True)
+    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
+    # print(f"Output shape of train function, ", next(iter(test_loader))[0].shape)  # [ 64x28x28 ] [64,] Image and labels.
+    # Display image and label. - From docs.
+    # train_features, train_labels = next(iter(train_loader))
+    # print(f"Feature batch shape: {train_features.size()}")
+    # print(f"Labels batch shape: {train_labels.size()}")
+    # img = train_features[0].squeeze()
+    # label = train_labels[0]
+    # plt.imshow(img, cmap="gray")
+    # plt.show()
+    # print(f"Label: {label}")
+    model = LeNet5Model().to(device)
+    model.apply(init_weights)  # Apply Xavier initialisation to the model.
+    print(model)
+    # Training the model
+    train_model(model, train_loader, test_loader, epochs=1000, learning_rate=0.001)
+    # Save the model parameters
+    torch.save(model.state_dict(), "lenet_mnist_model.pth")
+    # Current errors include:
+    # - RuntimeError: Input type (unsigned char) and bias type (float) should be the same
+    # - I solved this by converting the image from customer loader to float32 values.
+    # - RuntimeError: Calculated padded input size per channel: (4 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
+    # - I solved this by adding padding to make it 32x32 as the model expect this and dataset is 28x28.
+    # - The model also had problems when evaluating, it is important dims are batch x channels x height x width, and labels are int.
+    # Ways to improve accuracy:
+    # We will try to normalise the dataset via z-score, so values which are brighter are not given more importance. [98.99% accuracy]
+    # We can apply rotations and affine to potentially improve the model by making it learn more abstractly from specific patterns rather than exact same orientation.
+    # Xavier intialisation of CNN and FC layers, to prevent vanishing gradients.
+    # Increase the angle of rotation and affine transformations to see if it improves the model.
+    # We could potentally help the model by applying a enhancement filter (negative laplacian) from computer vision, to the image, inverse laplacian
+    # We do not know whether model is overfitting, as we do not have a graph of the training and validation loss.

lenet_mnist_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05ff80605ac574e7e667ec532c8c4b94845e2b11c0c69c06feccd7d86dbab95f
+size 250431

let_net_arch.png ADDED Viewed

mnist_dataset/t10k-images.idx3-ubyte ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fa7898d509279e482958e8ce81c8e77db3f2f8254e26661ceb7762c4d494ce7
+size 7840016

mnist_dataset/t10k-labels.idx1-ubyte ADDED Viewed

Binary file (10 kB). View file

mnist_dataset/train-images.idx3-ubyte ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba891046e6505d7aadcbbe25680a0738ad16aec93bde7f9b65e87a2fc25776db
+size 47040016

mnist_dataset/train-labels.idx1-ubyte ADDED Viewed

Binary file (60 kB). View file

utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import idx2numpy, torch
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms, datasets
+from PIL import Image
+class ApplyEnhancementFilter:
+	def __init__(self, out_channels, kernel_size, stride=1, padding=0, bias=False):
+		"""
+		Initialize the convolution parameters.
+		"""
+		self.out_channels = out_channels
+		self.kernel_size = kernel_size
+		self.stride = stride
+		self.padding = padding
+		self.bias = bias
+		# Define the convolutional layer (not trained here)
+		self.conv = nn.Conv2d(in_channels=1,  # Adjust this based on your image channels (1 for grayscale, 3 for RGB)
+		                      out_channels=out_channels,
+		                      kernel_size=kernel_size,
+		                      stride=stride,
+		                      padding=padding,
+		                      bias=bias)
+		# Example: Manually defining a simple edge-detection kernel
+		# For a real use-case, the kernel weights would be learned or defined according to the filter you need.
+		edge_detection_kernel = torch.tensor([[0, -1., 0.],
+		                                      [-1., 5., -1.],
+		                                      [0., -1., 0.]]).unsqueeze(0).unsqueeze(0)
+		self.conv.weight = nn.Parameter(edge_detection_kernel.float())
+	def __call__(self, img):
+		"""
+		Apply the convolution transformation.
+		"""
+		# Convert PIL image to tensor
+		img_tensor = transforms.functional.to_tensor(img).unsqueeze(0)  # Add batch dimension
+		# Apply convolution
+		conv_img = self.conv(img_tensor)
+		# Remove batch dimension and convert back to PIL image for further transformations or visualization
+		conv_img_pil = transforms.functional.to_pil_image(conv_img.squeeze(0))
+		return conv_img_pil
+if __name__ == "__main__":
+	# It is important to normalise the dataset, so no specific input effects the model more than other based purely on input values.
+	# As values can range from 0-255, this can cause problems, so z-score will be used via Transforms.
+	# First we need the mean and standard deviation of train dataset.
+	train_images = idx2numpy.convert_from_file("mnist_dataset/train-images.idx3-ubyte")
+	# Convert the training images to a PyTorch tensor and scale values to [0, 1]
+	train_images_tensor = torch.tensor(train_images, dtype=torch.float32) / 255.0
+	train_mean = train_images_tensor.mean()
+	train_std = train_images_tensor.std()
+	print(f"Mean: {train_mean}, Std: {train_std}")
+	# Mean: 0.13066047430038452, Std: 0.30810782313346863