Commit bd421ea by abhishekrs4
Parent(s): 2cabce6

added iam_line_recognition module
Files changed:
- iam_line_recognition/__init__.py                   +3   -0
- iam_line_recognition/dataset.py                    +253 -0
- iam_line_recognition/final_iam_line_recognizer.py  +205 -0
- iam_line_recognition/logger_utils.py               +60  -0
- iam_line_recognition/model_main.py                 +151 -0
- iam_line_recognition/model_visual_features.py      +401 -0
- iam_line_recognition/test_internal.py              +164 -0
- iam_line_recognition/train.py                      +275 -0
- iam_line_recognition/utils.py                      +103 -0
- iam_line_recognition/utils_unique_chars.py         +43  -0
iam_line_recognition/__init__.py
ADDED
@@ -0,0 +1,3 @@
import os, sys

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
iam_line_recognition/dataset.py
ADDED
@@ -0,0 +1,253 @@
import os
import torch
import torch.nn
import numpy as np
from PIL import Image
from skimage.io import imread
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

def read_IAM_label_txt_file(file_txt_labels):
    """
    ---------
    Arguments
    ---------
    file_txt_labels : str
        full path to the text file containing labels

    -------
    Returns
    -------
    a tuple of
    all_image_files : list
        a list of all image file names
    all_labels : list
        a list of all labels
    """
    label_file_handler = open(file_txt_labels, mode="r")
    all_lines = label_file_handler.readlines()
    num_lines = len(all_lines)

    all_image_files = []
    all_labels = []

    for cur_line_num in range(num_lines):
        if cur_line_num % 3 == 0:
            all_image_files.append(all_lines[cur_line_num].strip())
        elif cur_line_num % 3 == 1:
            all_labels.append(all_lines[cur_line_num].strip())
        else:
            continue

    return all_image_files, all_labels

class HWRecogIAMDataset(Dataset):
    """
    Main dataset class to be used only for training, validation and internal testing
    """
    CHAR_SET = ' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    CHAR_2_LABEL = {char: i + 1 for i, char in enumerate(CHAR_SET)}
    LABEL_2_CHAR = {label: char for char, label in CHAR_2_LABEL.items()}

    def __init__(self, list_image_files, list_labels, dir_images, image_height=32, image_width=768, which_set="train"):
        """
        ---------
        Arguments
        ---------
        list_image_files : list
            list of image files
        list_labels : list
            list of labels
        dir_images : str
            full path to directory containing images
        image_height : int
            image height (default: 32)
        image_width : int
            image width (default: 768)
        which_set : str
            a string indicating which set is being used (default: train)
        """
        self.list_labels = list_labels
        self.dir_images = dir_images
        self.list_image_files = list_image_files
        self.image_width = image_width
        self.image_height = image_height
        self.which_set = which_set

        if self.which_set == "train":
            # apply data augmentation only for train set
            self.transform = transforms.Compose([
                transforms.ToPILImage(),
                transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
                transforms.RandomAffine(
                    degrees=[-0.75, 0.75], translate=[0, 0.05], scale=[0.75, 1],
                    shear=[-35, 35], interpolation=transforms.InterpolationMode.BILINEAR, fill=255,
                ),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                ),
            ])
        else:
            self.transform = transforms.Compose([
                transforms.ToPILImage(),
                transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                ),
            ])

    def __len__(self):
        return len(self.list_image_files)

    def __getitem__(self, idx):
        image_file_name = self.list_image_files[idx]
        image_gray = imread(os.path.join(self.dir_images, image_file_name))
        image_3_channel = np.repeat(np.expand_dims(image_gray, -1), 3, -1)
        image_3_channel = self.transform(image_3_channel)

        label_string = self.list_labels[idx]
        label_encoded = [self.CHAR_2_LABEL[c] for c in label_string]
        label_length = [len(label_encoded)]

        label_encoded = torch.LongTensor(label_encoded)
        label_length = torch.LongTensor(label_length)

        return image_3_channel, label_encoded, label_length

def IAM_collate_fn(batch):
    """
    collate function

    ---------
    Arguments
    ---------
    batch : tuple
        a batch of input data as a tuple

    -------
    Returns
    -------
    a collated tuple of
    images : tensor
        tensor of batch images
    labels : tensor
        tensor of batch labels
    label_lengths : tensor
        tensor of batch label lengths
    """
    images, labels, label_lengths = zip(*batch)
    images = torch.stack(images, 0)
    labels = torch.cat(labels, 0)
    label_lengths = torch.cat(label_lengths, 0)
    return images, labels, label_lengths

def split_dataset(file_txt_labels, for_train=True):
    """
    ---------
    Arguments
    ---------
    file_txt_labels : str
        full path to the text file containing labels
    for_train : bool
        indicating whether split is for training or internal testing

    -------
    Returns
    -------
    a tuple of image file lists and label lists, either for training
    (train and validation splits) or for internal testing (test split)
    """
    all_image_files, all_labels = read_IAM_label_txt_file(file_txt_labels)
    train_image_files, test_image_files, train_labels, test_labels = train_test_split(all_image_files, all_labels, test_size=0.1, random_state=4)
    train_image_files, valid_image_files, train_labels, valid_labels = train_test_split(train_image_files, train_labels, test_size=0.1, random_state=4)
    if for_train:
        return train_image_files, valid_image_files, train_labels, valid_labels
    else:
        return test_image_files, test_labels

def get_dataloaders_for_training(train_x, train_y, valid_x, valid_y, dir_images, image_height=32, image_width=768, batch_size=8):
    """
    ---------
    Arguments
    ---------
    train_x : list
        list of train file names
    train_y : list
        list of train labels
    valid_x : list
        list of validation file names
    valid_y : list
        list of validation labels
    dir_images : str
        full directory path containing the images
    image_height : int
        image height (default: 32)
    image_width : int
        image width (default: 768)
    batch_size : int
        batch size (default: 8)

    -------
    Returns
    -------
    a tuple of dataloader objects
    train_loader : object
        object of train set dataloader
    valid_loader : object
        object of validation set dataloader
    """
    train_dataset = HWRecogIAMDataset(train_x, train_y, dir_images, image_height=image_height, image_width=image_width, which_set="train")
    valid_dataset = HWRecogIAMDataset(valid_x, valid_y, dir_images, image_height=image_height, image_width=image_width, which_set="valid")

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        collate_fn=IAM_collate_fn,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=IAM_collate_fn,
    )
    return train_loader, valid_loader

def get_dataloader_for_testing(test_x, test_y, dir_images, image_height=32, image_width=768, batch_size=1):
    """
    ---------
    Arguments
    ---------
    test_x : list
        list of test file names
    test_y : list
        list of test labels
    dir_images : str
        full directory path containing the images
    image_height : int
        image height (default: 32)
    image_width : int
        image width (default: 768)
    batch_size : int
        batch size (default: 1)

    -------
    Returns
    -------
    test_loader : object
        object of test set dataloader
    """
    test_dataset = HWRecogIAMDataset(test_x, test_y, dir_images=dir_images, image_height=image_height, image_width=image_width, which_set="test")
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=IAM_collate_fn,
    )
    return test_loader
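A minimal usage sketch (not part of the commit) showing how these pieces compose; the dataset root below is a placeholder, and the label-file name matches the one used in test_internal.py later in this commit:

from dataset import split_dataset, get_dataloaders_for_training

file_txt_labels = "/path/to/IAM-data/iam_lines_gt.txt"  # hypothetical path
dir_images = "/path/to/IAM-data/img"                    # hypothetical path

# split_dataset returns (train files, valid files, train labels, valid labels)
train_x, valid_x, train_y, valid_y = split_dataset(file_txt_labels, for_train=True)
train_loader, valid_loader = get_dataloaders_for_training(
    train_x, train_y, valid_x, valid_y, dir_images,
    image_height=32, image_width=768, batch_size=8,
)

# IAM_collate_fn stacks images to [B, 3, 32, 768] and concatenates the
# variable-length labels into one 1-D tensor plus a [B] tensor of lengths
images, labels, label_lengths = next(iter(train_loader))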
iam_line_recognition/final_iam_line_recognizer.py
ADDED
@@ -0,0 +1,205 @@
import os
import sys
import time
import torch
import argparse
import torchvision
import numpy as np
import torch.nn as nn
from PIL import Image
from skimage.io import imread
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

from dataset import HWRecogIAMDataset
from model_main import CRNN, STN_CRNN
from utils import ctc_decode, compute_wer_and_cer_for_sample


class DatasetFinalEval(HWRecogIAMDataset):
    """
    Dataset class for final evaluation - inherits main dataset class
    """
    def __init__(self, dir_images, image_height=32, image_width=768):
        """
        ---------
        Arguments
        ---------
        dir_images : str
            full path to directory containing images
        image_height : int
            image height (default: 32)
        image_width : int
            image width (default: 768)
        """
        self.dir_images = dir_images
        self.image_files = [f for f in os.listdir(self.dir_images) if f.endswith(".png")]
        self.image_width = image_width
        self.image_height = image_height
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_file_name = self.image_files[idx]
        image_gray = imread(os.path.join(self.dir_images, image_file_name))
        image_3_channel = np.repeat(np.expand_dims(image_gray, -1), 3, -1)
        image_3_channel = self.transform(image_3_channel)
        return image_3_channel

def get_dataloader_for_evaluation(dir_images, image_height=32, image_width=768, batch_size=1):
    """
    ---------
    Arguments
    ---------
    dir_images : str
        full path to directory containing images
    image_height : int
        image height (default: 32)
    image_width : int
        image width (default: 768)
    batch_size : int
        batch size to use for final evaluation (default: 1)

    -------
    Returns
    -------
    test_loader : object
        dataset loader object for final evaluation
    """
    test_dataset = DatasetFinalEval(dir_images=dir_images, image_height=image_height, image_width=image_width)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
    )
    return test_loader

def final_eval(hw_model, device, test_loader, dir_images, dir_results):
    """
    ---------
    Arguments
    ---------
    hw_model : object
        handwriting recognition model object
    device : str
        device to be used for running the evaluation
    test_loader : object
        dataset loader object
    dir_images : str
        full path to directory containing test images
    dir_results : str
        relative path to directory to save the predictions as txt files
    """
    hw_model.eval()
    count = 0
    num_test_samples = len(test_loader.dataset)
    list_test_files = os.listdir(dir_images)

    if not os.path.isdir(dir_results):
        print(f"creating directory: {dir_results}")
        os.makedirs(dir_results)

    with torch.no_grad():
        for image_test in test_loader:
            file_test = list_test_files[count]
            count += 1
            """
            if count == 11:
                break
            """
            image_test = image_test.to(device, dtype=torch.float)

            log_probs = hw_model(image_test)
            pred_labels = ctc_decode(log_probs)
            str_pred = [DatasetFinalEval.LABEL_2_CHAR[i] for i in pred_labels[0]]
            str_pred = "".join(str_pred)

            with open(os.path.join(dir_results, file_test+".txt"), "w", encoding="utf-8", newline="\n") as fh_pred:
                fh_pred.write(str_pred)

            print(f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}")
            print(f"{str_pred}\n")
    print(f"predictions saved in directory: ./{dir_results}\n")
    return

def test_hw_recognizer(FLAGS):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    num_classes = len(DatasetFinalEval.LABEL_2_CHAR) + 1
    print(f"task - handwriting recognition")
    print(f"model: {FLAGS.which_hw_model}")
    print(f"image height: {FLAGS.image_height}, image width: {FLAGS.image_width}")

    # load the right model
    if FLAGS.which_hw_model == "crnn":
        hw_model = CRNN(num_classes, FLAGS.image_height)
    elif FLAGS.which_hw_model == "stn_crnn":
        hw_model = STN_CRNN(num_classes, FLAGS.image_height, FLAGS.image_width)
    else:
        print(f"unidentified option : {FLAGS.which_hw_model}")
        sys.exit(0)
    dir_results = f"results_{FLAGS.which_hw_model}"

    # choose a device for evaluation
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    hw_model.to(device)
    hw_model.load_state_dict(torch.load(FLAGS.file_model))

    # get test set dataloader
    test_loader = get_dataloader_for_evaluation(
        dir_images=FLAGS.dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
    )

    # start the evaluation on the final test set
    print(f"final evaluation of handwriting recognition model {FLAGS.which_hw_model} started\n")
    final_eval(hw_model, device, test_loader, FLAGS.dir_images, dir_results)
    print(f"final evaluation of handwriting recognition model completed!!!!")
    return

def main():
    image_height = 32
    image_width = 768
    which_hw_model = "crnn"
    dir_images = "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/img/"
    file_model = "model_crnn/crnn_H_32_W_768_E_177.pth"
    save_predictions = 1

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--image_height", default=image_height,
        type=int, help="image height to be used to predict with the model")
    parser.add_argument("--image_width", default=image_width,
        type=int, help="image width to be used to predict with the model")
    parser.add_argument("--dir_images", default=dir_images,
        type=str, help="full directory path to directory containing images")
    parser.add_argument("--which_hw_model", default=which_hw_model,
        type=str, choices=["crnn", "stn_crnn"], help="which model to be used for prediction")
    parser.add_argument("--file_model", default=file_model,
        type=str, help="full path to trained model file (.pth)")
    parser.add_argument("--save_predictions", default=save_predictions,
        type=int, choices=[0, 1], help="save or do not save the predictions (1 - save, 0 - do not save)")

    FLAGS, unparsed = parser.parse_known_args()
    test_hw_recognizer(FLAGS)
    return

if __name__ == "__main__":
    main()
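ctc_decode is imported from utils, whose diff is listed in this commit but not shown above. As a hedged illustration only (not the repository's implementation), a greedy CTC decoder over the [seq_len, B, num_classes] log-probabilities returned by the models could look like this:

import torch

def greedy_ctc_decode(log_probs, blank=0):
    # log_probs: [seq_len, batch, num_classes]; take the best class per step
    best = log_probs.argmax(dim=2).permute(1, 0)  # [batch, seq_len]
    decoded = []
    for seq in best:
        labels, prev = [], blank
        for idx in seq.tolist():
            if idx != prev and idx != blank:  # collapse repeats, then drop blanks
                labels.append(idx)
            prev = idx
        decoded.append(labels)
    return decoded

With CHAR_2_LABEL starting at 1 (see dataset.py), label 0 is free to act as the CTC blank, which is what this sketch assumes.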
iam_line_recognition/logger_utils.py
ADDED
@@ -0,0 +1,60 @@
import csv
import json

def write_json_file(file_json, dict_data):
    """
    ---------
    Arguments
    ---------
    file_json : str
        full path of json file to be saved
    dict_data : dict
        dictionary of params to be saved in the json file
    """
    with open(file_json, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(dict_data, indent=4))
    return

class CSVWriter:
    """
    for writing tabular data to a csv file
    """
    def __init__(self, file_name, column_names):
        """
        ---------
        Arguments
        ---------
        file_name : str
            full path of csv file
        column_names : list
            a list of column names to be used to create the csv file
        """
        self.file_name = file_name
        self.column_names = column_names

        self.file_handle = open(self.file_name, "w", encoding="utf-8", newline="\n")
        self.writer = csv.writer(self.file_handle)

        self.write_header()
        print(f"{self.file_name} created successfully with header row")

    def write_header(self):
        """
        writes header into csv file
        """
        self.write_row(self.column_names)
        return

    def write_row(self, row):
        """
        writes a row into csv file
        """
        self.writer.writerow(row)
        return

    def close(self):
        """
        close the file
        """
        self.file_handle.close()
        return
iam_line_recognition/model_main.py
ADDED
@@ -0,0 +1,151 @@
import torchvision
import torch.nn as nn
import torch.nn.functional as F

from model_visual_features import ResNetFeatureExtractor, TPS_SpatialTransformerNetwork

class HW_RNN_Seq2Seq(nn.Module):
    """
    Visual Seq2Seq model using BiLSTM
    """
    def __init__(self, num_classes, image_height, cnn_output_channels=512, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
        """
        ---------
        Arguments
        ---------
        num_classes : int
            num of distinct characters (classes) in the dataset
        image_height : int
            image height
        cnn_output_channels : int
            number of channels output from the CNN visual feature extractor (default: 512)
        num_feats_mapped_seq_hidden : int
            number of features to be used in the mapped visual features as sequences (default: 128)
        num_feats_seq_hidden : int
            number of features to be used in the LSTM for sequence modeling (default: 256)
        """
        super().__init__()
        self.output_height = image_height // 32

        self.dropout = nn.Dropout(p=0.25)
        self.map_visual_to_seq = nn.Linear(cnn_output_channels * self.output_height, num_feats_mapped_seq_hidden)

        self.b_lstm_1 = nn.LSTM(num_feats_mapped_seq_hidden, num_feats_seq_hidden, bidirectional=True)
        self.b_lstm_2 = nn.LSTM(2 * num_feats_seq_hidden, num_feats_seq_hidden, bidirectional=True)

        self.final_dense = nn.Linear(2 * num_feats_seq_hidden, num_classes)

    def forward(self, visual_feats):
        visual_feats = visual_feats.permute(3, 0, 1, 2)
        # WBCH
        # the sequence is along the width of the image as a sentence

        visual_feats = visual_feats.contiguous().view(visual_feats.shape[0], visual_feats.shape[1], -1)
        # WBC

        seq = self.map_visual_to_seq(visual_feats)
        seq = self.dropout(seq)
        lstm_1, _ = self.b_lstm_1(seq)
        lstm_2, _ = self.b_lstm_2(lstm_1)
        lstm_2 = self.dropout(lstm_2)

        dense_output = self.final_dense(lstm_2)
        # [seq_len, B, num_classes]

        log_probs = F.log_softmax(dense_output, dim=2)

        return log_probs


class CRNN(nn.Module):
    """
    Hybrid CNN - RNN model
    CNN - Modified ResNet34 for visual features
    RNN - BiLSTM for seq2seq modeling
    """
    def __init__(self, num_classes, image_height, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
        """
        ---------
        Arguments
        ---------
        num_classes : int
            num of distinct characters (classes) in the dataset
        image_height : int
            image height
        num_feats_mapped_seq_hidden : int
            number of features to be used in the mapped visual features as sequences (default: 128)
        num_feats_seq_hidden : int
            number of features to be used in the LSTM for sequence modeling (default: 256)
        """
        super().__init__()
        self.visual_feature_extractor = ResNetFeatureExtractor()
        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(num_classes, image_height, self.visual_feature_extractor.output_channels, num_feats_mapped_seq_hidden, num_feats_seq_hidden)

    def forward(self, x):
        visual_feats = self.visual_feature_extractor(x)
        # [B, 512, H/32, W/4] (width is only downsampled 4x by the modified strides)

        log_probs = self.rnn_seq2seq_module(visual_feats)
        return log_probs


class STN_CRNN(nn.Module):
    """
    STN + CNN + RNN model
    STN - Spatial Transformer Network for learning variable handwriting
    CNN - Modified ResNet34 for visual features
    RNN - BiLSTM for seq2seq modeling
    """
    def __init__(self, num_classes, image_height, image_width, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
        """
        ---------
        Arguments
        ---------
        num_classes : int
            num of distinct characters (classes) in the dataset
        image_height : int
            image height
        image_width : int
            image width
        num_feats_mapped_seq_hidden : int
            number of features to be used in the mapped visual features as sequences (default: 128)
        num_feats_seq_hidden : int
            number of features to be used in the LSTM for sequence modeling (default: 256)
        """
        super().__init__()
        self.stn = TPS_SpatialTransformerNetwork(
            80,
            (image_height, image_width),
            (image_height, image_width),
            I_channel_num=3,
        )
        self.visual_feature_extractor = ResNetFeatureExtractor()
        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(num_classes, image_height, self.visual_feature_extractor.output_channels, num_feats_mapped_seq_hidden, num_feats_seq_hidden)

    def forward(self, x):
        stn_output = self.stn(x)
        visual_feats = self.visual_feature_extractor(stn_output)
        log_probs = self.rnn_seq2seq_module(visual_feats)
        return log_probs

"""
class STN_PP_CRNN(nn.Module):
    def __init__(self, num_classes, image_height, image_width, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
        super().__init__()
        self.stn = TPS_SpatialTransformerNetwork(
            20,
            (image_height, image_width),
            (image_height, image_width),
            I_channel_num=3,
        )
        self.visual_feature_extractor = ResNetFeatureExtractor()
        self.pp_block = PyramidPoolBlock(num_channels=self.visual_feature_extractor.output_channels)
        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(num_classes, image_height, self.visual_feature_extractor.output_channels, num_feats_mapped_seq_hidden, num_feats_seq_hidden)

    def forward(self, x):
        stn_output = self.stn(x)
        visual_feats = self.visual_feature_extractor(stn_output)
        pp_feats = self.pp_block(visual_feats)
        log_probs = self.rnn_seq2seq_module(pp_feats)
        return log_probs
"""
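A hedged shape check for the CRNN above (runs on CPU; pretrained=True downloads the torchvision ResNet-34 weights on first use). With the stride pattern in model_visual_features.py the CNN downsamples width by 4, so a 768-pixel-wide line yields 192 time steps:

import torch
from model_main import CRNN

num_classes = 80  # len(HWRecogIAMDataset.CHAR_SET) + 1 for the CTC blank
hw_model = CRNN(num_classes, image_height=32)
hw_model.eval()

x = torch.randn(2, 3, 32, 768)  # [B, 3, H, W], as produced by the dataset transforms
log_probs = hw_model(x)
print(log_probs.shape)  # [seq_len, B, num_classes] -> torch.Size([192, 2, 80])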
iam_line_recognition/model_visual_features.py
ADDED
@@ -0,0 +1,401 @@
import torch
import numpy as np
import torch.nn as nn
from typing import List
from torch import Tensor
import torch.nn.functional as F
from torchvision.models.resnet import BasicBlock, model_urls, load_state_dict_from_url, conv1x1, conv3x3

device = torch.device("cuda")

class CustomResNet(nn.Module):
    def __init__(
        self,
        layers: List[int],
        block=BasicBlock,
        zero_init_residual=False,
        groups=1,
        num_classes=1000,
        width_per_group=64,
        replace_stride_with_dilation=None,
        norm_layer=None,
    ):

        super().__init__()

        if norm_layer is None:
            self._norm_layer = nn.BatchNorm2d

        self.inplanes = 64
        self.dilation = 1

        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]

        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )

        self.groups = groups
        self.base_width = width_per_group

        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = self._norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=(2, 1), padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=(2, 1), dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=(2, 2), dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=(2, 1), dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        block,
        planes,
        blocks,
        stride=1,
        dilate=False,
    ) -> nn.Sequential:
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(
            block(
                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
            )
        )
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)

def _resnet(layers: List[int], pretrained=True) -> CustomResNet:
    model = CustomResNet(layers)

    if pretrained:
        model.load_state_dict(load_state_dict_from_url(model_urls["resnet34"]))

    return model

def resnet34(*, pretrained=True) -> CustomResNet:
    """ResNet-34 from `Deep Residual Learning for Image Recognition <https://arxiv.org/pdf/1512.03385.pdf>`__,
    with strides modified for wide line images.

    Args:
        pretrained (bool, optional): if True, load the torchvision ImageNet
            weights for resnet34 into this custom variant. Default is True.
    """

    return _resnet([3, 4, 6, 3], pretrained=pretrained)


class ResNetFeatureExtractor(nn.Module):
    """
    Defines Base ResNet-34 feature extractor
    """
    def __init__(self, pretrained=True):
        """
        ---------
        Arguments
        ---------
        pretrained : bool (default=True)
            boolean to indicate whether to use a pretrained resnet model or not
        """
        super().__init__()
        self.output_channels = 512
        self.resnet34 = resnet34(pretrained=pretrained)

    def forward(self, x):
        block1 = self.resnet34.conv1(x)
        block1 = self.resnet34.bn1(block1)
        block1 = self.resnet34.relu(block1)  # [64, H/2, W/2]

        block2 = self.resnet34.maxpool(block1)
        block2 = self.resnet34.layer1(block2)  # [64, H/4, W/2]
        block3 = self.resnet34.layer2(block2)  # [128, H/8, W/2]
        block4 = self.resnet34.layer3(block3)  # [256, H/16, W/4]
        resnet_features = self.resnet34.layer4(block4)  # [512, H/32, W/4]

        # [B, 512, H/32, W/4]
        return resnet_features


#########################################
### STN - Spatial Transformer Network ###
#########################################
class TPS_SpatialTransformerNetwork(nn.Module):
    """ Rectification Network of RARE, namely TPS based STN """

    def __init__(self, num_fiducial_points, I_size, I_r_size, I_channel_num=1):
        """ Based on RARE TPS
        input:
            batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
            I_size : (height, width) of the input image I
            I_r_size : (height, width) of the rectified image I_r
            I_channel_num : the number of channels of the input image I
        output:
            batch_I_r: rectified image [batch_size x I_channel_num x I_r_height x I_r_width]
        """
        super(TPS_SpatialTransformerNetwork, self).__init__()
        self.num_fiducial_points = num_fiducial_points
        self.I_size = I_size
        self.I_r_size = I_r_size  # = (I_r_height, I_r_width)
        self.I_channel_num = I_channel_num
        self.LocalizationNetwork = LocalizationNetwork(self.num_fiducial_points, self.I_channel_num)
        self.GridGenerator = GridGenerator(self.num_fiducial_points, self.I_r_size)

    def forward(self, batch_I):
        batch_C_prime = self.LocalizationNetwork(batch_I)  # batch_size x K x 2
        build_P_prime = self.GridGenerator.build_P_prime(batch_C_prime)  # batch_size x n (= I_r_width x I_r_height) x 2
        build_P_prime_reshape = build_P_prime.reshape([build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2])

        if torch.__version__ > "1.2.0":
            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode='border', align_corners=True)
        else:
            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode='border')

        return batch_I_r


class LocalizationNetwork(nn.Module):
    """ Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height) """

    def __init__(self, num_fiducial_points, I_channel_num):
        super(LocalizationNetwork, self).__init__()
        self.num_fiducial_points = num_fiducial_points
        self.I_channel_num = I_channel_num
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=self.I_channel_num, out_channels=64, kernel_size=3, stride=1, padding=1,
                bias=False), nn.BatchNorm2d(64), nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 64 x I_height/2 x I_width/2
            nn.Conv2d(64, 128, 3, 1, 1, bias=False), nn.BatchNorm2d(128), nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 128 x I_height/4 x I_width/4
            nn.Conv2d(128, 256, 3, 1, 1, bias=False), nn.BatchNorm2d(256), nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 256 x I_height/8 x I_width/8
            nn.Conv2d(256, 512, 3, 1, 1, bias=False), nn.BatchNorm2d(512), nn.ReLU(True),
            nn.AdaptiveAvgPool2d(1)  # batch_size x 512
        )

        self.localization_fc1 = nn.Sequential(nn.Linear(512, 256), nn.ReLU(True))
        self.localization_fc2 = nn.Linear(256, self.num_fiducial_points * 2)

        # Init fc2 in LocalizationNetwork
        self.localization_fc2.weight.data.fill_(0)
        """ see RARE paper Fig. 6 (a) """
        ctrl_pts_x = np.linspace(-1.0, 1.0, int(num_fiducial_points / 2))
        ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(num_fiducial_points / 2))
        ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(num_fiducial_points / 2))
        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
        self.localization_fc2.bias.data = torch.from_numpy(initial_bias).float().view(-1)

    def forward(self, batch_I):
        """
        input: batch_I : Batch Input Image [batch_size x I_channel_num x I_height x I_width]
        output: batch_C_prime : Predicted coordinates of fiducial points for input batch [batch_size x F x 2]
        """
        batch_size = batch_I.size(0)
        features = self.conv(batch_I).view(batch_size, -1)
        batch_C_prime = self.localization_fc2(self.localization_fc1(features)).view(batch_size, self.num_fiducial_points, 2)
        return batch_C_prime


class GridGenerator(nn.Module):
    """ Grid Generator of RARE, which produces P_prime by multiplying T with P """

    def __init__(self, num_fiducial_points, I_r_size):
        """ Generate P_hat and inv_delta_C for later """
        super(GridGenerator, self).__init__()
        self.eps = 1e-6
        self.I_r_height, self.I_r_width = I_r_size
        self.num_fiducial_points = num_fiducial_points
        self.C = self._build_C(self.num_fiducial_points)  # F x 2
        self.P = self._build_P(self.I_r_width, self.I_r_height)
        ## for multi-gpu, you need register buffer
        self.register_buffer("inv_delta_C", torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float())  # F+3 x F+3
        self.register_buffer("P_hat", torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float())  # n x F+3
        ## for fine-tuning with different image width, you may use below instead of self.register_buffer
        #self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float().cuda()  # F+3 x F+3
        #self.P_hat = torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float().cuda()  # n x F+3

    def _build_C(self, F):
        """ Return coordinates of fiducial points in I_r; C """
        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
        ctrl_pts_y_top = -1 * np.ones(int(F / 2))
        ctrl_pts_y_bottom = np.ones(int(F / 2))
        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
        C = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
        return C  # F x 2

    def _build_inv_delta_C(self, F, C):
        """ Return inv_delta_C which is needed to calculate T """
        hat_C = np.zeros((F, F), dtype=float)  # F x F
        for i in range(0, F):
            for j in range(i, F):
                r = np.linalg.norm(C[i] - C[j])
                hat_C[i, j] = r
                hat_C[j, i] = r
        np.fill_diagonal(hat_C, 1)
        hat_C = (hat_C ** 2) * np.log(hat_C)
        # print(C.shape, hat_C.shape)
        delta_C = np.concatenate(  # F+3 x F+3
            [
                np.concatenate([np.ones((F, 1)), C, hat_C], axis=1),  # F x F+3
                np.concatenate([np.zeros((2, 3)), np.transpose(C)], axis=1),  # 2 x F+3
                np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1)  # 1 x F+3
            ],
            axis=0
        )
        inv_delta_C = np.linalg.inv(delta_C)
        return inv_delta_C  # F+3 x F+3

    def _build_P(self, I_r_width, I_r_height):
        I_r_grid_x = (np.arange(-I_r_width, I_r_width, 2) + 1.0) / I_r_width  # self.I_r_width
        I_r_grid_y = (np.arange(-I_r_height, I_r_height, 2) + 1.0) / I_r_height  # self.I_r_height
        P = np.stack(  # self.I_r_width x self.I_r_height x 2
            np.meshgrid(I_r_grid_x, I_r_grid_y),
            axis=2
        )
        return P.reshape([-1, 2])  # n (= self.I_r_width x self.I_r_height) x 2

    def _build_P_hat(self, F, C, P):
        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
        P_tile = np.tile(np.expand_dims(P, axis=1), (1, F, 1))  # n x 2 -> n x 1 x 2 -> n x F x 2
        C_tile = np.expand_dims(C, axis=0)  # 1 x F x 2
        P_diff = P_tile - C_tile  # n x F x 2
        rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False)  # n x F
        rbf = np.multiply(np.square(rbf_norm), np.log(rbf_norm + self.eps))  # n x F
        P_hat = np.concatenate([np.ones((n, 1)), P, rbf], axis=1)
        return P_hat  # n x F+3

    def build_P_prime(self, batch_C_prime):
        """ Generate Grid from batch_C_prime [batch_size x F x 2] """
        batch_size = batch_C_prime.size(0)
        batch_inv_delta_C = self.inv_delta_C.repeat(batch_size, 1, 1)
        batch_P_hat = self.P_hat.repeat(batch_size, 1, 1)
        batch_C_prime_with_zeros = torch.cat((batch_C_prime, torch.zeros(
            batch_size, 3, 2).float().to(device)), dim=1)  # batch_size x F+3 x 2
        batch_T = torch.bmm(batch_inv_delta_C, batch_C_prime_with_zeros)  # batch_size x F+3 x 2
        batch_P_prime = torch.bmm(batch_P_hat, batch_T)  # batch_size x n x 2
        return batch_P_prime  # batch_size x n x 2


"""
########################################
######## Pyramid Pooling Block #########
########################################
class PyramidPool(nn.Module):
    def __init__(self, pool_kernel_size, in_channels, out_channels):
        super().__init__()
        self.pool_kernel_size = pool_kernel_size
        self.avg_pool_block = nn.Sequential(
            nn.AvgPool2d((1, self.pool_kernel_size), stride=(1, self.pool_kernel_size)),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding="same", bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ELU(inplace=True),
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        _, _, in_height, in_width = x.size()
        x = self.avg_pool_block(x)
        x = F.interpolate(x, size=(in_height, in_width), mode="bilinear")
        return x


class PyramidPoolBlock(nn.Module):
    def __init__(self, pyramid_pool_kernel_sizes=[4, 8, 16, 32], num_channels=512):
        super().__init__()
        pp_out_channels = 256
        self.pyramid_pool_layers = nn.ModuleList([PyramidPool(pool_kernel_size=k, in_channels=num_channels, out_channels=pp_out_channels) for k in pyramid_pool_kernel_sizes])
        self.final_layer = nn.Sequential(
            nn.Conv2d((num_channels + (pp_out_channels * len(self.pyramid_pool_layers))), num_channels, (1, 5), stride=1, padding="same"),
            nn.BatchNorm2d(num_channels),
            nn.ELU(inplace=True),
            nn.Dropout(p=0.1),
        )

    def forward(self, input):
        pp_outputs = []
        for pp_layer in self.pyramid_pool_layers:
            pp_output = pp_layer(input)
            pp_outputs.append(pp_output)
        pp_outputs.append(input)
        x = torch.cat(pp_outputs, dim=1)
        x = self.final_layer(x)
        return x
"""
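A hedged sketch exercising the TPS-STN on its own. GridGenerator.build_P_prime moves tensors to the module-level device = torch.device("cuda"), so this snippet assumes a CUDA-capable machine:

import torch
from model_visual_features import TPS_SpatialTransformerNetwork

# same configuration that STN_CRNN uses: 80 fiducial points, 3-channel input
stn = TPS_SpatialTransformerNetwork(
    80,
    (32, 768),   # I_size
    (32, 768),   # I_r_size
    I_channel_num=3,
).cuda()

batch_I = torch.randn(2, 3, 32, 768).cuda()
batch_I_r = stn(batch_I)
print(batch_I_r.shape)  # rectified image, same shape: torch.Size([2, 3, 32, 768])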
iam_line_recognition/test_internal.py
ADDED
@@ -0,0 +1,164 @@
import os
import sys
import time
import torch
import argparse
import torchvision
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from logger_utils import CSVWriter
from model_main import CRNN, STN_CRNN
from utils import ctc_decode, compute_wer_and_cer_for_sample
from dataset import HWRecogIAMDataset, split_dataset, get_dataloader_for_testing


def test(hw_model, test_loader, device, list_test_files, which_ctc_decoder="beam_search", save_prediction_stats=False):
    """
    ---------
    Arguments
    ---------
    hw_model : object
        handwriting recognition model object
    test_loader : object
        dataset loader object
    device : str
        device to be used for running the evaluation
    list_test_files : list
        list of all the test files
    which_ctc_decoder : str
        string indicating which ctc decoder to use
    save_prediction_stats : bool
        whether to save prediction stats
    """
    hw_model.eval()
    num_test_samples = len(test_loader.dataset)
    num_test_batches = len(test_loader)

    count = 0
    list_test_cers, list_test_wers = [], []

    if save_prediction_stats:
        csv_writer = CSVWriter(
            file_name="pred_stats.csv",
            column_names=["file_name", "num_chars", "num_words", "cer", "wer"]
        )

    with torch.no_grad():
        for images, labels, length_labels in test_loader:
            count += 1
            images = images.to(device, dtype=torch.float)
            log_probs = hw_model(images)
            pred_labels = ctc_decode(log_probs, which_ctc_decoder=which_ctc_decoder)
            labels = labels.cpu().numpy().tolist()

            str_label = [HWRecogIAMDataset.LABEL_2_CHAR[i] for i in labels]
            str_label = "".join(str_label)
            str_pred = [HWRecogIAMDataset.LABEL_2_CHAR[i] for i in pred_labels[0]]
            str_pred = "".join(str_pred)

            cer_sample, wer_sample = compute_wer_and_cer_for_sample(str_pred, str_label)
            list_test_cers.append(cer_sample)
            list_test_wers.append(wer_sample)

            print(f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}")
            print(f"{str_label} - label")
            print(f"{str_pred} - prediction")
            print(f"cer: {cer_sample:.3f}, wer: {wer_sample:.3f}\n")

            if save_prediction_stats:
                csv_writer.write_row([
                    list_test_files[count-1],
                    len(str_label),
                    len(str_label.split(" ")),
                    cer_sample,
                    wer_sample,
                ])
    list_test_cers = np.array(list_test_cers)
    list_test_wers = np.array(list_test_wers)
    mean_test_cer = np.mean(list_test_cers)
    mean_test_wer = np.mean(list_test_wers)
    print(f"test set - mean cer: {mean_test_cer:.3f}, mean wer: {mean_test_wer:.3f}\n")

    if save_prediction_stats:
        csv_writer.close()
    return

def test_hw_recognizer(FLAGS):
    file_txt_labels = os.path.join(FLAGS.dir_dataset, "iam_lines_gt.txt")
    dir_images = os.path.join(FLAGS.dir_dataset, "img")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    # choose a device for testing
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # get the internal test set files
    test_x, test_y = split_dataset(file_txt_labels, for_train=False)
    num_test_samples = len(test_x)
    # get the internal test set dataloader
    test_loader = get_dataloader_for_testing(
        test_x, test_y,
        dir_images=dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
    )

    num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1
    print(f"task - handwriting recognition")
    print(f"model: {FLAGS.which_hw_model}, ctc decoder: {FLAGS.which_ctc_decoder}")
    print(f"image height: {FLAGS.image_height}, image width: {FLAGS.image_width}")
    print(f"num test samples: {num_test_samples}")

    # load the right model
    if FLAGS.which_hw_model == "crnn":
        hw_model = CRNN(num_classes, FLAGS.image_height)
    elif FLAGS.which_hw_model == "stn_crnn":
        hw_model = STN_CRNN(num_classes, FLAGS.image_height, FLAGS.image_width)
    else:
        print(f"unidentified option : {FLAGS.which_hw_model}")
        sys.exit(0)
    hw_model.to(device)
    hw_model.load_state_dict(torch.load(FLAGS.file_model))

    # start testing of the model on the internal set
    print(f"testing of handwriting recognition model {FLAGS.which_hw_model} started\n")
    test(hw_model, test_loader, device, test_x, FLAGS.which_ctc_decoder, bool(FLAGS.save_prediction_stats))
    print(f"testing handwriting recognition model completed!!!!")
    return

def main():
    image_height = 32
    image_width = 768
    which_hw_model = "crnn"
    dir_dataset = "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/"
    file_model = "model_crnn/crnn_H_32_W_768_E_177.pth"
    which_ctc_decoder = "beam_search"
    save_prediction_stats = 0

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--image_height", default=image_height,
        type=int, help="image height to be used to predict with the model")
    parser.add_argument("--image_width", default=image_width,
        type=int, help="image width to be used to predict with the model")
    parser.add_argument("--dir_dataset", default=dir_dataset,
        type=str, help="full directory path to the dataset")
    parser.add_argument("--which_hw_model", default=which_hw_model,
        type=str, choices=["crnn", "stn_crnn"], help="which model to be used for prediction")
    parser.add_argument("--which_ctc_decoder", default=which_ctc_decoder,
        type=str, choices=["beam_search", "greedy"], help="which ctc decoder to use")
    parser.add_argument("--file_model", default=file_model,
        type=str, help="full path to trained model file (.pth)")
    parser.add_argument("--save_prediction_stats", default=save_prediction_stats,
        type=int, choices=[0, 1], help="save prediction stats (1 - yes, 0 - no)")

    FLAGS, unparsed = parser.parse_known_args()
    test_hw_recognizer(FLAGS)
    return

if __name__ == "__main__":
    main()
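compute_wer_and_cer_for_sample also lives in the utils.py added by this commit (diff not shown above). A common definition, assumed here, is Levenshtein distance normalized by reference length, over characters for CER and over whitespace-split tokens for WER; the actual normalization or scaling in utils.py may differ:

def _levenshtein(a, b):
    # classic dynamic-programming edit distance; works on strings or token lists
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def compute_wer_and_cer_for_sample(str_pred, str_label):
    # returns fractions; guards against an empty reference string
    cer = _levenshtein(str_pred, str_label) / max(len(str_label), 1)
    wer = _levenshtein(str_pred.split(), str_label.split()) / max(len(str_label.split()), 1)
    return cer, wer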
iam_line_recognition/train.py
ADDED
@@ -0,0 +1,275 @@
import os
import sys
import time
import torch
import argparse
import torchvision
import torch.nn as nn
import torch.nn.functional as F

from model_main import CRNN, STN_CRNN
from logger_utils import CSVWriter, write_json_file
from utils import compute_wer_and_cer_for_sample, ctc_decode
from dataset import HWRecogIAMDataset, split_dataset, get_dataloaders_for_training


def train(hw_model, optimizer, criterion, train_loader, device):
    """
    ---------
    Arguments
    ---------
    hw_model : object
        handwriting recognition model object
    optimizer : object
        optimizer object to be used for optimization
    criterion : object
        criterion or loss object to be used as the objective function for optimization
    train_loader : object
        train set dataloader object
    device : str
        device to be used for running the training

    -------
    Returns
    -------
    train_loss : float
        mean training loss for an epoch
    """
    hw_model.train()
    train_running_loss = 0.0
    num_train_samples = len(train_loader.dataset)
    num_train_batches = len(train_loader)

    for images, labels, lengths_labels in train_loader:
        images = images.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)
        lengths_labels = lengths_labels.to(device, torch.long)

        batch_size = images.size(0)
        optimizer.zero_grad()
        log_probs = hw_model(images)

        lengths_preds = torch.LongTensor([log_probs.size(0)] * batch_size)
        lengths_labels = torch.flatten(lengths_labels)

        loss = criterion(log_probs, labels, lengths_preds, lengths_labels)
        train_running_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(hw_model.parameters(), 5)  # gradient clipping with max norm 5
        optimizer.step()

    train_loss = train_running_loss / num_train_batches
    return train_loss
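The shape convention that train() relies on — time-major log_probs and all target labels concatenated into a single 1-D tensor — is easy to get wrong, so here is a self-contained sketch of the nn.CTCLoss input layout (all names, shapes and values below are illustrative, not taken from this repository):

import torch
import torch.nn as nn

T, N, C = 192, 4, 80                                    # time steps, batch size, classes (blank = 0)
log_probs = torch.randn(T, N, C).log_softmax(2)         # time-major model output
targets = torch.randint(1, C, (23,))                    # all labels of the batch, concatenated 1-D
input_lengths = torch.full((N,), T, dtype=torch.long)   # each sample emits all T frames
target_lengths = torch.tensor([5, 6, 4, 8])             # per-sample label lengths, sum = 23
criterion = nn.CTCLoss(reduction="mean", zero_infinity=True)
loss = criterion(log_probs, targets, input_lengths, target_lengths)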

def validate(hw_model, criterion, valid_loader, device):
    """
    ---------
    Arguments
    ---------
    hw_model : object
        handwriting recognition model object
    criterion : object
        criterion or loss object to be used as the objective function for optimization
    valid_loader : object
        validation set dataloader object
    device : str
        device to be used for running the evaluation

    -------
    Returns
    -------
    a 3 tuple of
    valid_loss : float
        mean validation loss for an epoch
    valid_cer : float
        mean character error rate (CER) for the validation set
    valid_wer : float
        mean word error rate (WER) for the validation set
    """
    hw_model.eval()
    valid_running_loss = 0.0
    valid_running_cer = 0.0
    valid_running_wer = 0.0
    num_valid_samples = len(valid_loader.dataset)
    num_valid_batches = len(valid_loader)

    with torch.no_grad():
        for images, labels, lengths_labels in valid_loader:
            images = images.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)
            lengths_labels = lengths_labels.to(device, torch.long)

            batch_size = images.size(0)
            log_probs = hw_model(images)
            lengths_preds = torch.LongTensor([log_probs.size(0)] * batch_size)
            # flatten the label lengths, mirroring train()
            lengths_labels = torch.flatten(lengths_labels)

            loss = criterion(log_probs, labels, lengths_preds, lengths_labels)
            valid_running_loss += loss.item()

            pred_labels = ctc_decode(log_probs)
            labels_for_eval = labels.cpu().numpy().tolist()
            lengths_labels_for_eval = lengths_labels.cpu().numpy().tolist()

            # split the concatenated label tensor back into per-sample label lists
            final_labels_for_eval = []
            length_label_counter = 0
            for length_label in lengths_labels_for_eval:
                label = labels_for_eval[length_label_counter:length_label_counter+length_label]
                length_label_counter += length_label
                final_labels_for_eval.append(label)

            for i in range(len(final_labels_for_eval)):
                if len(pred_labels[i]) != 0:
                    str_label = "".join([HWRecogIAMDataset.LABEL_2_CHAR[c] for c in final_labels_for_eval[i]])
                    str_pred = "".join([HWRecogIAMDataset.LABEL_2_CHAR[c] for c in pred_labels[i]])
                    cer_sample, wer_sample = compute_wer_and_cer_for_sample(str_pred, str_label)
                else:
                    # an empty prediction counts as a complete miss
                    cer_sample, wer_sample = 100, 100

                valid_running_cer += cer_sample
                valid_running_wer += wer_sample

    valid_loss = valid_running_loss / num_valid_batches
    valid_cer = valid_running_cer / num_valid_samples
    valid_wer = valid_running_wer / num_valid_samples
    return valid_loss, valid_cer, valid_wer

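To make the unpacking step in validate() concrete, here is the same slicing logic on made-up data (label values and lengths below are purely illustrative):

labels_flat = [8, 5, 12, 12, 15, 23, 15, 18, 12, 4]  # two samples, concatenated
lengths = [5, 5]
offset, per_sample = 0, []
for n in lengths:
    per_sample.append(labels_flat[offset:offset + n])
    offset += n
# per_sample == [[8, 5, 12, 12, 15], [23, 15, 18, 12, 4]]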
def train_hw_recognizer(FLAGS):
    file_txt_labels = os.path.join(FLAGS.dir_dataset, "iam_lines_gt.txt")
    dir_images = os.path.join(FLAGS.dir_dataset, "img")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    # train only on a CUDA device (GPU)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        print("CUDA device not found, so exiting....")
        sys.exit(0)

    # split dataset into train and validation sets
    train_x, valid_x, train_y, valid_y = split_dataset(file_txt_labels, for_train=True)
    num_train_samples = len(train_x)
    num_valid_samples = len(valid_x)
    # get dataloaders for train and validation sets
    train_loader, valid_loader = get_dataloaders_for_training(
        train_x, train_y, valid_x, valid_y,
        dir_images=dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
        batch_size=FLAGS.batch_size,
    )

    # create a directory for saving the model
    dir_model = f"model_{FLAGS.which_hw_model}"
    if not os.path.isdir(dir_model):
        print(f"creating directory: {dir_model}")
        os.makedirs(dir_model)

    # save train and validation metrics in a csv file
    file_logger_train = os.path.join(dir_model, "train_metrics.csv")
    csv_writer = CSVWriter(
        file_name=file_logger_train,
        column_names=["epoch", "loss_train", "loss_valid", "cer_valid", "wer_valid"]
    )

    file_params = os.path.join(dir_model, "params.json")
    write_json_file(file_params, vars(FLAGS))

    num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1
    print("task - handwriting recognition")
    print(f"model: {FLAGS.which_hw_model}")
    print(f"optimizer: {FLAGS.which_optimizer}, learning rate: {FLAGS.learning_rate:.6f}, weight decay: {FLAGS.weight_decay:.8f}")
    print(f"batch size: {FLAGS.batch_size}, image height: {FLAGS.image_height}, image width: {FLAGS.image_width}")
    print(f"num train samples: {num_train_samples}, num validation samples: {num_valid_samples}\n")

    # load the right model
    if FLAGS.which_hw_model == "crnn":
        hw_model = CRNN(num_classes, FLAGS.image_height)
    elif FLAGS.which_hw_model == "stn_crnn":
        hw_model = STN_CRNN(num_classes, FLAGS.image_height, FLAGS.image_width)
    else:
        print(f"unidentified option: {FLAGS.which_hw_model}")
        sys.exit(0)
    hw_model.to(device)

    # load the right optimizer based on user option
    if FLAGS.which_optimizer == "adam":
        optimizer = torch.optim.Adam(hw_model.parameters(), lr=FLAGS.learning_rate, weight_decay=FLAGS.weight_decay)
    elif FLAGS.which_optimizer == "adadelta":
        optimizer = torch.optim.Adadelta(hw_model.parameters(), lr=FLAGS.learning_rate, rho=0.95, eps=1e-8, weight_decay=FLAGS.weight_decay)
    else:
        print(f"unidentified option: {FLAGS.which_optimizer}")
        sys.exit(0)
    # use the CTC loss as the objective function for training
    criterion = nn.CTCLoss(reduction="mean", zero_infinity=True)

    # start training the model
    print(f"training of handwriting recognition model {FLAGS.which_hw_model} started\n")
    for epoch in range(1, FLAGS.num_epochs+1):
        time_start = time.time()
        train_loss = train(hw_model, optimizer, criterion, train_loader, device)
        valid_loss, valid_cer, valid_wer = validate(hw_model, criterion, valid_loader, device)
        time_end = time.time()
        print(f"epoch: {epoch}/{FLAGS.num_epochs}, time: {time_end-time_start:.3f} sec.")
        print(f"train loss: {train_loss:.6f}, validation loss: {valid_loss:.6f}, validation cer: {valid_cer:.4f}, validation wer: {valid_wer:.4f}\n")

        csv_writer.write_row(
            [
                epoch,
                round(train_loss, 6),
                round(valid_loss, 6),
                round(valid_cer, 4),
                round(valid_wer, 4),
            ]
        )
        torch.save(hw_model.state_dict(), os.path.join(dir_model, f"{FLAGS.which_hw_model}_H_{FLAGS.image_height}_W_{FLAGS.image_width}_E_{epoch}.pth"))
    print(f"Training of handwriting recognition model {FLAGS.which_hw_model} complete!!!!")
    # close the csv file
    csv_writer.close()
    return

def main():
    learning_rate = 1  # 3e-4 for Adam, 1 for Adadelta
    weight_decay = 0   # 3e-5 with Adam for both CRNN and STN-CRNN, 0 with Adadelta for both
    batch_size = 64
    num_epochs = 100
    image_height = 32
    image_width = 768
    which_hw_model = "crnn"
    which_optimizer = "adadelta"
    dir_dataset = "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/"

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--learning_rate", default=learning_rate,
        type=float, help="learning rate to use for training")
    parser.add_argument("--weight_decay", default=weight_decay,
        type=float, help="weight decay to use for training")
    parser.add_argument("--batch_size", default=batch_size,
        type=int, help="batch size to use for training")
    parser.add_argument("--num_epochs", default=num_epochs,
        type=int, help="num epochs to train the model")
    parser.add_argument("--image_height", default=image_height,
        type=int, help="image height to be used to train the model")
    parser.add_argument("--image_width", default=image_width,
        type=int, help="image width to be used to train the model")
    parser.add_argument("--dir_dataset", default=dir_dataset,
        type=str, help="full directory path to the dataset")
    parser.add_argument("--which_optimizer", default=which_optimizer,
        type=str, choices=["adadelta", "adam"], help="which optimizer to use to train")
    # only the two models handled in train_hw_recognizer are offered as choices
    parser.add_argument("--which_hw_model", default=which_hw_model,
        type=str, choices=["crnn", "stn_crnn"], help="which model to train")

    FLAGS, unparsed = parser.parse_known_args()
    train_hw_recognizer(FLAGS)
    return

if __name__ == "__main__":
    main()
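A typical training run with the defaults above would then be, e.g., `python train.py --which_hw_model crnn --which_optimizer adadelta --batch_size 64` — an illustrative invocation; `--dir_dataset` will normally need to be overridden to point at a local copy of the IAM data.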
iam_line_recognition/utils.py
ADDED
@@ -0,0 +1,103 @@
import sys
import torch
import fastwer
import numpy as np
from scipy.special import logsumexp


"""
-------------
CTC decoder
-------------
"""

NINF = -1 * float("inf")
DEFAULT_EMISSION_THRESHOLD = 0.01

def _reconstruct(labels, blank=0):
    new_labels = []
    # merge repeats of the same label
    previous = None
    for l in labels:
        if l != previous:
            new_labels.append(l)
            previous = l
    # delete blanks
    new_labels = [l for l in new_labels if l != blank]
    return new_labels
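For example, with the default blank label 0, repeats collapse first and blanks are stripped afterwards:

assert _reconstruct([1, 1, 0, 0, 2, 2, 0, 2]) == [1, 2, 2]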

def beam_search_decode(emission_log_prob, blank=0, **kwargs):
    beam_size = kwargs["beam_size"]
    emission_threshold = kwargs.get("emission_threshold", np.log(DEFAULT_EMISSION_THRESHOLD))

    length, class_count = emission_log_prob.shape

    beams = [([], 0)]  # (prefix, accumulated_log_prob)
    for t in range(length):
        new_beams = []
        for prefix, accumulated_log_prob in beams:
            for c in range(class_count):
                log_prob = emission_log_prob[t, c]
                # prune classes whose emission probability falls below the threshold
                if log_prob < emission_threshold:
                    continue
                new_prefix = prefix + [c]
                # log(p1 * p2) = log_p1 + log_p2
                new_accu_log_prob = accumulated_log_prob + log_prob
                new_beams.append((new_prefix, new_accu_log_prob))

        # keep only the beam_size best prefixes, sorted by accumulated_log_prob
        new_beams.sort(key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_size]

    # sum up beams that reconstruct to the same label sequence
    total_accu_log_prob = {}
    for prefix, accu_log_prob in beams:
        labels = tuple(_reconstruct(prefix, blank))
        # log(p1 + p2) = logsumexp([log_p1, log_p2])
        total_accu_log_prob[labels] = \
            logsumexp([accu_log_prob, total_accu_log_prob.get(labels, NINF)])

    labels_beams = [(list(labels), accu_log_prob)
                    for labels, accu_log_prob in total_accu_log_prob.items()]
    labels_beams.sort(key=lambda x: x[1], reverse=True)
    labels = labels_beams[0][0]

    return labels

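A toy call, to show the expected input layout (rows are time steps, columns are per-class probabilities, class 0 is the blank; the numbers are made up):

import numpy as np

toy_emissions = np.log(np.array([
    [0.6, 0.3, 0.1],
    [0.5, 0.4, 0.1],
    [0.2, 0.1, 0.7],
]))
print(beam_search_decode(toy_emissions, blank=0, beam_size=5))  # -> [1, 2]

Note how the summing step matters here: the single most probable path reconstructs to [2], but the paths that collapse to [1, 2] have a larger combined probability.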
def greedy_decode(emission_log_prob, blank=0):
    labels = np.argmax(emission_log_prob, axis=-1)
    labels = _reconstruct(labels, blank=blank)
    return labels

def ctc_decode(log_probs, which_ctc_decoder="beam_search", label_2_char=None, blank=0, beam_size=25):
    emission_log_probs = np.transpose(log_probs.cpu().numpy(), (1, 0, 2))
    # size of emission_log_probs: (batch, length, class)

    decoded_list = []
    for emission_log_prob in emission_log_probs:
        if which_ctc_decoder == "beam_search":
            decoded = beam_search_decode(emission_log_prob, blank=blank, beam_size=beam_size)
        elif which_ctc_decoder == "greedy":
            decoded = greedy_decode(emission_log_prob, blank=blank)
        else:
            print(f"unidentified option for which_ctc_decoder: {which_ctc_decoder}")
            sys.exit(0)

        if label_2_char:
            decoded = [label_2_char[l] for l in decoded]
        decoded_list.append(decoded)
    return decoded_list

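A usage sketch, assuming `HWRecogIAMDataset` is imported from `dataset` and `log_probs` is a time-major `(T, N, C)` log-probability tensor produced by the model:

from dataset import HWRecogIAMDataset

# decode a batch of network outputs into character lists, then join into strings
char_lists = ctc_decode(log_probs, which_ctc_decoder="greedy",
                        label_2_char=HWRecogIAMDataset.LABEL_2_CHAR)
texts = ["".join(chars) for chars in char_lists]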
"""
--------------------
Evaluation Metrics
--------------------
"""
def compute_wer_and_cer_for_batch(batch_preds, batch_gts):
    cer_batch = fastwer.score(batch_preds, batch_gts, char_level=True)
    wer_batch = fastwer.score(batch_preds, batch_gts)
    return cer_batch, wer_batch

def compute_wer_and_cer_for_sample(str_pred, str_gt):
    cer_sample = fastwer.score_sent(str_pred, str_gt, char_level=True)
    wer_sample = fastwer.score_sent(str_pred, str_gt)
    return cer_sample, wer_sample
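fastwer reports both metrics as percentages, so a quick sanity check of the sample-level helper looks like this (expected values worked out by hand):

cer, wer = compute_wer_and_cer_for_sample("helo", "hello")
# cer == 20.0  (one character edit over five reference characters)
# wer == 100.0 (the single reference word is wrong)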
iam_line_recognition/utils_unique_chars.py
ADDED
@@ -0,0 +1,43 @@
import argparse
import numpy as np

from dataset import read_IAM_label_txt_file

def list_unique_characters_in_IAM_dataset(FLAGS):
    _, all_labels = read_IAM_label_txt_file(FLAGS.file_txt_labels)

    num_labels = len(all_labels)
    print(f"num labels: {num_labels}")
    unique_chars = []

    for label in all_labels:
        unique_chars = unique_chars + list(np.unique(np.array(list(label))))

    unique_chars = sorted(unique_chars)
    unique_chars = np.array(unique_chars)
    unique_chars = np.unique(unique_chars)
    unique_chars = ''.join(unique_chars)

    # print all unique chars in the IAM dataset
    print(unique_chars)

    # print the number of unique chars in the IAM dataset
    print(f"number of unique characters: {len(unique_chars)}")
    return

def main():
    file_txt_labels = "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/iam_lines_gt.txt"

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--file_txt_labels", default=file_txt_labels,
        type=str, help="full path to label text file")

    FLAGS, unparsed = parser.parse_known_args()
    list_unique_characters_in_IAM_dataset(FLAGS)
    return

if __name__ == "__main__":
    main()
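The same character scan can be written without numpy; a compact equivalent (illustrative, operating on the same `all_labels` list):

unique_chars = "".join(sorted(set("".join(all_labels))))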