abhishekrs4 committed on
Commit 44066b7
1 Parent(s): bd1dc81

code formatting

app.py CHANGED
@@ -26,14 +26,10 @@ hw_recog_model = CRNN(num_classes, image_height)
 
try:
    logging.info(f"loading model from {file_model_local}")
-    hw_recog_model.load_state_dict(
-        torch.load(file_model_local, map_location=device)
-    )
+    hw_recog_model.load_state_dict(torch.load(file_model_local, map_location=device))
except:
    logging.info(f"loading model from {file_model_cont}")
-    hw_recog_model.load_state_dict(
-        torch.load(file_model_cont, map_location=device)
-    )
+    hw_recog_model.load_state_dict(torch.load(file_model_cont, map_location=device))
hw_recog_model.to(device)
hw_recog_model.eval()
 
@@ -51,6 +47,7 @@ def predict_hw(img_test: np.ndarray) -> str:
    str_pred = "".join(str_pred)
    return str_pred
 
+
@app.route("/predict", methods=["POST"])
def predict() -> Response:
    logging.info("IAM Handwriting recognition app")
@@ -62,7 +59,7 @@ def predict() -> Response:
    img_dec = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    img_dec = cv2.cvtColor(img_dec, cv2.COLOR_BGR2RGB)
 
-    img_dec = cv2.resize(img_dec, (768, 32), interpolation = cv2.INTER_LINEAR)
+    img_dec = cv2.resize(img_dec, (768, 32), interpolation=cv2.INTER_LINEAR)
 
    str_pred = predict_hw(img_dec)
 
@@ -77,5 +74,6 @@ def predict() -> Response:
        json_pred = jsonify({"error": str(e)})
        return json_pred
 
+
if __name__ == "__main__":
    app.run(host="0.0.0.0", debug=True, port=7860)
iam_line_recognition/dataset.py CHANGED
@@ -8,6 +8,7 @@ import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
 
+
def read_IAM_label_txt_file(file_txt_labels):
    """
    ---------
@@ -42,15 +43,25 @@ def read_IAM_label_txt_file(file_txt_labels):
 
    return all_image_files, all_labels
 
+
class HWRecogIAMDataset(Dataset):
    """
    Main dataset class to be used only for training, validation and internal testing
    """
-    CHAR_SET = ' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+
+    CHAR_SET = " !\"#&'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    CHAR_2_LABEL = {char: i + 1 for i, char in enumerate(CHAR_SET)}
    LABEL_2_CHAR = {label: char for char, label in CHAR_2_LABEL.items()}
 
-    def __init__(self, list_image_files, list_labels, dir_images, image_height=32, image_width=768, which_set="train"):
+    def __init__(
+        self,
+        list_image_files,
+        list_labels,
+        dir_images,
+        image_height=32,
+        image_width=768,
+        which_set="train",
+    ):
        """
        ---------
        Arguments
@@ -77,28 +88,41 @@ class HWRecogIAMDataset(Dataset):
 
        if self.which_set == "train":
            # apply data augmentation only for train set
-            self.transform = transforms.Compose([
-                transforms.ToPILImage(),
-                transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
-                transforms.RandomAffine(degrees=[-0.75, 0.75], translate=[0, 0.05], scale=[0.75, 1],
-                    shear=[-35, 35], interpolation=transforms.InterpolationMode.BILINEAR, fill=255,
-                ),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=[0.485, 0.456, 0.406],
-                    std=[0.229, 0.224, 0.225],
-                ),
-            ])
+            self.transform = transforms.Compose(
+                [
+                    transforms.ToPILImage(),
+                    transforms.Resize(
+                        (self.image_height, self.image_width), Image.BILINEAR
+                    ),
+                    transforms.RandomAffine(
+                        degrees=[-0.75, 0.75],
+                        translate=[0, 0.05],
+                        scale=[0.75, 1],
+                        shear=[-35, 35],
+                        interpolation=transforms.InterpolationMode.BILINEAR,
+                        fill=255,
+                    ),
+                    transforms.ToTensor(),
+                    transforms.Normalize(
+                        mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225],
+                    ),
+                ]
+            )
        else:
-            self.transform = transforms.Compose([
-                transforms.ToPILImage(),
-                transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=[0.485, 0.456, 0.406],
-                    std=[0.229, 0.224, 0.225],
-                ),
-            ])
+            self.transform = transforms.Compose(
+                [
+                    transforms.ToPILImage(),
+                    transforms.Resize(
+                        (self.image_height, self.image_width), Image.BILINEAR
+                    ),
+                    transforms.ToTensor(),
+                    transforms.Normalize(
+                        mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225],
+                    ),
+                ]
+            )
 
    def __len__(self):
        return len(self.list_image_files)
@@ -118,6 +142,7 @@ class HWRecogIAMDataset(Dataset):
 
        return image_3_channel, label_encoded, label_length
 
+
def IAM_collate_fn(batch):
    """
    collate function
@@ -145,6 +170,7 @@ def IAM_collate_fn(batch):
    label_lengths = torch.cat(label_lengths, 0)
    return images, labels, label_lengths
 
+
def split_dataset(file_txt_labels, for_train=True):
    """
    ---------
@@ -161,14 +187,28 @@ def split_dataset(file_txt_labels, for_train=True):
        a tuple of files depending for train or internal testing
    """
    all_image_files, all_labels = read_IAM_label_txt_file(file_txt_labels)
-    train_image_files, test_image_files, train_labels, test_labels = train_test_split(all_image_files, all_labels, test_size=0.1, random_state=4)
-    train_image_files, valid_image_files, train_labels, valid_labels = train_test_split(train_image_files, train_labels, test_size=0.1, random_state=4)
+    train_image_files, test_image_files, train_labels, test_labels = train_test_split(
+        all_image_files, all_labels, test_size=0.1, random_state=4
+    )
+    train_image_files, valid_image_files, train_labels, valid_labels = train_test_split(
+        train_image_files, train_labels, test_size=0.1, random_state=4
+    )
    if for_train:
        return train_image_files, valid_image_files, train_labels, valid_labels
    else:
        return test_image_files, test_labels
 
-def get_dataloaders_for_training(train_x, train_y, valid_x, valid_y, dir_images, image_height=32, image_width=768, batch_size=8):
+
+def get_dataloaders_for_training(
+    train_x,
+    train_y,
+    valid_x,
+    valid_y,
+    dir_images,
+    image_height=32,
+    image_width=768,
+    batch_size=8,
+):
    """
    ---------
    Arguments
@@ -199,8 +239,22 @@ def get_dataloaders_for_training(train_x, train_y, valid_x, valid_y, dir_images,
    valid_loader : object
        object of validation set dataloader
    """
-    train_dataset = HWRecogIAMDataset(train_x, train_y, dir_images, image_height=image_height, image_width=image_width, which_set="train")
-    valid_dataset = HWRecogIAMDataset(valid_x, valid_y, dir_images, image_height=image_height, image_width=image_width, which_set="valid")
+    train_dataset = HWRecogIAMDataset(
+        train_x,
+        train_y,
+        dir_images,
+        image_height=image_height,
+        image_width=image_width,
+        which_set="train",
+    )
+    valid_dataset = HWRecogIAMDataset(
+        valid_x,
+        valid_y,
+        dir_images,
+        image_height=image_height,
+        image_width=image_width,
+        which_set="valid",
+    )
 
    train_loader = DataLoader(
        train_dataset,
@@ -218,7 +272,10 @@ def get_dataloaders_for_training(train_x, train_y, valid_x, valid_y, dir_images,
    )
    return train_loader, valid_loader
 
-def get_dataloader_for_testing(test_x, test_y, dir_images, image_height=32, image_width=768, batch_size=1):
+
+def get_dataloader_for_testing(
+    test_x, test_y, dir_images, image_height=32, image_width=768, batch_size=1
+):
    """
    ---------
    Arguments
@@ -242,7 +299,14 @@ def get_dataloader_for_testing(test_x, test_y, dir_images, image_height=32, imag
    test_loader : object
        object of test set dataloader
    """
-    test_dataset = HWRecogIAMDataset(test_x, test_y, dir_images=dir_images, image_height=image_height, image_width=image_width, which_set="test")
+    test_dataset = HWRecogIAMDataset(
+        test_x,
+        test_y,
+        dir_images=dir_images,
+        image_height=image_height,
+        image_width=image_width,
+        which_set="test",
+    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
iam_line_recognition/final_iam_line_recognizer.py CHANGED
@@ -21,6 +21,7 @@ class DatasetFinalEval(HWRecogIAMDataset):
    """
    Dataset class for final evaluation - inherits main dataset class
    """
+
    def __init__(self, dir_images, image_height=32, image_width=768):
        """
        ---------
@@ -34,18 +35,24 @@ class DatasetFinalEval(HWRecogIAMDataset):
            image width (default: 768)
        """
        self.dir_images = dir_images
-        self.image_files = [f for f in os.listdir(self.dir_images) if f.endswith(".png")]
+        self.image_files = [
+            f for f in os.listdir(self.dir_images) if f.endswith(".png")
+        ]
        self.image_width = image_width
        self.image_height = image_height
-        self.transform = transforms.Compose([
-            transforms.ToPILImage(),
-            transforms.Resize((self.image_height, self.image_width), Image.BILINEAR),
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=[0.485, 0.456, 0.406],
-                std=[0.229, 0.224, 0.225],
-            ),
-        ])
+        self.transform = transforms.Compose(
+            [
+                transforms.ToPILImage(),
+                transforms.Resize(
+                    (self.image_height, self.image_width), Image.BILINEAR
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406],
+                    std=[0.229, 0.224, 0.225],
+                ),
+            ]
+        )
 
    def __len__(self):
        return len(self.image_files)
@@ -57,7 +64,10 @@ class DatasetFinalEval(HWRecogIAMDataset):
        image_3_channel = self.transform(image_3_channel)
        return image_3_channel
 
-def get_dataloader_for_evaluation(dir_images, image_height=32, image_width=768, batch_size=1):
+
+def get_dataloader_for_evaluation(
+    dir_images, image_height=32, image_width=768, batch_size=1
+):
    """
    ---------
    Arguments
@@ -77,7 +87,9 @@ def get_dataloader_for_evaluation(dir_images, image_height=32, image_width=768,
    test_loader : object
        dataset loader object for final evaluation
    """
-    test_dataset = DatasetFinalEval(dir_images=dir_images, image_height=image_height, image_width=image_width)
+    test_dataset = DatasetFinalEval(
+        dir_images=dir_images, image_height=image_height, image_width=image_width
+    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
@@ -86,6 +98,7 @@ def get_dataloader_for_evaluation(dir_images, image_height=32, image_width=768,
    )
    return test_loader
 
+
def final_eval(hw_model, device, test_loader, dir_images, dir_results):
    """
    ---------
@@ -126,14 +139,22 @@ def final_eval(hw_model, device, test_loader, dir_images, dir_results):
        str_pred = [DatasetFinalEval.LABEL_2_CHAR[i] for i in pred_labels[0]]
        str_pred = "".join(str_pred)
 
-        with open(os.path.join(dir_results, file_test+".txt"), "w", encoding="utf-8", newline="\n") as fh_pred:
+        with open(
+            os.path.join(dir_results, file_test + ".txt"),
+            "w",
+            encoding="utf-8",
+            newline="\n",
+        ) as fh_pred:
            fh_pred.write(str_pred)
 
-        print(f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}")
+        print(
+            f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}"
+        )
        print(f"{str_pred}\n")
    print(f"predictions saved in directory: ./{dir_results}\n")
    return
 
+
def test_hw_recognizer(FLAGS):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
 
@@ -163,15 +184,20 @@ def test_hw_recognizer(FLAGS):
 
    # get test set dataloader
    test_loader = get_dataloader_for_evaluation(
-        dir_images=FLAGS.dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
+        dir_images=FLAGS.dir_images,
+        image_height=FLAGS.image_height,
+        image_width=FLAGS.image_width,
    )
 
    # start the evaluation on the final test set
-    print(f"final evaluation of handwriting recognition model {FLAGS.which_hw_model} started\n")
+    print(
+        f"final evaluation of handwriting recognition model {FLAGS.which_hw_model} started\n"
+    )
    final_eval(hw_model, device, test_loader, FLAGS.dir_images, dir_results)
    print(f"final evaluation of handwriting recognition model completed!!!!")
    return
 
+
def main():
    image_height = 32
    image_width = 768
@@ -184,22 +210,49 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
 
-    parser.add_argument("--image_height", default=image_height,
-        type=int, help="image height to be used to predict with the model")
-    parser.add_argument("--image_width", default=image_width,
-        type=int, help="image width to be used to predict with the model")
-    parser.add_argument("--dir_images", default=dir_images,
-        type=str, help="full directory path to directory containing images")
-    parser.add_argument("--which_hw_model", default=which_hw_model,
-        type=str, choices=["crnn", "stn_crnn"], help="which model to be used for prediction")
-    parser.add_argument("--file_model", default=file_model,
-        type=str, help="full path to trained model file (.pth)")
-    parser.add_argument("--save_predictions", default=save_predictions,
-        type=int, choices=[0, 1], help="save or do not save the predictions (1 - save, 0 - do not save)")
+    parser.add_argument(
+        "--image_height",
+        default=image_height,
+        type=int,
+        help="image height to be used to predict with the model",
+    )
+    parser.add_argument(
+        "--image_width",
+        default=image_width,
+        type=int,
+        help="image width to be used to predict with the model",
+    )
+    parser.add_argument(
+        "--dir_images",
+        default=dir_images,
+        type=str,
+        help="full directory path to directory containing images",
+    )
+    parser.add_argument(
+        "--which_hw_model",
+        default=which_hw_model,
+        type=str,
+        choices=["crnn", "stn_crnn"],
+        help="which model to be used for prediction",
+    )
+    parser.add_argument(
+        "--file_model",
+        default=file_model,
+        type=str,
+        help="full path to trained model file (.pth)",
+    )
+    parser.add_argument(
+        "--save_predictions",
+        default=save_predictions,
+        type=int,
+        choices=[0, 1],
+        help="save or do not save the predictions (1 - save, 0 - do not save)",
+    )
 
    FLAGS, unparsed = parser.parse_known_args()
    test_hw_recognizer(FLAGS)
    return
 
+
if __name__ == "__main__":
    main()
iam_line_recognition/logger_utils.py CHANGED
@@ -1,6 +1,7 @@
import csv
import json
 
+
def write_json_file(file_json, dict_data):
    """
    ---------
@@ -15,10 +16,12 @@ def write_json_file(file_json, dict_data):
        fh.write(json.dumps(dict_data, indent=4))
    return
 
+
class CSVWriter:
    """
    for writing tabular data to a csv file
    """
+
    def __init__(self, file_name, column_names):
        """
        ---------
iam_line_recognition/model_main.py CHANGED
@@ -4,11 +4,20 @@ import torch.nn.functional as F
 
from model_visual_features import ResNetFeatureExtractor, TPS_SpatialTransformerNetwork
 
+
class HW_RNN_Seq2Seq(nn.Module):
    """
    Visual Seq2Seq model using BiLSTM
    """
-    def __init__(self, num_classes, image_height, cnn_output_channels=512, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
+
+    def __init__(
+        self,
+        num_classes,
+        image_height,
+        cnn_output_channels=512,
+        num_feats_mapped_seq_hidden=128,
+        num_feats_seq_hidden=256,
+    ):
        """
        ---------
        Arguments
@@ -28,10 +37,16 @@ class HW_RNN_Seq2Seq(nn.Module):
        self.output_height = image_height // 32
 
        self.dropout = nn.Dropout(p=0.25)
-        self.map_visual_to_seq = nn.Linear(cnn_output_channels * self.output_height, num_feats_mapped_seq_hidden)
+        self.map_visual_to_seq = nn.Linear(
+            cnn_output_channels * self.output_height, num_feats_mapped_seq_hidden
+        )
 
-        self.b_lstm_1 = nn.LSTM(num_feats_mapped_seq_hidden, num_feats_seq_hidden, bidirectional=True)
-        self.b_lstm_2 = nn.LSTM(2 * num_feats_seq_hidden, num_feats_seq_hidden, bidirectional=True)
+        self.b_lstm_1 = nn.LSTM(
+            num_feats_mapped_seq_hidden, num_feats_seq_hidden, bidirectional=True
+        )
+        self.b_lstm_2 = nn.LSTM(
+            2 * num_feats_seq_hidden, num_feats_seq_hidden, bidirectional=True
+        )
 
        self.final_dense = nn.Linear(2 * num_feats_seq_hidden, num_classes)
 
@@ -40,7 +55,9 @@ class HW_RNN_Seq2Seq(nn.Module):
        # WBCH
        # the sequence is along the width of the image as a sentence
 
-        visual_feats = visual_feats.contiguous().view(visual_feats.shape[0], visual_feats.shape[1], -1)
+        visual_feats = visual_feats.contiguous().view(
+            visual_feats.shape[0], visual_feats.shape[1], -1
+        )
        # WBC
 
        seq = self.map_visual_to_seq(visual_feats)
@@ -63,7 +80,14 @@ class CRNN(nn.Module):
    CNN - Modified ResNet34 for visual features
    RNN - BiLSTM for seq2seq modeling
    """
-    def __init__(self, num_classes, image_height, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
+
+    def __init__(
+        self,
+        num_classes,
+        image_height,
+        num_feats_mapped_seq_hidden=128,
+        num_feats_seq_hidden=256,
+    ):
        """
        ---------
        Arguments
@@ -79,7 +103,13 @@ class CRNN(nn.Module):
        """
        super().__init__()
        self.visual_feature_extractor = ResNetFeatureExtractor()
-        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(num_classes, image_height, self.visual_feature_extractor.output_channels, num_feats_mapped_seq_hidden, num_feats_seq_hidden)
+        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(
+            num_classes,
+            image_height,
+            self.visual_feature_extractor.output_channels,
+            num_feats_mapped_seq_hidden,
+            num_feats_seq_hidden,
+        )
 
    def forward(self, x):
        visual_feats = self.visual_feature_extractor(x)
@@ -96,7 +126,15 @@ class STN_CRNN(nn.Module):
    CNN - Modified ResNet34 for visual features
    RNN - BiLSTM for seq2seq modeling
    """
-    def __init__(self, num_classes, image_height, image_width, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
+
+    def __init__(
+        self,
+        num_classes,
+        image_height,
+        image_width,
+        num_feats_mapped_seq_hidden=128,
+        num_feats_seq_hidden=256,
+    ):
        """
        ---------
        Arguments
@@ -120,7 +158,13 @@ class STN_CRNN(nn.Module):
            I_channel_num=3,
        )
        self.visual_feature_extractor = ResNetFeatureExtractor()
-        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(num_classes, image_height, self.visual_feature_extractor.output_channels, num_feats_mapped_seq_hidden, num_feats_seq_hidden)
+        self.rnn_seq2seq_module = HW_RNN_Seq2Seq(
+            num_classes,
+            image_height,
+            self.visual_feature_extractor.output_channels,
+            num_feats_mapped_seq_hidden,
+            num_feats_seq_hidden,
+        )
 
    def forward(self, x):
        stn_output = self.stn(x)
@@ -128,6 +172,7 @@ class STN_CRNN(nn.Module):
        log_probs = self.rnn_seq2seq_module(visual_feats)
        return log_probs
 
+
"""
class STN_PP_CRNN(nn.Module):
    def __init__(self, num_classes, image_height, image_width, num_feats_mapped_seq_hidden=128, num_feats_seq_hidden=256):
iam_line_recognition/model_visual_features.py CHANGED
@@ -4,10 +4,17 @@ import torch.nn as nn
from typing import List
from torch import Tensor
import torch.nn.functional as F
-from torchvision.models.resnet import BasicBlock, model_urls, load_state_dict_from_url, conv1x1, conv3x3
+from torchvision.models.resnet import (
+    BasicBlock,
+    model_urls,
+    load_state_dict_from_url,
+    conv1x1,
+    conv3x3,
+)
 
device = torch.device("cuda")
 
+
class CustomResNet(nn.Module):
    def __init__(
        self,
@@ -43,14 +50,22 @@ class CustomResNet(nn.Module):
        self.groups = groups
        self.base_width = width_per_group
 
-        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.conv1 = nn.Conv2d(
+            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
+        )
        self.bn1 = self._norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=(2, 1), padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
-        self.layer2 = self._make_layer(block, 128, layers[1], stride=(2, 1), dilate=replace_stride_with_dilation[0])
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=(2, 2), dilate=replace_stride_with_dilation[1])
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=(2, 1), dilate=replace_stride_with_dilation[2])
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], stride=(2, 1), dilate=replace_stride_with_dilation[0]
+        )
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], stride=(2, 2), dilate=replace_stride_with_dilation[1]
+        )
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], stride=(2, 1), dilate=replace_stride_with_dilation[2]
+        )
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
 
@@ -92,7 +107,14 @@ class CustomResNet(nn.Module):
        layers = []
        layers.append(
            block(
-                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
+                self.inplanes,
+                planes,
+                stride,
+                downsample,
+                self.groups,
+                self.base_width,
+                previous_dilation,
+                norm_layer,
            )
        )
        self.inplanes = planes * block.expansion
@@ -126,6 +148,7 @@ class CustomResNet(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)
 
+
def _resnet(layers: List[int], pretrained=True) -> CustomResNet:
    model = CustomResNet(layers)
 
@@ -134,6 +157,7 @@ def _resnet(layers: List[int], pretrained=True) -> CustomResNet:
 
    return model
 
+
def resnet34(*, pretrained=True) -> CustomResNet:
    """ResNet-34 from `Deep Residual Learning for Image Recognition <https://arxiv.org/pdf/1512.03385.pdf>`__.
    Args:
@@ -159,6 +183,7 @@ class ResNetFeatureExtractor(nn.Module):
    """
    Defines Base ResNet-34 feature extractor
    """
+
    def __init__(self, pretrained=True):
        """
        ---------
@@ -174,7 +199,7 @@ class ResNetFeatureExtractor(nn.Module):
    def forward(self, x):
        block1 = self.resnet34.conv1(x)
        block1 = self.resnet34.bn1(block1)
-        block1 = self.resnet34.relu(block1) # [64, H/2, W/2]
+        block1 = self.resnet34.relu(block1)  # [64, H/2, W/2]
 
        block2 = self.resnet34.maxpool(block1)
        block2 = self.resnet34.layer1(block2)  # [64, H/4, W/4]
@@ -190,10 +215,10 @@ class ResNetFeatureExtractor(nn.Module):
### STN - Spatial Transformer Network ###
#########################################
class TPS_SpatialTransformerNetwork(nn.Module):
-    """ Rectification Network of RARE, namely TPS based STN """
+    """Rectification Network of RARE, namely TPS based STN"""
 
    def __init__(self, num_fiducial_points, I_size, I_r_size, I_channel_num=1):
-        """ Based on RARE TPS
+        """Based on RARE TPS
        input:
            batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
            I_size : (height, width) of the input image I
@@ -207,39 +232,66 @@ class TPS_SpatialTransformerNetwork(nn.Module):
        self.I_size = I_size
        self.I_r_size = I_r_size  # = (I_r_height, I_r_width)
        self.I_channel_num = I_channel_num
-        self.LocalizationNetwork = LocalizationNetwork(self.num_fiducial_points, self.I_channel_num)
+        self.LocalizationNetwork = LocalizationNetwork(
+            self.num_fiducial_points, self.I_channel_num
+        )
        self.GridGenerator = GridGenerator(self.num_fiducial_points, self.I_r_size)
 
    def forward(self, batch_I):
        batch_C_prime = self.LocalizationNetwork(batch_I)  # batch_size x K x 2
-        build_P_prime = self.GridGenerator.build_P_prime(batch_C_prime) # batch_size x n (= I_r_width x I_r_height) x 2
-        build_P_prime_reshape = build_P_prime.reshape([build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2])
+        build_P_prime = self.GridGenerator.build_P_prime(
+            batch_C_prime
+        )  # batch_size x n (= I_r_width x I_r_height) x 2
+        build_P_prime_reshape = build_P_prime.reshape(
+            [build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2]
+        )
 
        if torch.__version__ > "1.2.0":
-            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode='border', align_corners=True)
+            batch_I_r = F.grid_sample(
+                batch_I,
+                build_P_prime_reshape,
+                padding_mode="border",
+                align_corners=True,
+            )
        else:
-            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode='border')
+            batch_I_r = F.grid_sample(
+                batch_I, build_P_prime_reshape, padding_mode="border"
+            )
 
        return batch_I_r


class LocalizationNetwork(nn.Module):
-    """ Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height) """
+    """Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height)"""
 
    def __init__(self, num_fiducial_points, I_channel_num):
        super(LocalizationNetwork, self).__init__()
        self.num_fiducial_points = num_fiducial_points
        self.I_channel_num = I_channel_num
        self.conv = nn.Sequential(
-            nn.Conv2d(in_channels=self.I_channel_num, out_channels=64, kernel_size=3, stride=1, padding=1,
-                bias=False), nn.BatchNorm2d(64), nn.ReLU(True),
+            nn.Conv2d(
+                in_channels=self.I_channel_num,
+                out_channels=64,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=False,
+            ),
+            nn.BatchNorm2d(64),
+            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 64 x I_height/2 x I_width/2
-            nn.Conv2d(64, 128, 3, 1, 1, bias=False), nn.BatchNorm2d(128), nn.ReLU(True),
+            nn.Conv2d(64, 128, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(128),
+            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 128 x I_height/4 x I_width/4
-            nn.Conv2d(128, 256, 3, 1, 1, bias=False), nn.BatchNorm2d(256), nn.ReLU(True),
+            nn.Conv2d(128, 256, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # batch_size x 256 x I_height/8 x I_width/8
-            nn.Conv2d(256, 512, 3, 1, 1, bias=False), nn.BatchNorm2d(512), nn.ReLU(True),
-            nn.AdaptiveAvgPool2d(1) # batch_size x 512
+            nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(512),
+            nn.ReLU(True),
+            nn.AdaptiveAvgPool2d(1),  # batch_size x 512
        )
 
        self.localization_fc1 = nn.Sequential(nn.Linear(512, 256), nn.ReLU(True))
@@ -254,7 +306,9 @@ class LocalizationNetwork(nn.Module):
        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
-        self.localization_fc2.bias.data = torch.from_numpy(initial_bias).float().view(-1)
+        self.localization_fc2.bias.data = (
+            torch.from_numpy(initial_bias).float().view(-1)
+        )
 
    def forward(self, batch_I):
        """
@@ -263,15 +317,17 @@ class LocalizationNetwork(nn.Module):
        """
        batch_size = batch_I.size(0)
        features = self.conv(batch_I).view(batch_size, -1)
-        batch_C_prime = self.localization_fc2(self.localization_fc1(features)).view(batch_size, self.num_fiducial_points, 2)
+        batch_C_prime = self.localization_fc2(self.localization_fc1(features)).view(
+            batch_size, self.num_fiducial_points, 2
+        )
        return batch_C_prime


class GridGenerator(nn.Module):
-    """ Grid Generator of RARE, which produces P_prime by multipling T with P """
+    """Grid Generator of RARE, which produces P_prime by multipling T with P"""
 
    def __init__(self, num_fiducial_points, I_r_size):
-        """ Generate P_hat and inv_delta_C for later """
+        """Generate P_hat and inv_delta_C for later"""
        super(GridGenerator, self).__init__()
        self.eps = 1e-6
        self.I_r_height, self.I_r_width = I_r_size
@@ -279,14 +335,24 @@ class GridGenerator(nn.Module):
        self.C = self._build_C(self.num_fiducial_points)  # F x 2
        self.P = self._build_P(self.I_r_width, self.I_r_height)
        ## for multi-gpu, you need register buffer
-        self.register_buffer("inv_delta_C", torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float()) # F+3 x F+3
-        self.register_buffer("P_hat", torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float()) # n x F+3
+        self.register_buffer(
+            "inv_delta_C",
+            torch.tensor(
+                self._build_inv_delta_C(self.num_fiducial_points, self.C)
+            ).float(),
+        )  # F+3 x F+3
+        self.register_buffer(
+            "P_hat",
+            torch.tensor(
+                self._build_P_hat(self.num_fiducial_points, self.C, self.P)
+            ).float(),
+        )  # n x F+3
        ## for fine-tuning with different image width, you may use below instead of self.register_buffer
-        #self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float().cuda() # F+3 x F+3
-        #self.P_hat = torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float().cuda() # n x F+3
+        # self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float().cuda() # F+3 x F+3
+        # self.P_hat = torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float().cuda() # n x F+3
 
    def _build_C(self, F):
-        """ Return coordinates of fiducial points in I_r; C """
+        """Return coordinates of fiducial points in I_r; C"""
        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
        ctrl_pts_y_top = -1 * np.ones(int(F / 2))
        ctrl_pts_y_bottom = np.ones(int(F / 2))
@@ -296,7 +362,7 @@ class GridGenerator(nn.Module):
        return C  # F x 2
 
    def _build_inv_delta_C(self, F, C):
-        """ Return inv_delta_C which is needed to calculate T """
+        """Return inv_delta_C which is needed to calculate T"""
        hat_C = np.zeros((F, F), dtype=float)  # F x F
        for i in range(0, F):
            for j in range(i, F):
@@ -304,31 +370,36 @@ class GridGenerator(nn.Module):
                hat_C[i, j] = r
                hat_C[j, i] = r
        np.fill_diagonal(hat_C, 1)
-        hat_C = (hat_C ** 2) * np.log(hat_C)
+        hat_C = (hat_C**2) * np.log(hat_C)
        # print(C.shape, hat_C.shape)
        delta_C = np.concatenate(  # F+3 x F+3
            [
                np.concatenate([np.ones((F, 1)), C, hat_C], axis=1),  # F x F+3
                np.concatenate([np.zeros((2, 3)), np.transpose(C)], axis=1),  # 2 x F+3
-                np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1) # 1 x F+3
+                np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1),  # 1 x F+3
            ],
-            axis=0
+            axis=0,
        )
        inv_delta_C = np.linalg.inv(delta_C)
        return inv_delta_C  # F+3 x F+3
 
    def _build_P(self, I_r_width, I_r_height):
-        I_r_grid_x = (np.arange(-I_r_width, I_r_width, 2) + 1.0) / I_r_width # self.I_r_width
-        I_r_grid_y = (np.arange(-I_r_height, I_r_height, 2) + 1.0) / I_r_height # self.I_r_height
+        I_r_grid_x = (
+            np.arange(-I_r_width, I_r_width, 2) + 1.0
+        ) / I_r_width  # self.I_r_width
+        I_r_grid_y = (
+            np.arange(-I_r_height, I_r_height, 2) + 1.0
+        ) / I_r_height  # self.I_r_height
        P = np.stack(  # self.I_r_width x self.I_r_height x 2
-            np.meshgrid(I_r_grid_x, I_r_grid_y),
-            axis=2
+            np.meshgrid(I_r_grid_x, I_r_grid_y), axis=2
        )
        return P.reshape([-1, 2])  # n (= self.I_r_width x self.I_r_height) x 2
 
    def _build_P_hat(self, F, C, P):
        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
-        P_tile = np.tile(np.expand_dims(P, axis=1), (1, F, 1)) # n x 2 -> n x 1 x 2 -> n x F x 2
+        P_tile = np.tile(
+            np.expand_dims(P, axis=1), (1, F, 1)
+        )  # n x 2 -> n x 1 x 2 -> n x F x 2
        C_tile = np.expand_dims(C, axis=0)  # 1 x F x 2
        P_diff = P_tile - C_tile  # n x F x 2
        rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False)  # n x F
@@ -337,13 +408,16 @@ class GridGenerator(nn.Module):
        return P_hat  # n x F+3
 
    def build_P_prime(self, batch_C_prime):
-        """ Generate Grid from batch_C_prime [batch_size x F x 2] """
+        """Generate Grid from batch_C_prime [batch_size x F x 2]"""
        batch_size = batch_C_prime.size(0)
        batch_inv_delta_C = self.inv_delta_C.repeat(batch_size, 1, 1)
        batch_P_hat = self.P_hat.repeat(batch_size, 1, 1)
-        batch_C_prime_with_zeros = torch.cat((batch_C_prime, torch.zeros(
-            batch_size, 3, 2).float().to(device)), dim=1) # batch_size x F+3 x 2
-        batch_T = torch.bmm(batch_inv_delta_C, batch_C_prime_with_zeros) # batch_size x F+3 x 2
+        batch_C_prime_with_zeros = torch.cat(
+            (batch_C_prime, torch.zeros(batch_size, 3, 2).float().to(device)), dim=1
+        )  # batch_size x F+3 x 2
+        batch_T = torch.bmm(
+            batch_inv_delta_C, batch_C_prime_with_zeros
+        )  # batch_size x F+3 x 2
        batch_P_prime = torch.bmm(batch_P_hat, batch_T)  # batch_size x n x 2
        return batch_P_prime  # batch_size x n x 2
 
iam_line_recognition/test_internal.py CHANGED
@@ -14,7 +14,14 @@ from utils import ctc_decode, compute_wer_and_cer_for_sample
from dataset import HWRecogIAMDataset, split_dataset, get_dataloader_for_testing


-def test(hw_model, test_loader, device, list_test_files, which_ctc_decoder="beam_search", save_prediction_stats=False):
+def test(
+    hw_model,
+    test_loader,
+    device,
+    list_test_files,
+    which_ctc_decoder="beam_search",
+    save_prediction_stats=False,
+):
    """
    ---------
    Arguments
@@ -42,7 +49,7 @@ def test(hw_model, test_loader, device, list_test_files, which_ctc_decoder="beam
    if save_prediction_stats:
        csv_writer = CSVWriter(
            file_name="pred_stats.csv",
-            column_names=["file_name", "num_chars", "num_words", "cer", "wer"]
+            column_names=["file_name", "num_chars", "num_words", "cer", "wer"],
        )
 
    with torch.no_grad():
@@ -62,19 +69,23 @@ def test(hw_model, test_loader, device, list_test_files, which_ctc_decoder="beam
            list_test_cers.append(cer_sample)
            list_test_wers.append(wer_sample)
 
-            print(f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}")
+            print(
+                f"progress: {count}/{num_test_samples}, test file: {list_test_files[count-1]}"
+            )
            print(f"{str_label} - label")
            print(f"{str_pred} - prediction")
            print(f"cer: {cer_sample:.3f}, wer: {wer_sample:.3f}\n")
 
            if save_prediction_stats:
-                csv_writer.write_row([
-                    list_test_files[count-1],
-                    len(str_label),
-                    len(str_label.split(" ")),
-                    cer_sample,
-                    wer_sample,
-                ])
+                csv_writer.write_row(
+                    [
+                        list_test_files[count - 1],
+                        len(str_label),
+                        len(str_label.split(" ")),
+                        cer_sample,
+                        wer_sample,
+                    ]
+                )
    list_test_cers = np.array(list_test_cers)
    list_test_wers = np.array(list_test_wers)
    mean_test_cer = np.mean(list_test_cers)
@@ -85,6 +96,7 @@ def test(hw_model, test_loader, device, list_test_files, which_ctc_decoder="beam
        csv_writer.close()
    return
 
+
def test_hw_recognizer(FLAGS):
    file_txt_labels = os.path.join(FLAGS.dir_dataset, "iam_lines_gt.txt")
    dir_images = os.path.join(FLAGS.dir_dataset, "img")
@@ -101,8 +113,11 @@ def test_hw_recognizer(FLAGS):
    num_test_samples = len(test_x)
    # get the internal test set dataloader
    test_loader = get_dataloader_for_testing(
-        test_x, test_y,
-        dir_images=dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
+        test_x,
+        test_y,
+        dir_images=dir_images,
+        image_height=FLAGS.image_height,
+        image_width=FLAGS.image_width,
    )
 
    num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1
@@ -124,10 +139,18 @@ def test_hw_recognizer(FLAGS):
 
    # start testing of the model on the internal set
    print(f"testing of handwriting recognition model {FLAGS.which_hw_model} started\n")
-    test(hw_model, test_loader, device, test_x, FLAGS.which_ctc_decoder, bool(FLAGS.save_prediction_stats))
+    test(
+        hw_model,
+        test_loader,
+        device,
+        test_x,
+        FLAGS.which_ctc_decoder,
+        bool(FLAGS.save_prediction_stats),
+    )
    print(f"testing handwriting recognition model completed!!!!")
    return
 
+
def main():
    image_height = 32
    image_width = 768
@@ -141,24 +164,56 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
 
-    parser.add_argument("--image_height", default=image_height,
-        type=int, help="image height to be used to predict with the model")
-    parser.add_argument("--image_width", default=image_width,
-        type=int, help="image width to be used to predict with the model")
-    parser.add_argument("--dir_dataset", default=dir_dataset,
-        type=str, help="full directory path to the dataset")
-    parser.add_argument("--which_hw_model", default=which_hw_model,
-        type=str, choices=["crnn", "stn_crnn"], help="which model to be used for prediction")
-    parser.add_argument("--which_ctc_decoder", default=which_ctc_decoder,
-        type=str, choices=["beam_search", "greedy"], help="which ctc decoder to use")
-    parser.add_argument("--file_model", default=file_model,
-        type=str, help="full path to trained model file (.pth)")
-    parser.add_argument("--save_prediction_stats", default=save_prediction_stats,
-        type=int, choices=[0, 1], help="save prediction stats (1 - yes, 0 - no)")
+    parser.add_argument(
+        "--image_height",
+        default=image_height,
+        type=int,
+        help="image height to be used to predict with the model",
+    )
+    parser.add_argument(
+        "--image_width",
+        default=image_width,
+        type=int,
+        help="image width to be used to predict with the model",
+    )
+    parser.add_argument(
+        "--dir_dataset",
+        default=dir_dataset,
+        type=str,
+        help="full directory path to the dataset",
+    )
+    parser.add_argument(
+        "--which_hw_model",
+        default=which_hw_model,
+        type=str,
+        choices=["crnn", "stn_crnn"],
+        help="which model to be used for prediction",
+    )
+    parser.add_argument(
+        "--which_ctc_decoder",
+        default=which_ctc_decoder,
+        type=str,
+        choices=["beam_search", "greedy"],
+        help="which ctc decoder to use",
+    )
+    parser.add_argument(
+        "--file_model",
+        default=file_model,
+        type=str,
+        help="full path to trained model file (.pth)",
+    )
+    parser.add_argument(
+        "--save_prediction_stats",
+        default=save_prediction_stats,
+        type=int,
+        choices=[0, 1],
+        help="save prediction stats (1 - yes, 0 - no)",
+    )
 
    FLAGS, unparsed = parser.parse_known_args()
    test_hw_recognizer(FLAGS)
    return
 
+
if __name__ == "__main__":
    main()
iam_line_recognition/train.py CHANGED
@@ -55,12 +55,15 @@ def train(hw_model, optimizer, criterion, train_loader, device):
        loss = criterion(log_probs, labels, lengths_preds, lengths_labels)
        train_running_loss += loss.item()
        loss.backward()
-        torch.nn.utils.clip_grad_norm_(hw_model.parameters(), 5) # gradient clipping with 5
        optimizer.step()
 
    train_loss = train_running_loss / num_train_batches
    return train_loss
 
def validate(hw_model, criterion, valid_loader, device):
    """
    ---------
@@ -114,19 +117,28 @@ def validate(hw_model, criterion, valid_loader, device):
        final_labels_for_eval = []
        length_label_counter = 0
        for pred_label, length_label in zip(pred_labels, lengths_labels_for_eval):
-            label = labels_for_eval[length_label_counter:length_label_counter+length_label]
            length_label_counter += length_label
 
            final_labels_for_eval.append(label)
 
        for i in range(len(final_labels_for_eval)):
            if len(pred_labels[i]) != 0:
-                str_label = [HWRecogIAMDataset.LABEL_2_CHAR[i] for i in final_labels_for_eval[i]]
                str_label = "".join(str_label)
-                str_pred = [HWRecogIAMDataset.LABEL_2_CHAR[i] for i in pred_labels[i]]
                str_pred = "".join(str_pred)
 
-                cer_sample, wer_sample = compute_wer_and_cer_for_sample(str_pred, str_label)
            else:
                cer_sample, wer_sample = 100, 100
 
@@ -138,6 +150,7 @@ def validate(hw_model, criterion, valid_loader, device):
    valid_wer = valid_running_wer / num_valid_samples
    return valid_loss, valid_cer, valid_wer
 
def train_hw_recognizer(FLAGS):
    file_txt_labels = os.path.join(FLAGS.dir_dataset, "iam_lines_gt.txt")
    dir_images = os.path.join(FLAGS.dir_dataset, "img")
@@ -156,8 +169,13 @@ def train_hw_recognizer(FLAGS):
    num_valid_samples = len(valid_x)
    # get dataloaders for train and validation sets
    train_loader, valid_loader = get_dataloaders_for_training(
-        train_x, train_y, valid_x, valid_y,
-        dir_images=dir_images, image_height=FLAGS.image_height, image_width=FLAGS.image_width,
        batch_size=FLAGS.batch_size,
    )
 
@@ -171,7 +189,7 @@ def train_hw_recognizer(FLAGS):
    file_logger_train = os.path.join(dir_model, "train_metrics.csv")
    csv_writer = CSVWriter(
        file_name=file_logger_train,
-        column_names=["epoch", "loss_train", "loss_valid", "cer_valid", "wer_valid"]
    )
 
    file_params = os.path.join(dir_model, "params.json")
@@ -180,9 +198,15 @@ def train_hw_recognizer(FLAGS):
    num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1
    print(f"task - handwriting recognition")
    print(f"model: {FLAGS.which_hw_model}")
-    print(f"optimizer: {FLAGS.which_optimizer}, learning rate: {FLAGS.learning_rate:.6f}, weight decay: {FLAGS.weight_decay:.8f}")
-    print(f"batch size: {FLAGS.batch_size}, image height: {FLAGS.image_height}, image width: {FLAGS.image_width}")
-    print(f"num train samples: {num_train_samples}, num validation samples: {num_valid_samples}\n")
 
    # load the right model
    if FLAGS.which_hw_model == "crnn":
@@ -196,9 +220,19 @@ def train_hw_recognizer(FLAGS):
 
    # load the right optimizer based on user option
    if FLAGS.which_optimizer == "adam":
-        optimizer = torch.optim.Adam(hw_model.parameters(), lr=FLAGS.learning_rate, weight_decay=FLAGS.weight_decay)
    elif FLAGS.which_optimizer == "adadelta":
-        optimizer = torch.optim.Adadelta(hw_model.parameters(), lr=FLAGS.learning_rate, rho=0.95, eps=1e-8, weight_decay=FLAGS.weight_decay)
    else:
        print(f"unidentified option: {FLAGS.which_optimizer}")
        sys.exit(0)
@@ -207,13 +241,19 @@ def train_hw_recognizer(FLAGS):
 
    # start training the model
    print(f"training of handwriting recognition model {FLAGS.which_hw_model} started\n")
-    for epoch in range(1, FLAGS.num_epochs+1):
        time_start = time.time()
        train_loss = train(hw_model, optimizer, criterion, train_loader, device)
-        valid_loss, valid_cer, valid_wer = validate(hw_model, criterion, valid_loader, device)
        time_end = time.time()
-        print(f"epoch: {epoch}/{FLAGS.num_epochs}, time: {time_end-time_start:.3f} sec.")
-        print(f"train loss: {train_loss:.6f}, validation loss: {valid_loss:.6f}, validation cer: {valid_cer:.4f}, validation wer: {valid_wer:.4f}\n")
 
        csv_writer.write_row(
            [
@@ -224,12 +264,21 @@ def train_hw_recognizer(FLAGS):
                round(valid_wer, 4),
            ]
        )
-        torch.save(hw_model.state_dict(), os.path.join(dir_model, f"{FLAGS.which_hw_model}_H_{FLAGS.image_height}_W_{FLAGS.image_width}_E_{epoch}.pth"))
-    print(f"Training of handwriting recognition model {FLAGS.which_hw_model} complete!!!!")
    # close the csv file
    csv_writer.close()
    return
 
def main():
    learning_rate = 1
    # 3e-4 for Adam, 1 for Adadelta
@@ -248,28 +297,67 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
 
-    parser.add_argument("--learning_rate", default=learning_rate,
-        type=float, help="learning rate to use for training")
-    parser.add_argument("--weight_decay", default=weight_decay,
-        type=float, help="weight decay to use for training")
-    parser.add_argument("--batch_size", default=batch_size,
-        type=int, help="batch size to use for training")
-    parser.add_argument("--num_epochs", default=num_epochs,
-        type=int, help="num epochs to train the model")
-    parser.add_argument("--image_height", default=image_height,
-        type=int, help="image height to be used to train the model")
-    parser.add_argument("--image_width", default=image_width,
-        type=int, help="image width to be used to train the model")
-    parser.add_argument("--dir_dataset", default=dir_dataset,
-        type=str, help="full directory path to the dataset")
-    parser.add_argument("--which_optimizer", default=which_optimizer,
-        type=str, choices=["adadelta", "adam"], help="which optimizer to use to train")
-    parser.add_argument("--which_hw_model", default=which_hw_model,
-        type=str, choices=["crnn", "stn_crnn", "stn_pp_crnn"], help="which model to train")
 
    FLAGS, unparsed = parser.parse_known_args()
    train_hw_recognizer(FLAGS)
    return
 
if __name__ == "__main__":
    main()
 
55
  loss = criterion(log_probs, labels, lengths_preds, lengths_labels)
56
  train_running_loss += loss.item()
57
  loss.backward()
58
+ torch.nn.utils.clip_grad_norm_(
59
+ hw_model.parameters(), 5
60
+ ) # gradient clipping with 5
61
  optimizer.step()
62
 
63
  train_loss = train_running_loss / num_train_batches
64
  return train_loss
65
 
66
+
67
  def validate(hw_model, criterion, valid_loader, device):
68
  """
69
  ---------
 
117
  final_labels_for_eval = []
118
  length_label_counter = 0
119
  for pred_label, length_label in zip(pred_labels, lengths_labels_for_eval):
120
+ label = labels_for_eval[
121
+ length_label_counter : length_label_counter + length_label
122
+ ]
123
  length_label_counter += length_label
124
 
125
  final_labels_for_eval.append(label)
126
 
127
  for i in range(len(final_labels_for_eval)):
128
  if len(pred_labels[i]) != 0:
129
+ str_label = [
130
+ HWRecogIAMDataset.LABEL_2_CHAR[i]
131
+ for i in final_labels_for_eval[i]
132
+ ]
133
  str_label = "".join(str_label)
134
+ str_pred = [
135
+ HWRecogIAMDataset.LABEL_2_CHAR[i] for i in pred_labels[i]
136
+ ]
137
  str_pred = "".join(str_pred)
138
 
139
+ cer_sample, wer_sample = compute_wer_and_cer_for_sample(
140
+ str_pred, str_label
141
+ )
142
  else:
143
  cer_sample, wer_sample = 100, 100
144
 
 
150
  valid_wer = valid_running_wer / num_valid_samples
151
  return valid_loss, valid_cer, valid_wer
152
 
153
+
154
  def train_hw_recognizer(FLAGS):
155
  file_txt_labels = os.path.join(FLAGS.dir_dataset, "iam_lines_gt.txt")
156
  dir_images = os.path.join(FLAGS.dir_dataset, "img")
 
169
  num_valid_samples = len(valid_x)
170
  # get dataloaders for train and validation sets
171
  train_loader, valid_loader = get_dataloaders_for_training(
172
+ train_x,
173
+ train_y,
174
+ valid_x,
175
+ valid_y,
176
+ dir_images=dir_images,
177
+ image_height=FLAGS.image_height,
178
+ image_width=FLAGS.image_width,
179
  batch_size=FLAGS.batch_size,
180
  )
181
 
 
189
  file_logger_train = os.path.join(dir_model, "train_metrics.csv")
190
  csv_writer = CSVWriter(
191
  file_name=file_logger_train,
192
+ column_names=["epoch", "loss_train", "loss_valid", "cer_valid", "wer_valid"],
193
  )
194
 
195
  file_params = os.path.join(dir_model, "params.json")
 
198
  num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1
199
  print(f"task - handwriting recognition")
200
  print(f"model: {FLAGS.which_hw_model}")
201
+ print(
202
+ f"optimizer: {FLAGS.which_optimizer}, learning rate: {FLAGS.learning_rate:.6f}, weight decay: {FLAGS.weight_decay:.8f}"
203
+ )
204
+ print(
205
+ f"batch size: {FLAGS.batch_size}, image height: {FLAGS.image_height}, image width: {FLAGS.image_width}"
206
+ )
207
+ print(
208
+ f"num train samples: {num_train_samples}, num validation samples: {num_valid_samples}\n"
209
+ )
210
 
211
  # load the right model
212
  if FLAGS.which_hw_model == "crnn":
 
220
 
221
  # load the right optimizer based on user option
222
  if FLAGS.which_optimizer == "adam":
223
+ optimizer = torch.optim.Adam(
224
+ hw_model.parameters(),
225
+ lr=FLAGS.learning_rate,
226
+ weight_decay=FLAGS.weight_decay,
227
+ )
228
  elif FLAGS.which_optimizer == "adadelta":
229
+ optimizer = torch.optim.Adadelta(
230
+ hw_model.parameters(),
231
+ lr=FLAGS.learning_rate,
232
+ rho=0.95,
233
+ eps=1e-8,
234
+ weight_decay=FLAGS.weight_decay,
235
+ )
236
  else:
237
  print(f"unidentified option: {FLAGS.which_optimizer}")
238
  sys.exit(0)
 
241
 
242
  # start training the model
243
  print(f"training of handwriting recognition model {FLAGS.which_hw_model} started\n")
244
+ for epoch in range(1, FLAGS.num_epochs + 1):
245
  time_start = time.time()
246
  train_loss = train(hw_model, optimizer, criterion, train_loader, device)
247
+ valid_loss, valid_cer, valid_wer = validate(
248
+ hw_model, criterion, valid_loader, device
249
+ )
250
  time_end = time.time()
251
+ print(
252
+ f"epoch: {epoch}/{FLAGS.num_epochs}, time: {time_end-time_start:.3f} sec."
253
+ )
254
+ print(
255
+ f"train loss: {train_loss:.6f}, validation loss: {valid_loss:.6f}, validation cer: {valid_cer:.4f}, validation wer: {valid_wer:.4f}\n"
256
+ )
257
 
258
  csv_writer.write_row(
259
  [
 
264
  round(valid_wer, 4),
265
  ]
266
  )
267
+ torch.save(
268
+ hw_model.state_dict(),
269
+ os.path.join(
270
+ dir_model,
271
+ f"{FLAGS.which_hw_model}_H_{FLAGS.image_height}_W_{FLAGS.image_width}_E_{epoch}.pth",
272
+ ),
273
+ )
274
+ print(
275
+ f"Training of handwriting recognition model {FLAGS.which_hw_model} complete!!!!"
276
+ )
277
  # close the csv file
278
  csv_writer.close()
279
  return
280
 
281
+
282
  def main():
283
  learning_rate = 1
284
  # 3e-4 for Adam, 1 for Adadelta
 
297
  formatter_class=argparse.ArgumentDefaultsHelpFormatter
298
  )
299
 
300
+ parser.add_argument(
301
+ "--learning_rate",
302
+ default=learning_rate,
303
+ type=float,
304
+ help="learning rate to use for training",
305
+ )
306
+ parser.add_argument(
307
+ "--weight_decay",
308
+ default=weight_decay,
309
+ type=float,
310
+ help="weight decay to use for training",
311
+ )
312
+ parser.add_argument(
313
+ "--batch_size",
314
+ default=batch_size,
315
+ type=int,
316
+ help="batch size to use for training",
317
+ )
318
+ parser.add_argument(
319
+ "--num_epochs",
320
+ default=num_epochs,
321
+ type=int,
322
+ help="num epochs to train the model",
323
+ )
324
+ parser.add_argument(
325
+ "--image_height",
326
+ default=image_height,
327
+ type=int,
328
+ help="image height to be used to train the model",
329
+ )
330
+ parser.add_argument(
331
+ "--image_width",
332
+ default=image_width,
333
+ type=int,
334
+ help="image width to be used to train the model",
335
+ )
336
+ parser.add_argument(
337
+ "--dir_dataset",
338
+ default=dir_dataset,
339
+ type=str,
340
+ help="full directory path to the dataset",
341
+ )
342
+ parser.add_argument(
343
+ "--which_optimizer",
344
+ default=which_optimizer,
345
+ type=str,
346
+ choices=["adadelta", "adam"],
347
+ help="which optimizer to use to train",
348
+ )
349
+ parser.add_argument(
350
+ "--which_hw_model",
351
+ default=which_hw_model,
352
+ type=str,
353
+ choices=["crnn", "stn_crnn", "stn_pp_crnn"],
354
+ help="which model to train",
355
+ )
356
 
357
  FLAGS, unparsed = parser.parse_known_args()
358
  train_hw_recognizer(FLAGS)
359
  return
360
 
361
+
362
  if __name__ == "__main__":
363
  main()
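For reference, a minimal sketch of launching the reformatted training script, assuming a local copy of the IAM line dataset; the dataset path and the hyper-parameter values below are illustrative placeholders, not part of this commit, and only the flag names come from the parser shown above:

    # illustrative sketch: drive iam_line_recognition/train.py through its CLI flags
    import subprocess

    subprocess.run(
        [
            "python", "iam_line_recognition/train.py",
            "--dir_dataset", "/path/to/IAM-data",  # must contain iam_lines_gt.txt and img/
            "--which_hw_model", "crnn",            # or stn_crnn / stn_pp_crnn
            "--which_optimizer", "adadelta",       # script defaults lr to 1 for Adadelta; ~3e-4 suits Adam
            "--image_height", "32",
            "--image_width", "768",
            "--batch_size", "64",                  # placeholder value
            "--num_epochs", "100",                 # placeholder value
        ],
        check=True,
    )

Per-epoch train/validation losses, CER and WER are written to train_metrics.csv alongside the per-epoch .pth checkpoints saved in the model directory.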
iam_line_recognition/utils.py CHANGED
@@ -13,6 +13,7 @@ from scipy.special import logsumexp
13
  NINF = -1 * float("inf")
14
  DEFAULT_EMISSION_THRESHOLD = 0.01
15
 
 
16
  def _reconstruct(labels, blank=0):
17
  new_labels = []
18
  # merge same labels
@@ -25,9 +26,12 @@ def _reconstruct(labels, blank=0):
25
  new_labels = [l for l in new_labels if l != blank]
26
  return new_labels
27
 
 
28
  def beam_search_decode(emission_log_prob, blank=0, **kwargs):
29
  beam_size = kwargs["beam_size"]
30
- emission_threshold = kwargs.get("emission_threshold", np.log(DEFAULT_EMISSION_THRESHOLD))
 
 
31
 
32
  length, class_count = emission_log_prob.shape
33
 
@@ -53,29 +57,38 @@ def beam_search_decode(emission_log_prob, blank=0, **kwargs):
53
  for prefix, accu_log_prob in beams:
54
  labels = tuple(_reconstruct(prefix, blank))
55
  # log(p1 + p2) = logsumexp([log_p1, log_p2])
56
- total_accu_log_prob[labels] = \
57
- logsumexp([accu_log_prob, total_accu_log_prob.get(labels, NINF)])
58
-
59
- labels_beams = [(list(labels), accu_log_prob)
60
- for labels, accu_log_prob in total_accu_log_prob.items()]
 
 
 
61
  labels_beams.sort(key=lambda x: x[1], reverse=True)
62
  labels = labels_beams[0][0]
63
 
64
  return labels
65
 
 
66
  def greedy_decode(emission_log_prob, blank=0):
67
  labels = np.argmax(emission_log_prob, axis=-1)
68
  labels = _reconstruct(labels, blank=blank)
69
  return labels
70
 
71
- def ctc_decode(log_probs, which_ctc_decoder="beam_search", label_2_char=None, blank=0, beam_size=25):
 
 
 
72
  emission_log_probs = np.transpose(log_probs.cpu().numpy(), (1, 0, 2))
73
  # size of emission_log_probs: (batch, length, class)
74
 
75
  decoded_list = []
76
  for emission_log_prob in emission_log_probs:
77
  if which_ctc_decoder == "beam_search":
78
- decoded = beam_search_decode(emission_log_prob, blank=blank, beam_size=beam_size)
 
 
79
  elif which_ctc_decoder == "greedy":
80
  decoded = greedy_decode(emission_log_prob, blank=blank)
81
  else:
@@ -87,16 +100,20 @@ def ctc_decode(log_probs, which_ctc_decoder="beam_search", label_2_char=None, bl
87
  decoded_list.append(decoded)
88
  return decoded_list
89
 
 
90
  """
91
  --------------------
92
  Evaluation Metrics
93
  --------------------
94
  """
 
 
95
  def compute_wer_and_cer_for_batch(batch_preds, batch_gts):
96
  cer_batch = fastwer.score(batch_preds, batch_gts, char_level=True)
97
  wer_batch = fastwer.score(batch_preds, batch_gts)
98
  return cer_batch, wer_batch
99
 
 
100
  def compute_wer_and_cer_for_sample(str_pred, str_gt):
101
  cer_sample = fastwer.score_sent(str_pred, str_gt, char_level=True)
102
  wer_sample = fastwer.score_sent(str_pred, str_gt)
 
13
  NINF = -1 * float("inf")
14
  DEFAULT_EMISSION_THRESHOLD = 0.01
15
 
16
+
17
  def _reconstruct(labels, blank=0):
18
  new_labels = []
19
  # merge same labels
 
26
  new_labels = [l for l in new_labels if l != blank]
27
  return new_labels
28
 
29
+
30
  def beam_search_decode(emission_log_prob, blank=0, **kwargs):
31
  beam_size = kwargs["beam_size"]
32
+ emission_threshold = kwargs.get(
33
+ "emission_threshold", np.log(DEFAULT_EMISSION_THRESHOLD)
34
+ )
35
 
36
  length, class_count = emission_log_prob.shape
37
 
 
57
  for prefix, accu_log_prob in beams:
58
  labels = tuple(_reconstruct(prefix, blank))
59
  # log(p1 + p2) = logsumexp([log_p1, log_p2])
60
+ total_accu_log_prob[labels] = logsumexp(
61
+ [accu_log_prob, total_accu_log_prob.get(labels, NINF)]
62
+ )
63
+
64
+ labels_beams = [
65
+ (list(labels), accu_log_prob)
66
+ for labels, accu_log_prob in total_accu_log_prob.items()
67
+ ]
68
  labels_beams.sort(key=lambda x: x[1], reverse=True)
69
  labels = labels_beams[0][0]
70
 
71
  return labels
72
 
73
+
74
  def greedy_decode(emission_log_prob, blank=0):
75
  labels = np.argmax(emission_log_prob, axis=-1)
76
  labels = _reconstruct(labels, blank=blank)
77
  return labels
78
 
79
+
80
+ def ctc_decode(
81
+ log_probs, which_ctc_decoder="beam_search", label_2_char=None, blank=0, beam_size=25
82
+ ):
83
  emission_log_probs = np.transpose(log_probs.cpu().numpy(), (1, 0, 2))
84
  # size of emission_log_probs: (batch, length, class)
85
 
86
  decoded_list = []
87
  for emission_log_prob in emission_log_probs:
88
  if which_ctc_decoder == "beam_search":
89
+ decoded = beam_search_decode(
90
+ emission_log_prob, blank=blank, beam_size=beam_size
91
+ )
92
  elif which_ctc_decoder == "greedy":
93
  decoded = greedy_decode(emission_log_prob, blank=blank)
94
  else:
 
100
  decoded_list.append(decoded)
101
  return decoded_list
102
 
103
+
104
  """
105
  --------------------
106
  Evaluation Metrics
107
  --------------------
108
  """
109
+
110
+
111
  def compute_wer_and_cer_for_batch(batch_preds, batch_gts):
112
  cer_batch = fastwer.score(batch_preds, batch_gts, char_level=True)
113
  wer_batch = fastwer.score(batch_preds, batch_gts)
114
  return cer_batch, wer_batch
115
 
116
+
117
  def compute_wer_and_cer_for_sample(str_pred, str_gt):
118
  cer_sample = fastwer.score_sent(str_pred, str_gt, char_level=True)
119
  wer_sample = fastwer.score_sent(str_pred, str_gt)
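For reference, a minimal sketch of the decoding utilities above applied to a dummy emission map, assuming it is run from inside iam_line_recognition/ so the flat imports resolve; the random emissions and sequence length are purely illustrative:

    # illustrative sketch: greedy CTC decoding of fake per-timestep log-probabilities
    import numpy as np

    from dataset import HWRecogIAMDataset  # flat imports, matching the repo's own style
    from utils import greedy_decode

    num_classes = len(HWRecogIAMDataset.LABEL_2_CHAR) + 1  # +1 for the CTC blank at index 0
    rng = np.random.default_rng(0)

    # fake emission map of shape (sequence_length, num_classes); each row is a valid log-prob vector
    emission_log_prob = np.log(rng.dirichlet(np.ones(num_classes), size=192))

    label_ids = greedy_decode(emission_log_prob, blank=0)  # merges repeats, drops blanks
    print("".join(HWRecogIAMDataset.LABEL_2_CHAR[i] for i in label_ids))

ctc_decode applies the same per-sample decoders to a whole batch of (length, batch, class) log-probabilities and can switch to beam_search_decode via which_ctc_decoder and beam_size.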
iam_line_recognition/utils_unique_chars.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
3
 
4
  from dataset import read_IAM_label_txt_file
5
 
 
6
  def list_unique_characters_in_IAM_dataset(FLAGS):
7
  _, all_labels = read_IAM_label_txt_file(FLAGS.file_txt_labels)
8
 
@@ -16,8 +17,8 @@ def list_unique_characters_in_IAM_dataset(FLAGS):
16
  unique_chars = sorted(unique_chars)
17
  unique_chars = np.array(unique_chars)
18
  unique_chars = np.unique(unique_chars)
19
- unique_chars = ''.join(unique_chars)
20
-
21
  # prints all unique chars in the IAM dataset
22
  print(unique_chars)
23
 
@@ -25,19 +26,27 @@ def list_unique_characters_in_IAM_dataset(FLAGS):
25
  print(f"Number of unique characters : {len(unique_chars)}")
26
  return
27
 
 
28
  def main():
29
- file_txt_labels = "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/iam_lines_gt.txt"
 
 
30
 
31
  parser = argparse.ArgumentParser(
32
  formatter_class=argparse.ArgumentDefaultsHelpFormatter
33
  )
34
 
35
- parser.add_argument("--file_txt_labels", default=file_txt_labels,
36
- type=str, help="full path to label text file")
 
 
 
 
37
 
38
  FLAGS, unparsed = parser.parse_known_args()
39
  list_unique_characters_in_IAM_dataset(FLAGS)
40
  return
41
 
 
42
  if __name__ == "__main__":
43
  main()
 
3
 
4
  from dataset import read_IAM_label_txt_file
5
 
6
+
7
  def list_unique_characters_in_IAM_dataset(FLAGS):
8
  _, all_labels = read_IAM_label_txt_file(FLAGS.file_txt_labels)
9
 
 
17
  unique_chars = sorted(unique_chars)
18
  unique_chars = np.array(unique_chars)
19
  unique_chars = np.unique(unique_chars)
20
+ unique_chars = "".join(unique_chars)
21
+
22
  # prints all unique chars in the IAM dataset
23
  print(unique_chars)
24
 
 
26
  print(f"Number of unique characters : {len(unique_chars)}")
27
  return
28
 
29
+
30
  def main():
31
+ file_txt_labels = (
32
+ "/home/abhishek/Desktop/RUG/hw_recognition/IAM-data/iam_lines_gt.txt"
33
+ )
34
 
35
  parser = argparse.ArgumentParser(
36
  formatter_class=argparse.ArgumentDefaultsHelpFormatter
37
  )
38
 
39
+ parser.add_argument(
40
+ "--file_txt_labels",
41
+ default=file_txt_labels,
42
+ type=str,
43
+ help="full path to label text file",
44
+ )
45
 
46
  FLAGS, unparsed = parser.parse_known_args()
47
  list_unique_characters_in_IAM_dataset(FLAGS)
48
  return
49
 
50
+
51
  if __name__ == "__main__":
52
  main()
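For reference, the unique characters this script prints are presumably what HWRecogIAMDataset.CHAR_SET was derived from; a minimal round-trip sketch, assuming it is run from inside iam_line_recognition/ (the sample line below is made up, not read from the dataset):

    # illustrative sketch: encode a line with CHAR_2_LABEL and map it back with LABEL_2_CHAR
    from dataset import HWRecogIAMDataset  # flat import, as in this script

    text = "A MOVE to stop Mr. Gaitskell"  # sample IAM-style line, not taken from iam_lines_gt.txt
    label_ids = [HWRecogIAMDataset.CHAR_2_LABEL[ch] for ch in text]  # labels start at 1
    assert "".join(HWRecogIAMDataset.LABEL_2_CHAR[i] for i in label_ids) == text
    print(f"{len(HWRecogIAMDataset.CHAR_SET)} characters in the set; label 0 is kept for the CTC blank")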