In [1]:
from datasets import load_dataset 
dataset = load_dataset("nielsr/funsd-layoutlmv3")

 from .autonotebook import tqdm as notebook_tqdm
Found cached dataset funsd-layoutlmv3 (C:/Users/csara/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9)
100%|██████████| 2/2 [00:00<00:00, 290.08it/s]


In [2]:
from transformers import AutoProcessor
from datasets.features import ClassLabel
tokenizer = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

In [3]:
def get_label_list(labels):
 unique_labels = set()
 for label in labels:
 unique_labels = unique_labels | set(label)
 label_list = list(unique_labels)
 label_list.sort()
 return label_list

In [4]:
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"


features = dataset["train"].features
column_names = dataset["train"].column_names

if isinstance(features["ner_tags"].feature, ClassLabel):
 label_list = features["ner_tags"].feature.names
 id2label = {k: v for k,v in enumerate(label_list)}
 label2id = {v: k for k,v in enumerate(label_list)}
else:
 label_list = get_label_list(dataset["train"]["ner_tags"])
 id2label = {k: v for k,v in enumerate(label_list)}
 label2id = {v: k for k,v in enumerate(label_list)}
num_labels = len(label_list)

In [5]:
def encoder(examples):
 images = examples["image"]
 words = examples["tokens"]
 boxes = examples["bboxes"]
 word_labels = examples[label_column_name]

 encoding = tokenizer(images, words, boxes=boxes, word_labels=word_labels,
 truncation=True, padding="max_length")

 return encoding

In [6]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D


features = Features({
 'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
 'input_ids': Sequence(feature=Value(dtype='int64')),
 'attention_mask': Sequence(Value(dtype='int64')),
 'bbox': Array2D(dtype="int64", shape=(512, 4)),
 'labels': Sequence(feature=Value(dtype='int64')),
})

train_dataset = dataset["train"].map(
 encoder,
 batched=True,
 remove_columns=column_names,
 features=features,
)
eval_dataset = dataset["test"].map(
 encoder,
 batched=True,
 remove_columns=column_names,
 features=features,
)

Loading cached processed dataset at C:\Users\csara\.cache\huggingface\datasets\nielsr___funsd-layoutlmv3\funsd\1.0.0\0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9\cache-1a2006e093366773.arrow
Loading cached processed dataset at C:\Users\csara\.cache\huggingface\datasets\nielsr___funsd-layoutlmv3\funsd\1.0.0\0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9\cache-b7746bd565a79622.arrow


In [7]:
train_dataset.set_format("torch")


In [3]:
import torch

In [9]:
from datasets import load_metric
metric = load_metric("seqeval")

import numpy as np

return_entity_level_metrics = False

def compute_metrics(p):
 predictions, labels = p
 predictions = np.argmax(predictions, axis=2)

 # Remove ignored index (special tokens)
 true_predictions = [
 [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
 for prediction, label in zip(predictions, labels)
 ]
 true_labels = [
 [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
 for prediction, label in zip(predictions, labels)
 ]

 results = metric.compute(predictions=true_predictions, references=true_labels)
 if return_entity_level_metrics:
 # Unpack nested dictionaries
 final_results = {}
 for key, value in results.items():
 if isinstance(value, dict):
 for n, v in value.items():
 final_results[f"{key}_{n}"] = v
 else:
 final_results[key] = value
 return final_results
 else:
 return {
 "precision": results["overall_precision"],
 "recall": results["overall_recall"],
 "f1": results["overall_f1"],
 "accuracy": results["overall_accuracy"],
 }


 metric = load_metric("seqeval")


In [10]:
from transformers import LayoutLMv3ForTokenClassification

model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base",
 id2label=id2label,
 label2id=label2id)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test",
 max_steps=1000,
 per_device_train_batch_size=2,
 per_device_eval_batch_size=3,
 learning_rate=1e-5,
 evaluation_strategy="steps",
 eval_steps=100,
 load_best_model_at_end=True,
 metric_for_best_model="f1")

In [12]:
from transformers.data.data_collator import default_data_collator

# Initialize our Trainer
trainer = Trainer(
 model=model,
 args=training_args,
 train_dataset=train_dataset,
 eval_dataset=eval_dataset,
 tokenizer=tokenizer,
 data_collator=default_data_collator,
 compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

 
 10%|█ | 100/1000 [05:03<46:20, 3.09s/it]

{'eval_loss': 0.6499422192573547, 'eval_precision': 0.7698232895333031, 'eval_recall': 0.8440139095876801, 'eval_f1': 0.8052132701421801, 'eval_accuracy': 0.8016165458219422, 'eval_runtime': 34.3294, 'eval_samples_per_second': 1.456, 'eval_steps_per_second': 0.495, 'epoch': 1.33}


 
 20%|██ | 200/1000 [11:12<38:47, 2.91s/it]

{'eval_loss': 0.4936561584472656, 'eval_precision': 0.8225578102878717, 'eval_recall': 0.8658718330849479, 'eval_f1': 0.8436592449177153, 'eval_accuracy': 0.8246760965172947, 'eval_runtime': 35.9544, 'eval_samples_per_second': 1.391, 'eval_steps_per_second': 0.473, 'epoch': 2.67}


 
 30%|███ | 300/1000 [17:00<33:27, 2.87s/it]

{'eval_loss': 0.4679512083530426, 'eval_precision': 0.8520765282314512, 'eval_recall': 0.907103825136612, 'eval_f1': 0.8787295476419633, 'eval_accuracy': 0.8576013312730298, 'eval_runtime': 35.098, 'eval_samples_per_second': 1.425, 'eval_steps_per_second': 0.484, 'epoch': 4.0}


 
 40%|████ | 400/1000 [22:50<29:56, 2.99s/it]

{'eval_loss': 0.5078234076499939, 'eval_precision': 0.8813806514341274, 'eval_recall': 0.9006458022851466, 'eval_f1': 0.890909090909091, 'eval_accuracy': 0.849994056816831, 'eval_runtime': 38.4245, 'eval_samples_per_second': 1.301, 'eval_steps_per_second': 0.442, 'epoch': 5.33}


 50%|█████ | 500/1000 [28:23<25:09, 3.02s/it] 

{'loss': 0.5405, 'learning_rate': 5e-06, 'epoch': 6.67}


 
 50%|█████ | 500/1000 [29:02<25:09, 3.02s/it]

{'eval_loss': 0.5313129425048828, 'eval_precision': 0.8702807357212003, 'eval_recall': 0.8931942374565326, 'eval_f1': 0.8815886246629075, 'eval_accuracy': 0.8495186021633186, 'eval_runtime': 37.6242, 'eval_samples_per_second': 1.329, 'eval_steps_per_second': 0.452, 'epoch': 6.67}


 
 60%|██████ | 600/1000 [36:52<19:39, 2.95s/it]

{'eval_loss': 0.5118691921234131, 'eval_precision': 0.8847441415590627, 'eval_recall': 0.9190263288623944, 'eval_f1': 0.9015594541910332, 'eval_accuracy': 0.8628313324616664, 'eval_runtime': 37.2452, 'eval_samples_per_second': 1.342, 'eval_steps_per_second': 0.456, 'epoch': 8.0}


 
 70%|███████ | 700/1000 [42:26<19:21, 3.87s/it]

{'eval_loss': 0.5196597576141357, 'eval_precision': 0.8799428299190091, 'eval_recall': 0.9175360158966717, 'eval_f1': 0.8983463035019456, 'eval_accuracy': 0.8707951979079995, 'eval_runtime': 38.9063, 'eval_samples_per_second': 1.285, 'eval_steps_per_second': 0.437, 'epoch': 9.33}


 
 80%|████████ | 800/1000 [49:13<12:57, 3.89s/it]

{'eval_loss': 0.5508859157562256, 'eval_precision': 0.8907603464870067, 'eval_recall': 0.9195230998509687, 'eval_f1': 0.9049132241505743, 'eval_accuracy': 0.8629501961250445, 'eval_runtime': 41.8293, 'eval_samples_per_second': 1.195, 'eval_steps_per_second': 0.406, 'epoch': 10.67}


 
 90%|█████████ | 900/1000 [55:10<04:35, 2.75s/it]

{'eval_loss': 0.5496693253517151, 'eval_precision': 0.8995098039215687, 'eval_recall': 0.9115747640337805, 'eval_f1': 0.9055020972119417, 'eval_accuracy': 0.8654463330559848, 'eval_runtime': 34.7336, 'eval_samples_per_second': 1.44, 'eval_steps_per_second': 0.489, 'epoch': 12.0}


100%|██████████| 1000/1000 [1:01:33<00:00, 3.32s/it]

{'loss': 0.123, 'learning_rate': 0.0, 'epoch': 13.33}


 
100%|██████████| 1000/1000 [1:02:05<00:00, 3.32s/it]

{'eval_loss': 0.5414420962333679, 'eval_precision': 0.8941005802707931, 'eval_recall': 0.9185295578738202, 'eval_f1': 0.9061504533202647, 'eval_accuracy': 0.8693688339474622, 'eval_runtime': 32.6105, 'eval_samples_per_second': 1.533, 'eval_steps_per_second': 0.521, 'epoch': 13.33}


100%|██████████| 1000/1000 [1:02:26<00:00, 3.75s/it]

{'train_runtime': 3746.26, 'train_samples_per_second': 0.534, 'train_steps_per_second': 0.267, 'train_loss': 0.3317529182434082, 'epoch': 13.33}





TrainOutput(global_step=1000, training_loss=0.3317529182434082, metrics={'train_runtime': 3746.26, 'train_samples_per_second': 0.534, 'train_steps_per_second': 0.267, 'train_loss': 0.3317529182434082, 'epoch': 13.33})

In [14]:
trainer.evaluate()

100%|██████████| 17/17 [00:50<00:00, 2.94s/it]


{'eval_loss': 0.5414420962333679,
 'eval_precision': 0.8941005802707931,
 'eval_recall': 0.9185295578738202,
 'eval_f1': 0.9061504533202647,
 'eval_accuracy': 0.8693688339474622,
 'eval_runtime': 53.511,
 'eval_samples_per_second': 0.934,
 'eval_steps_per_second': 0.318,
 'epoch': 13.33}

In [15]:
trainer.save_model("trained_model")


In [4]:
model = LayoutLMv3ForTokenClassification.from_pretrained(r"G:/Form understanding in Noisy scanned documents/trained_model")

NameError: name 'LayoutLMv3ForTokenClassification' is not defined

: 

In [None]:
device = torch.device("cuda")
model.cuda()

In [None]:
example = dataset["test"][2]
print(example.keys())

image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
word_labels = example["ner_tags"]

encoding = tokenizer(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
encoding = encoding.to('cuda')
for k,v in encoding.items():
 print(k,v.shape)

with torch.no_grad():
 outputs = model.to('cuda')(**encoding)

logits = outputs.logits
logits.shape

In [None]:
predictions = logits.argmax(-1).squeeze().tolist()
#labels = encoding.labels.squeeze().tolist()


In [None]:
def unnormalize_box(bbox, width, height):
 return [
 width * (bbox[0] / 1000),
 height * (bbox[1] / 1000),
 width * (bbox[2] / 1000),
 height * (bbox[3] / 1000),
 ]

token_boxes = encoding.bbox.squeeze().tolist()
width, height = image.size

true_predictions = [model.config.id2label[pred] for pred, label in zip(predictions, labels) if label != - 100]
true_labels = [model.config.id2label[label] for prediction, label in zip(predictions, labels) if label != -100]
true_boxes = [unnormalize_box(box, width, height) for box, label in zip(token_boxes, labels) if label != -100]

In [None]:
from PIL import ImageDraw, ImageFont

draw = ImageDraw.Draw(image)

font = ImageFont.load_default()

def iob_to_label(label):
 label = label[2:]
 if not label:
 return 'other'
 return label

label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}

for prediction, box in zip(true_predictions, true_boxes):
 predicted_label = iob_to_label(prediction).lower()
 draw.rectangle(box, outline=label2color[predicted_label])
 draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=label2color[predicted_label], font=font)

image