# -*- coding: utf-8 -*-
"""Gradio with DocFormer

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_XBurG-8jYF4eJJK5VoCJ2Y1v6RV9iAW
"""

## Requirements (installed at runtime, Colab-style)
import os
os.system('pip install pyyaml==5.1')
## Install pytesseract, used for OCR on the uploaded document images
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Importing the functions from the DocFormer repo
from dataset import create_features
from modeling import DocFormerEncoder, ResNetFeatureExtractor, DocFormerEmbeddings, LanguageFeatureExtractor
from transformers import BertTokenizerFast
from utils import DocFormer

## Hyperparameters
import torch

seed = 42
target_size = (500, 384)  # image size passed to create_features
max_len = 128             # maximum token sequence length

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = {
  "coordinate_size": 96,              ## 96 = 768 / 8: one slice for each of the 8 x/y bounding-box coordinates
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "image_feature_pool_shape": [7, 7, 256],
  "intermediate_ff_size_factor": 4,
  "max_2d_position_embeddings": 1024,
  "max_position_embeddings": 128,
  "max_relative_positions": 8,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "shape_size": 96,
  "vocab_size": 30522,
  "layer_norm_eps": 1e-12,
}
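
## Sanity check: as the coordinate_size comment above notes, the 8 spatial slices of
## 96 dimensions each are expected to add up to the 768-dimensional hidden size.
assert config["coordinate_size"] * 8 == config["hidden_size"]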

## Defining the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
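
## Illustrative only: with padding/truncation to max_len, the tokenizer produces
## fixed-length inputs, e.g.
## tokenizer("invoice number 42", padding="max_length", truncation=True,
##           max_length=max_len, return_tensors="pt")["input_ids"].shape  # torch.Size([1, 128])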



## Defining the model
docformer = DocFormer(config)

# path_to_weights = 'drive/MyDrive/docformer_rvl_checkpoint/docformer_v1.ckpt'

url = 'https://www.kaggleusercontent.com/kf/97691030/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ztbnfHUlYK1kHw0jKXt1QA.DfJGkOgL9TBiATpTSuKwMoaKfApiVDyncy5kMQb-8FeayksRYddv3tummbzYjPOe9bYuSf1ZSqtcfMY4t1-HenQwnxWZ9HektDmQbcuQaGN7lPwxIzIIjUk3zOkDH6UIcmAeUrPpIbMQ9ZHRIGY9LVAWx1lDctT-9QEfEpdHceS4bNTTrftxi-GBCqd4aLACNz_veXM6YqsplQulb7D9ARZYDOxgpAYl3bDL2-KwduLgCusostp7-uzCTkBeJRQ8LpdmHdRY6FmWcf47vFBcTpG9Qoeml3Sr4EUXEcBKfPKMbDbwIbknoV9TuxGLtKHAu4kyWyRCvLb_20FJ4oZSoQHko0joTeIwOHVPeKpAadT0R3soXGXs7jbcEezdoCz48NFKLU_1lkzeg43ExAgf47iE4_4ErEoi_Hs0deINAY1TunkELGjAO8AuVI4z8fctJgIq_u6rg_-_zcQPDRGqCnoe3M4jtmRWSPFsnOGznezr87jg1bb3hTF1g8RIWWyqmpzUccpMqw27x_ZUkm3UZSQ3Axg7SdqH4XuhtqcujUlH4p51UP7Iv0NlLYMcMpWEFJ630e-kcx8IpKycMVg484Pm8SzI0rTUU6FqA-csBWX1GGAOJwDQR4VYiLTMkd35zNp7byO56uXd5cLXrmcOZdxetrXN8IHAw3GxmlEmi8u-iuZlBwbdWhTx_W3hnwWT.XyPnjS0IQxQ_QlNUd36QVQ/models/epoch=0-step=753.ckpt'

## load_from_checkpoint returns a new, weight-loaded model instance, so its result
## has to be assigned back; if loading fails (e.g. the signed URL has expired),
## the randomly initialised model defined above is kept.
try:
  docformer = docformer.load_from_checkpoint(url)
except Exception:
  pass
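
## Alternative sketch (assumption: DocFormer subclasses pytorch_lightning.LightningModule,
## as the load_from_checkpoint call above implies): download the checkpoint to a local
## file first and load it from there instead of streaming it from the URL.
## import torch.hub
## torch.hub.download_url_to_file(url, 'docformer_v1.ckpt')
## docformer = docformer.load_from_checkpoint('docformer_v1.ckpt')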

id2label = ['scientific_report',
 'resume',
 'memo',
 'file_folder',
 'specification',
 'news_article',
 'letter',
 'form',
 'budget',
 'handwritten',
 'email',
 'invoice',
 'presentation',
 'scientific_publication',
 'questionnaire',
 'advertisement']
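
## Sanity check: these are the 16 document classes of the RVL-CDIP dataset (as the
## checkpoint name "docformer_rvl_checkpoint" above suggests), so the classifier is
## expected to produce one score per entry in this list.
assert len(id2label) == 16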

import gradio as gr

## UI setup adapted from the LayoutLMv2 Space

image = gr.inputs.Image(type="pil")
label = gr.outputs.Label(num_top_classes=5)
examples = [['00093726.png'], ['00866042.png']]
title = "Interactive demo: DocFormer for Image Classification"
description = "Demo for classifying document images with the DocFormer model. To use it, \
simply upload an image or use one of the example images below and click 'submit' to let the model predict the 5 most probable document classes. \
Results will show up in a few seconds."

def classify_image(image):

  ## Save the uploaded PIL image and extract its text, layout and visual features
  image.save('sample_img.png')
  final_encoding = create_features(
            './sample_img.png',
            tokenizer,
            add_batch_dim=True,
            target_size=target_size,
            max_seq_length=max_len,
            path_to_save=None,
            save_to_disk=False,
            apply_mask_for_mlm=False,
            extras_for_debugging=False,
            use_ocr=True
    )

  ## Truncate the spatial features to the maximum sequence length
  keys_to_truncate = ['x_features', 'y_features', 'resized_and_aligned_bounding_boxes']
  for key in keys_to_truncate:
      final_encoding[key] = final_encoding[key][:, :max_len]

  from torchvision import transforms
  ## Normalize each channel with mean 0.5 and std 0.5, mapping pixel values to [-1, 1]
  transform = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

  final_encoding['resized_scaled_img'] = transform(final_encoding['resized_scaled_img'])

  ## Forward pass, then convert the logits into class probabilities
  with torch.no_grad():
      output = docformer.forward(final_encoding)
  output = output[0].softmax(dim=-1)

  ## Map each class probability to its label name for the Gradio Label output
  final_pred = {}
  for i, score in enumerate(output):
      final_pred[id2label[i]] = score.detach().cpu().tolist()

  return final_pred
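
## Quick local sanity check of the pipeline (assumes one of the bundled example
## images, e.g. '00093726.png', sits next to this script):
## from PIL import Image
## print(classify_image(Image.open('00093726.png').convert('RGB')))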

gr.Interface(fn=classify_image,
             inputs=image,
             outputs=label,
             title=title,
             description=description,
             examples=examples,
             enable_queue=True).launch(debug=True)