Spaces:
Runtime error
Runtime error
File size: 4,566 Bytes
342ae6c 6a57b55 342ae6c d55fb27 342ae6c d55fb27 342ae6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# -*- coding: utf-8 -*-
"""Gradio with DocFormer
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1_XBurG-8jYF4eJJK5VoCJ2Y1v6RV9iAW
"""
## Runtime dependency bootstrap (stands in for a requirements.txt)
import os
import subprocess
import sys


def _pip_install(*pip_args):
    """Best-effort ``pip install`` into the interpreter that runs this script.

    Invoking pip as ``<sys.executable> -m pip`` guarantees the packages land in
    the environment this process imports from, which a bare ``pip`` found on
    PATH does not. The arguments are passed as a list, so no shell is involved.

    Args:
        *pip_args: Arguments appended after ``pip install``.

    Returns:
        True when pip exited with status 0, False otherwise. A failure is
        reported on stderr but never raised, so start-up continues.
    """
    command = [sys.executable, "-m", "pip", "install", *pip_args]
    try:
        exit_code = subprocess.run(command, check=False).returncode
    except OSError as err:
        print(f"WARNING: could not run {' '.join(command)}: {err}", file=sys.stderr)
        return False
    if exit_code != 0:
        print(
            f"WARNING: {' '.join(command)} exited with status {exit_code}",
            file=sys.stderr,
        )
    return exit_code == 0


_pip_install("pyyaml==5.1")
## install PyTesseract
_pip_install("-q", "pytesseract")

# Set before the transformers import further down so the value is already in
# the environment when the tokenizer library is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
## Importing the functions from the DocFormer Repo
from dataset import create_features
from modeling import DocFormerEncoder,ResNetFeatureExtractor,DocFormerEmbeddings,LanguageFeatureExtractor
from transformers import BertTokenizerFast
from utils import DocFormer
## Preprocessing and runtime settings
import torch

seed = 42  # NOTE(review): not referenced again in the visible part of this file
target_size = (500, 384)  # forwarded to create_features as `target_size`
max_len = 128  # forwarded to create_features as `max_seq_length`; also used to truncate features

## Use the GPU when torch can see one, otherwise fall back to the CPU
# NOTE(review): `device` is not referenced again in the visible part of this file.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
## Hyperparameter dictionary handed to the DocFormer constructor
config = {
    # 768 / 8: one slice of the hidden size for each of the 8 x/y coordinates
    "coordinate_size": 768 // 8,
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "image_feature_pool_shape": [7, 7, 256],
    "intermediate_ff_size_factor": 4,
    "max_2d_position_embeddings": 1024,
    "max_position_embeddings": 128,
    "max_relative_positions": 8,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 0,
    "shape_size": 96,
    "vocab_size": 30522,
    "layer_norm_eps": 1e-12,
}
## Tokenizer and model instances shared by every request
# The tokenizer is later passed to create_features inside classify_image.
_TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(_TOKENIZER_CHECKPOINT)

# Built from the hyperparameter dict only; checkpoint weights are loaded in a separate step.
docformer = DocFormer(config)
# path_to_weights = 'drive/MyDrive/docformer_rvl_checkpoint/docformer_v1.ckpt'
url = 'https://www.kaggleusercontent.com/kf/97691030/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ztbnfHUlYK1kHw0jKXt1QA.DfJGkOgL9TBiATpTSuKwMoaKfApiVDyncy5kMQb-8FeayksRYddv3tummbzYjPOe9bYuSf1ZSqtcfMY4t1-HenQwnxWZ9HektDmQbcuQaGN7lPwxIzIIjUk3zOkDH6UIcmAeUrPpIbMQ9ZHRIGY9LVAWx1lDctT-9QEfEpdHceS4bNTTrftxi-GBCqd4aLACNz_veXM6YqsplQulb7D9ARZYDOxgpAYl3bDL2-KwduLgCusostp7-uzCTkBeJRQ8LpdmHdRY6FmWcf47vFBcTpG9Qoeml3Sr4EUXEcBKfPKMbDbwIbknoV9TuxGLtKHAu4kyWyRCvLb_20FJ4oZSoQHko0joTeIwOHVPeKpAadT0R3soXGXs7jbcEezdoCz48NFKLU_1lkzeg43ExAgf47iE4_4ErEoi_Hs0deINAY1TunkELGjAO8AuVI4z8fctJgIq_u6rg_-_zcQPDRGqCnoe3M4jtmRWSPFsnOGznezr87jg1bb3hTF1g8RIWWyqmpzUccpMqw27x_ZUkm3UZSQ3Axg7SdqH4XuhtqcujUlH4p51UP7Iv0NlLYMcMpWEFJ630e-kcx8IpKycMVg484Pm8SzI0rTUU6FqA-csBWX1GGAOJwDQR4VYiLTMkd35zNp7byO56uXd5cLXrmcOZdxetrXN8IHAw3GxmlEmi8u-iuZlBwbdWhTx_W3hnwWT.XyPnjS0IQxQ_QlNUd36QVQ/models/epoch=0-step=753.ckpt'
## Load the fine-tuned weights (best effort: the demo still starts without them)
# NOTE(review): assumes DocFormer is a PyTorch Lightning module, where
# `load_from_checkpoint` is a classmethod that RETURNS the restored model
# instead of updating an existing instance — confirm against utils.py.
# The returned model therefore has to replace `docformer`; discarding it
# leaves the freshly initialised weights in place.
try:
    _restored_model = DocFormer.load_from_checkpoint(url)
except Exception as err:
    # Deliberately broad: download, deserialisation and constructor errors are
    # all non-fatal here, but they must be visible rather than silently dropped.
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    print(f"WARNING: could not load DocFormer checkpoint, predictions will come from untrained weights: {err!r}")
else:
    if _restored_model is not None:
        docformer = _restored_model

# Inference only: switch off dropout (config sets hidden_dropout_prob) so that
# repeated predictions for the same image are deterministic.
docformer.eval()
## Class names indexed by the model's output position
# NOTE(review): the order presumably mirrors the label ids used during
# training — verify against the training pipeline before reordering.
id2label = [
    'scientific_report', 'resume', 'memo', 'file_folder',
    'specification', 'news_article', 'letter', 'form',
    'budget', 'handwritten', 'email', 'invoice',
    'presentation', 'scientific_publication', 'questionnaire', 'advertisement',
]
import gradio as gr

## Input / output widgets (pattern taken from the LayoutLMV2 space)
# NOTE(review): `gr.inputs` / `gr.outputs` look like Gradio's legacy component
# namespaces — confirm the installed Gradio version still provides them.
image = gr.inputs.Image(type="pil")  # classify_image receives a PIL image
label = gr.outputs.Label(num_top_classes=5)  # shows the five highest-scoring classes

# One single-element row per bundled sample image.
examples = [[sample_file] for sample_file in ('00093726.png', '00866042.png')]
## Static texts shown at the top of the demo page
title = "Interactive demo: DocFormer for Image Classification"
description = (
    "Demo for classifying document images with DocFormer model. To use it, "
    "simply upload an image or use the example images below and click 'submit' to let the model predict the 5 most probable Document classes. "
    "Results will show up in a few seconds."
)
def classify_image(image):
    """Classify one document image and return a score for every document class.

    The image is written to disk, converted into DocFormer input features
    (OCR enabled), normalised, run through the module-level ``docformer``
    model, and the outputs of the first batch element are turned into softmax
    probabilities.

    Args:
        image: The uploaded picture; it must provide ``save(path)``. The Gradio
            input is declared with ``type="pil"``, so this is a PIL image.

    Returns:
        dict mapping each class name from ``id2label`` to its probability
        (a Python float). The Gradio ``Label`` output picks the top entries.
    """
    from torchvision import transforms

    # create_features is given a file path, so the upload is persisted first.
    # NOTE(review): the file name is fixed, so overlapping requests would
    # overwrite each other's image — confirm requests are served one at a time.
    image.save('sample_img.png')
    final_encoding = create_features(
        './sample_img.png',
        tokenizer,
        add_batch_dim=True,
        target_size=target_size,
        max_seq_length=max_len,
        path_to_save=None,
        save_to_disk=False,
        apply_mask_for_mlm=False,
        extras_for_debugging=False,
        use_ocr=True,
    )

    # Cut the spatial features down to at most max_len positions along axis 1.
    for key in ('x_features', 'y_features', 'resized_and_aligned_bounding_boxes'):
        final_encoding[key] = final_encoding[key][:, :max_len]

    # Normalise the three image channels with mean 0.5 and std 0.5.
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    final_encoding['resized_scaled_img'] = normalize(final_encoding['resized_scaled_img'])

    # Pure inference: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        output = docformer.forward(final_encoding)
        # output[0] is the first (only) batch element — assumes it is a 1-D
        # vector with one entry per class; TODO confirm against DocFormer.forward.
        probabilities = output[0].softmax(dim=-1)

    scores = probabilities.detach().cpu().tolist()
    return {id2label[index]: score for index, score in enumerate(scores)}
## Assemble the Gradio interface around classify_image and start serving it
demo = gr.Interface(
    fn=classify_image,
    inputs=image,
    outputs=label,
    title=title,
    description=description,
    examples=examples,
    enable_queue=True,
)
demo.launch(debug=True)
|