File size: 4,875 Bytes
cb4c41f
 
 
 
 
f50ef7b
dea123e
1592bb3
93fe32c
 
 
cb4c41f
789ac0d
 
cb4c41f
 
 
 
 
6b45627
5c58f13
6b45627
f50ef7b
21ab7aa
 
 
1592bb3
 
9485a97
1592bb3
4fa7e16
 
 
dea123e
cb4c41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7221a3
cb4c41f
a1a0ef2
cb4c41f
c2d53c9
2f8b3bf
 
e45a37b
 
 
df954f0
7384818
 
 
da509d5
7384818
 
 
c2d53c9
e45a37b
8732834
b7221a3
8732834
 
b7221a3
bc05740
 
b7221a3
bc05740
b7221a3
bc05740
b7221a3
 
 
8732834
b7221a3
cb4c41f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import gradio as gr

import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests
from io import BytesIO
import json
import os


processor = DonutProcessor.from_pretrained("./donut-base-finetuned-inv")
model = VisionEncoderDecoderModel.from_pretrained("./donut-base-finetuned-inv")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def process_document(image):
    #can't save uploaded file locally, but needs to be converted from nparray to PIL
    im1 = Image.fromarray(image)
    
    #send notification through telegram
    TOKEN = os.getenv('TELEGRAM_BOT_TOKEN')
    CHAT_ID = os.getenv('TELEGRAM_CHANNEL_ID')
    url = f'https://api.telegram.org/bot{TOKEN}/sendPhoto?chat_id={CHAT_ID}'
    bio = BytesIO()
    bio.name = 'image.jpeg'
    im1.save(bio, 'JPEG')
    bio.seek(0)
    media = {"type": "photo", "media": "attach://photo", "caption": "New doc is being tried out:"}
    data = {"media": json.dumps(media)}
    response = requests.post(url, files={'photo': bio}, data=data)
    
    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values
    
    # prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
          
    # generate answer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )
    
    # postprocess
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    
    return processor.token2json(sequence), image

description = '<p>Using Donut model finetuned on Invoices for retrieval of following information:</p><ul><li><span style="color:black">DocType</span></span></li><li><span style="color:black">Currency</span></span></li><li><span style="color:black">DocumentDate</span></span></li><li><span style="color:black">GrossAmount</span></span></li><li><span style="color:black">InvoiceNumber</span></span></li><li><span style="color:black">NetAmount</span></span></li><li><span style="color:black">TaxAmount</span></span></li><li><span style="color:black">OrderNumber</span></span></li><li><span style="color:black">CreditorCountry</span></span></li></ul><p>To use it, simply upload your image and click &#39;submit&#39;, or click one of the examples to load them. Read more at the links below.</p><p>&nbsp;</p><p>(because this is running on the free cpu tier, it will take about 40 secs before you see a result)</p><p>Have fun&nbsp;😎</p><p>Toon Beerten</p>'
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
title = "Demo: Donut 🍩 for invoice header retrieval"


#demo = gr.Interface(fn=process_document,inputs=gr_image,outputs="json",title="Demo: Donut 🍩 for invoice header retrieval", description=description,
#    article=article,enable_queue=True, examples=[["example.jpg"], ["example_2.jpg"], ["example_3.jpg"]], cache_examples=False)

css = "#inp {height: auto !important; width: 100% !important;}"
# css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"

#css = ".image-preview {height: auto !important;}"


with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    
    with gr.Row().style():
        with gr.Column(scale=1):
            inp = gr.Image(label='Upload invoice here:')   #.style(height=400)          
        with gr.Column(scale=2):
             gr.Examples([["example.jpg"], ["example_2.jpg"], ["example_3.jpg"]], inputs=[inp],label='Or use one of these examples:')
    with gr.Row().style():       
             btn = gr.Button("↓   Extract   ↓")
    with gr.Row().style():
        with gr.Column(scale=2):
            imgout = gr.Image(label='Uploaded document:',elem_id="inp")
        with gr.Column(scale=1):
            jsonout = gr.JSON(label='Extracted information:')
            
    btn.click(fn=process_document, inputs=inp, outputs=[jsonout,imgout])

demo.launch()