enandhag committed on
Commit
4f8f6ef
1 Parent(s): f912b4b

pushed gradio app

Dockerfile ADDED
@@ -0,0 +1,74 @@
+ # ==================================================================
+ # Base image
+ # ------------------------------------------------------------------
+ FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04
+
+ # ==================================================================
+ # git, text editors, cmake
+ # ------------------------------------------------------------------
+
+ RUN apt-get update -y && \
+     apt-get upgrade -y && \
+     APT_INSTALL="apt-get install -y" && \
+     APT_INSTALL_NIR="apt-get install -y --no-install-recommends" && \
+     PIP_INSTALL="python -m pip --no-cache-dir install" && \
+     GIT_CLONE="git clone" && \
+     DEBIAN_FRONTEND=noninteractive $APT_INSTALL_NIR \
+         apt && \
+     DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
+         git-core \
+         ca-certificates \
+         cmake \
+         wget \
+         vim \
+         nano \
+         unzip \
+         ffmpeg \
+         libsm6 libxext6 libxrender-dev \
+         libgstreamer1.0-0 gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav gstreamer1.0-tools \
+         build-essential && \
+     # ==================================================================
+     # python, pip
+     # ------------------------------------------------------------------
+     # rm -rf /var/lib/apt/lists/* \
+     #     /etc/apt/sources.list.d/cuda.list \
+     #     /etc/apt/sources.list.d/nvidia-ml.list && \
+     apt-get update -y && \
+     apt-get upgrade -y && \
+     DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
+         software-properties-common && \
+     apt-get update && \
+     DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
+         python3.7 \
+         python3.7-dev \
+         python-tk \
+         python3-tk \
+         python3.7-tk \
+         python3-pip && \
+     ln -s /usr/bin/python3.7 /usr/local/bin/python3 && \
+     ln -s /usr/bin/python3.7 /usr/local/bin/python && \
+     python3.7 -m pip install pip --upgrade
+
+ # ==================================================================
+ # Tools and dependencies
+ # ------------------------------------------------------------------
+ RUN python -m pip install \
+     setuptools==41.0.0 \
+     transformers[sentencepiece] \
+     numpy \
+     h5py \
+     scipy \
+     pandas \
+     matplotlib \
+     datasets \
+     pillow \
+     jupyter \
+     scikit-learn \
+     tqdm \
+     torch \
+     torchvision \
+     pytesseract \
+     pdf2image \
+     img2pdf \
+     jupyterlab \
+     timm
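
As a quick sanity check, one might confirm that the CUDA-enabled PyTorch stack baked into this image imports cleanly. This is a hedged sketch, not part of the commit; the script name `sanity_check.py` and the idea of running it inside the container built by `docker_build.sh` below are assumptions.

```python
# sanity_check.py -- hypothetical helper, not part of this commit.
# Run inside the built container to confirm the pip stack from the
# Dockerfile imports and that PyTorch can see the GPU.
import torch
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())  # True only with the NVIDIA runtime attached
```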
README.md CHANGED
@@ -1,12 +1,11 @@
- ---
- title: DocAI
- emoji: 🐢
- colorFrom: green
- colorTo: gray
- sdk: gradio
- sdk_version: 3.14.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ChequeEasy
+
+ ChequeEasy is a project that aims to simplify the cheque approval process, making it easier for both bank officials and customers.
+
+ This project leverages the Donut model proposed in the paper <a href="https://arxiv.org/abs/2111.15664/">OCR-free Document Understanding Transformer</a> to parse the required data from cheques.
+
+ Donut is based on a simple transformer encoder-decoder architecture. Its main USP is that it is an OCR-free approach to Visual Document Understanding (VDU), and it can perform tasks like document classification, information extraction, and VQA.
+
+ OCR-based techniques come with several limitations, such as requiring additional downstream models, a limited understanding of document structure, and the need for hand-crafted rules for information extraction.
+ Donut does away with these OCR-specific limitations. The model for this project was trained on a subset of this <a href="https://www.kaggle.com/datasets/medali1992/cheque-images/">Kaggle dataset</a>; the original dataset contains cheque images from 10 different banks.
 
app.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import glob
+ import gradio as gr
+ from scripts.predict import parse_cheque_with_donut
+
+ ## Create the list of examples to be loaded
+ example_list = glob.glob("data/*")
+ example_list = list(map(lambda el: [el], example_list))
+
+ demo = gr.Blocks()
+
+ with demo:
+
+     gr.Markdown("# **<p align='center'>ChequeEasy: Banking made easy </p>**")
+     gr.Markdown(
+         'ChequeEasy is a project that aims to simplify the cheque approval process, making it easier for both bank officials and customers. \
+         This project leverages the Donut model proposed in the paper <a href="https://arxiv.org/abs/2111.15664/"> OCR-free Document Understanding Transformer </a> to parse the required data from cheques. '
+         'Donut is based on a simple transformer encoder-decoder architecture. Its main USP is that it is an OCR-free approach to Visual Document Understanding (VDU) and can perform tasks like document classification, information extraction, as well as VQA. \
+         OCR-based techniques come with several limitations, such as requiring additional downstream models, a limited understanding of document structure, and the need for hand-crafted rules for information extraction. \
+         Donut does away with these OCR-specific limitations. The model for this project was trained on a subset of this <a href="https://www.kaggle.com/datasets/medali1992/cheque-images/"> Kaggle dataset </a>. The original dataset contains cheque images from 10 different banks.'
+     )
+
+     with gr.Tabs():
+
+         with gr.TabItem("Cheque Parser"):
+             gr.Markdown(
+                 "This module extracts the details filled in by a bank customer on a cheque. At present the model is trained to extract details like Payee Name, Amount in Words, Amount in Figures, and Bank Name. \
+                 It can be further trained to parse additional details like MICR Code, Cheque Number, Account Number, etc."
+             )
+             with gr.Box():
+                 gr.Markdown("**Upload Cheque**")
+                 input_image_parse = gr.Image(type="filepath", label="Input Cheque")
+             with gr.Box():
+                 gr.Markdown("**Parsed Cheque Data**")
+
+                 payee_name = gr.Textbox(label="Payee Name")
+                 amt_in_words = gr.Textbox(label="Legal Amount")
+                 amt_in_figures = gr.Textbox(label="Courtesy Amount")
+                 bank_name = gr.Textbox(label="Bank Name")
+
+             with gr.Box():
+                 gr.Markdown("**Predict**")
+                 with gr.Row():
+                     parse_cheque = gr.Button("Call Donut 🍩")
+
+             with gr.Column():
+                 gr.Examples(
+                     example_list,
+                     [input_image_parse],
+                     [payee_name, amt_in_words, amt_in_figures, bank_name],
+                     parse_cheque_with_donut,
+                     cache_examples=False,
+                 )
+
+             parse_cheque.click(
+                 parse_cheque_with_donut,
+                 inputs=input_image_parse,
+                 outputs=[payee_name, amt_in_words, amt_in_figures, bank_name],
+             )
+
+     gr.Markdown(
+         '\n Solution built by: <a href="https://github.com/Nandhagopalan">Nandhagopalan Elangovan</a>'
+     )
+
+ demo.launch()
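
Both the "Call Donut 🍩" button and the examples widget route through `parse_cheque_with_donut`, so the pipeline can be smoke-tested without launching the UI. A minimal sketch (hypothetical, not part of this commit) using one of the sample cheques added under `data/` in this commit:

```python
# smoke_test.py -- hypothetical snippet, not part of this commit.
# Calls the same function the Gradio button is wired to, bypassing the UI.
from scripts.predict import parse_cheque_with_donut

# data/103.jpg is one of the sample cheques added in this commit
payee, legal_amount, courtesy_amount, bank = parse_cheque_with_donut("data/103.jpg")
print("Payee:", payee)
print("Legal amount:", legal_amount)
print("Courtesy amount:", courtesy_amount)
print("Bank:", bank)
```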
data/1012.jpg ADDED
data/103.jpg ADDED
data/1031.jpg ADDED
data/1038.jpg ADDED
data/1046.jpg ADDED
docker_build.sh ADDED
@@ -0,0 +1,5 @@
+ docker build -t harbor.hpc.ford.com/enandhag/docai:v1 \
+     --build-arg https_proxy=http://internet.ford.com:83/ \
+     --build-arg http_proxy=http://internet.ford.com:83/ \
+     --build-arg no_proxy=.ford.com,localhost,127.0.0.1 \
+     -f Dockerfile .
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [tool.poetry]
+ name = "docai"
+ version = "0.1.0"
+ description = ""
+ authors = ["enandhag <enandhag@ford.com>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.8"
+ transformers = {extras = ["sentencepiece"], version = "^4.25.1"}
+ datasets = "^2.8.0"
+ torch = "^1.13.1"
+ gradio = "^3.14.0"
+ numpy = "^1.24.0"
+ jupyter = "^1.0.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
scripts/__pycache__/predict.cpython-38.pyc ADDED
Binary file (1.49 kB).
 
scripts/predict.py ADDED
@@ -0,0 +1,60 @@
+ from utils.donut_utils import (
+     load_donut_model_and_processor,
+     prepare_data_using_processor,
+     load_image,
+ )
+ import re
+
+ CHEQUE_PARSER_MODEL = "Nandhu/DocAI"
+ TASK_PROMPT = "<s>"
+
+
+ def parse_cheque_with_donut(input_image_path):
+     image = load_image(input_image_path)
+
+     donut_processor, model = load_donut_model_and_processor(CHEQUE_PARSER_MODEL)
+
+     cheque_image_tensor, input_for_decoder = prepare_data_using_processor(
+         donut_processor, image, TASK_PROMPT
+     )
+
+     outputs = model.generate(
+         cheque_image_tensor,
+         decoder_input_ids=input_for_decoder,
+         max_length=model.decoder.config.max_position_embeddings,
+         early_stopping=True,
+         pad_token_id=donut_processor.tokenizer.pad_token_id,
+         eos_token_id=donut_processor.tokenizer.eos_token_id,
+         use_cache=True,
+         num_beams=1,
+         bad_words_ids=[[donut_processor.tokenizer.unk_token_id]],
+         return_dict_in_generate=True,
+         output_scores=True,
+     )
+
+     decoded_output_sequence = donut_processor.batch_decode(outputs.sequences)[0]
+
+     ## strip the EOS and PAD special tokens from the decoded sequence
+     extracted_cheque_details = decoded_output_sequence.replace(
+         donut_processor.tokenizer.eos_token, ""
+     ).replace(donut_processor.tokenizer.pad_token, "")
+
+     ## remove the task prompt from the token sequence
+     cleaned_cheque_details = re.sub(
+         r"<.*?>", "", extracted_cheque_details, count=1
+     ).strip()
+
+     ## generate an ordered json from the output token sequence
+     cheque_details_json = donut_processor.token2json(cleaned_cheque_details)
+     print("cheque_details_json:", cheque_details_json)
+
+     ## extract the required fields from the predicted json
+     amt_in_words = cheque_details_json["VALUE_LETTERS"]
+     amt_in_figures = cheque_details_json["VALUE_NUMBERS"]
+     payee_name = cheque_details_json["USER2NAME"]
+     bank_name = cheque_details_json["BANK_NAME"]
+
+     return (payee_name, amt_in_words, amt_in_figures, bank_name)
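
Note that the dictionary lookups above assume the model always emits all four fields; if a decode drops one, the function raises `KeyError`. A more defensive variant could fall back to a placeholder. This is a sketch, not part of the commit; the helper name `extract_fields` is hypothetical, while the keys are the ones read above.

```python
# Hypothetical defensive version of the field extraction above; returns a
# placeholder instead of raising KeyError when the model omits a field.
def extract_fields(cheque_details_json: dict):
    def get(key):
        return cheque_details_json.get(key, "<not found>")

    return (
        get("USER2NAME"),      # payee name
        get("VALUE_LETTERS"),  # legal amount (amount in words)
        get("VALUE_NUMBERS"),  # courtesy amount (amount in figures)
        get("BANK_NAME"),
    )
```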
utils/__pycache__/donut_utils.cpython-38.pyc ADDED
Binary file (1.09 kB).
 
utils/donut_utils.py ADDED
@@ -0,0 +1,31 @@
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
+ from PIL import Image
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def load_image(image_path):
+     image = Image.open(image_path).convert("RGB")
+     return image
+
+
+ def load_donut_model_and_processor(trained_model_repo):
+     donut_processor = DonutProcessor.from_pretrained(trained_model_repo)
+     model = VisionEncoderDecoderModel.from_pretrained(trained_model_repo)
+     model.to(device)
+     return donut_processor, model
+
+
+ def prepare_data_using_processor(donut_processor, image, task_prompt):
+     ## Pass the image through the Donut processor's feature extractor and retrieve the image tensor
+     pixel_values = donut_processor(image, return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device)
+
+     ## Tokenize the task prompt for the document (cheque) parsing task and retrieve the input_ids
+     decoder_input_ids = donut_processor.tokenizer(
+         task_prompt, add_special_tokens=False, return_tensors="pt"
+     )["input_ids"]
+     decoder_input_ids = decoder_input_ids.to(device)
+
+     return pixel_values, decoder_input_ids
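
One design note: `parse_cheque_with_donut` calls `load_donut_model_and_processor` on every invocation, so each button click reloads the model from the hub cache. A memoised wrapper would keep the processor/model pair in memory after the first call. This is a sketch, not part of the commit; the wrapper name is hypothetical.

```python
# Hypothetical caching wrapper, not part of this commit. The first call loads
# the model; subsequent calls with the same repo name reuse the cached pair.
from functools import lru_cache

@lru_cache(maxsize=1)
def load_donut_model_and_processor_cached(trained_model_repo: str):
    return load_donut_model_and_processor(trained_model_repo)
```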