shivi committed on
Commit
0f164ad
1 Parent(s): 7d21637

updated model to extract bank_name and cheque_date

Browse files
Files changed (1) hide show
  1. predict_cheque_parser.py +25 -23
predict_cheque_parser.py CHANGED
@@ -1,15 +1,16 @@
1
  from transformers import DonutProcessor, VisionEncoderDecoderModel
 
 
2
  from word2number import w2n
3
  from dateutil import relativedelta
4
  from datetime import datetime
5
  from word2number import w2n
6
- from textblob import Word
7
  from PIL import Image
8
  import torch
9
  import re
10
 
11
- CHEQUE_PARSER_MODEL = "shivi/donut-base-cheque"
12
- TASK_PROMPT = "<s_cord-v2>"
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
  def load_donut_model_and_processor():
@@ -21,7 +22,6 @@ def load_donut_model_and_processor():
21
  def prepare_data_using_processor(donut_processor,image_path):
22
  ## Pass image through donut processor's feature extractor and retrieve image tensor
23
  image = load_image(image_path)
24
- print("type image:", type(image))
25
  pixel_values = donut_processor(image, return_tensors="pt").pixel_values
26
  pixel_values = pixel_values.to(device)
27
 
@@ -70,28 +70,31 @@ def parse_cheque_with_donut(input_image_path):
70
 
71
  payee_name = cheque_details_json['cheque_details'][2]['payee_name']
72
 
73
- ## In the cheques dataset used to train the model -> all the cheques are dated '06/05/22'
74
- ## Train model to extract cheque date -> to do
75
- cheque_date = '06/05/2022'
76
  stale_cheque = check_if_cheque_is_stale(cheque_date)
77
 
78
- return payee_name,amt_in_words,amt_in_figures,cheque_date,macthing_amts,stale_cheque
 
 
 
 
 
 
 
 
79
 
80
- def spell_correction(amt_in_words):
81
- corrected_amt_in_words =''
82
- words = amt_in_words.split()
83
- words = [word.lower() for word in words]
84
- for word in words:
85
- word = Word(word)
86
- corrected_word = word.correct()+' '
87
- corrected_amt_in_words += corrected_word
88
- return corrected_amt_in_words
89
 
90
  def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
91
  macthing_amts = False
92
  if len(legal_amount) == 0:
93
  return macthing_amts
94
- corrected_amt_in_words = spell_correction(legal_amount)
 
95
  print("corrected_amt_in_words:",corrected_amt_in_words)
96
 
97
  numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
@@ -102,13 +105,12 @@ def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
102
 
103
  def check_if_cheque_is_stale(cheque_issue_date):
104
  stale_check = False
105
- current_date = datetime.now().strftime('%d/%m/%Y')
106
- current_date_ = datetime.strptime(current_date, "%d/%m/%Y")
107
- cheque_issue_date_ = datetime.strptime(cheque_issue_date, "%d/%m/%Y")
108
  relative_diff = relativedelta.relativedelta(current_date_, cheque_issue_date_)
109
  months_difference = (relative_diff.years * 12) + relative_diff.months
110
  print("months_difference:",months_difference)
111
  if months_difference > 3:
112
  stale_check = True
113
- return stale_check
114
-
 
1
  from transformers import DonutProcessor, VisionEncoderDecoderModel
2
+ import pkg_resources
3
+ from symspellpy import SymSpell
4
  from word2number import w2n
5
  from dateutil import relativedelta
6
  from datetime import datetime
7
  from word2number import w2n
 
8
  from PIL import Image
9
  import torch
10
  import re
11
 
12
+ CHEQUE_PARSER_MODEL = "shivi/donut-cheque-parser"
13
+ TASK_PROMPT = "<parse-cheque>"
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
  def load_donut_model_and_processor():
 
22
  def prepare_data_using_processor(donut_processor,image_path):
23
  ## Pass image through donut processor's feature extractor and retrieve image tensor
24
  image = load_image(image_path)
 
25
  pixel_values = donut_processor(image, return_tensors="pt").pixel_values
26
  pixel_values = pixel_values.to(device)
27
 
 
70
 
71
  payee_name = cheque_details_json['cheque_details'][2]['payee_name']
72
 
73
+ bank_name = cheque_details_json['cheque_details'][3]['bank_name']
74
+ cheque_date = cheque_details_json['cheque_details'][4]['cheque_date']
75
+
76
  stale_cheque = check_if_cheque_is_stale(cheque_date)
77
 
78
+ return payee_name,amt_in_words,amt_in_figures,bank_name,cheque_date,macthing_amts,stale_cheque
79
+
80
_SYM_SPELL = None  # lazily-built SymSpell instance, shared by every spell_check() call


def _get_sym_spell():
    """Return a SymSpell checker with the bundled English dictionaries loaded.

    The checker is built on first use and reused afterwards, because reading
    the two frequency dictionaries from disk on every call is wasted work.

    Raises:
        FileNotFoundError: if either dictionary file shipped with symspellpy
            cannot be loaded.
    """
    global _SYM_SPELL
    if _SYM_SPELL is not None:
        return _SYM_SPELL

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    # The unigram dictionary bundled with symspellpy carries an "_en" infix,
    # just like the bigram dictionary below.
    dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    # Both loaders report a missing file by returning False instead of raising;
    # surface that here rather than silently correcting against an empty dictionary.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError("SymSpell dictionary not found: " + dictionary_path)
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
        raise FileNotFoundError("SymSpell bigram dictionary not found: " + bigram_path)

    _SYM_SPELL = sym_spell
    return sym_spell


def spell_check(amt_in_words):
    """Spell-correct the amount-in-words text read from a cheque.

    Args:
        amt_in_words: the legal amount as a (possibly misspelled) string.

    Returns:
        The corrected text of the first suggestion produced by SymSpell's
        compound lookup (edit distance up to 2).

    Raises:
        FileNotFoundError: if the symspellpy dictionaries cannot be loaded.
    """
    suggestions = _get_sym_spell().lookup_compound(amt_in_words, max_edit_distance=2)

    # Only the first suggestion is used; presumably it is the best-ranked one — confirm against symspellpy docs.
    return suggestions[0].term
 
 
 
 
 
 
91
 
92
  def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
93
  macthing_amts = False
94
  if len(legal_amount) == 0:
95
  return macthing_amts
96
+
97
+ corrected_amt_in_words = spell_check(legal_amount)
98
  print("corrected_amt_in_words:",corrected_amt_in_words)
99
 
100
  numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
 
105
 
106
  def check_if_cheque_is_stale(cheque_issue_date):
107
  stale_check = False
108
+ current_date = datetime.now().strftime('%d/%m/%y')
109
+ current_date_ = datetime.strptime(current_date, "%d/%m/%y")
110
+ cheque_issue_date_ = datetime.strptime(cheque_issue_date, "%d/%m/%y")
111
  relative_diff = relativedelta.relativedelta(current_date_, cheque_issue_date_)
112
  months_difference = (relative_diff.years * 12) + relative_diff.months
113
  print("months_difference:",months_difference)
114
  if months_difference > 3:
115
  stale_check = True
116
+ return stale_check