okara chidera committed on
Commit
1a2fb37
·
unverified ·
1 Parent(s): ffb1971

feat: added OCR scanning

Browse files
Files changed (2) hide show
  1. app.py +20 -15
  2. requirements.txt +1 -0
app.py CHANGED
@@ -3,35 +3,40 @@ import easyocr
3
  from transformers import pipeline
4
  import re
5
  import json
 
6
  from PIL import Image
7
 
8
  # ---------- INITIALIZE MODELS ----------
9
- # OCR reader for image text
10
  reader = easyocr.Reader(["en"], gpu=False)
11
-
12
- # NER model (fine-tuned for named entity extraction)
13
- ner_pipeline = pipeline("token-classification", model="Davlan/bert-base-multilingual-cased-ner-hrl", aggregation_strategy="simple")
14
-
 
15
 
16
  # ---------- HELPERS ----------
17
  def extract_text_from_image(image):
18
- """Extracts text from an uploaded ID or document image using EasyOCR."""
 
 
 
 
19
  result = reader.readtext(image)
20
  return " ".join([r[1] for r in result])
21
 
22
 
23
  def extract_with_ner(text):
24
- """Extracts key identity info using both regex + transformer-based NER."""
25
  entities = ner_pipeline(text)
26
  extracted = {}
27
 
28
- # Pre-fill with regex findings
29
  extracted["Email"] = ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)) or None
30
  extracted["Phone"] = ", ".join(re.findall(r"\+?\d[\d\s\-]{7,14}", text)) or None
31
  extracted["Date"] = ", ".join(re.findall(r"\d{1,2}[\/\-.]\d{1,2}[\/\-.]\d{2,4}", text)) or None
32
  extracted["Document Numbers"] = ", ".join(re.findall(r"[A-Z]{1,3}\d{6,10}", text)) or None
33
 
34
- # Add entities from transformer
35
  for ent in entities:
36
  label = ent["entity_group"]
37
  value = ent["word"].strip()
@@ -43,16 +48,16 @@ def extract_with_ner(text):
43
  elif label in ["LOC", "ADDRESS"]:
44
  extracted.setdefault("Address", set()).add(value)
45
 
46
- # Convert sets to strings
47
- for key, val in extracted.items():
48
- if isinstance(val, set):
49
- extracted[key] = ", ".join(val)
50
 
51
  return json.dumps(extracted, indent=2, ensure_ascii=False)
52
 
53
 
54
  def analyze_kyc_document(image):
55
- """Main function to process the uploaded KYC image."""
56
  text = extract_text_from_image(image)
57
  structured = extract_with_ner(text)
58
  return structured, text
@@ -73,4 +78,4 @@ with gr.Blocks(title="AI KYC Extractor") as demo:
73
  extract_btn.click(fn=analyze_kyc_document, inputs=doc_input, outputs=[json_output, text_output])
74
 
75
  if __name__ == "__main__":
76
- demo.launch(share=True)
 
3
  from transformers import pipeline
4
  import re
5
  import json
6
+ import numpy as np
7
  from PIL import Image
8
 
9
  # ---------- INITIALIZE MODELS ----------
 
10
  reader = easyocr.Reader(["en"], gpu=False)
11
+ ner_pipeline = pipeline(
12
+ "token-classification",
13
+ model="Davlan/bert-base-multilingual-cased-ner-hrl",
14
+ aggregation_strategy="simple"
15
+ )
16
 
17
  # ---------- HELPERS ----------
18
  def extract_text_from_image(image):
19
+ """Extracts text from uploaded ID image using EasyOCR."""
20
+ # Convert PIL image → NumPy array for EasyOCR
21
+ if isinstance(image, Image.Image):
22
+ image = np.array(image)
23
+
24
  result = reader.readtext(image)
25
  return " ".join([r[1] for r in result])
26
 
27
 
28
  def extract_with_ner(text):
29
+ """Extracts KYC details using regex + transformer NER."""
30
  entities = ner_pipeline(text)
31
  extracted = {}
32
 
33
+ # Regex fields
34
  extracted["Email"] = ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)) or None
35
  extracted["Phone"] = ", ".join(re.findall(r"\+?\d[\d\s\-]{7,14}", text)) or None
36
  extracted["Date"] = ", ".join(re.findall(r"\d{1,2}[\/\-.]\d{1,2}[\/\-.]\d{2,4}", text)) or None
37
  extracted["Document Numbers"] = ", ".join(re.findall(r"[A-Z]{1,3}\d{6,10}", text)) or None
38
 
39
+ # Transformer entities
40
  for ent in entities:
41
  label = ent["entity_group"]
42
  value = ent["word"].strip()
 
48
  elif label in ["LOC", "ADDRESS"]:
49
  extracted.setdefault("Address", set()).add(value)
50
 
51
+ # Flatten sets
52
+ for k, v in extracted.items():
53
+ if isinstance(v, set):
54
+ extracted[k] = ", ".join(v)
55
 
56
  return json.dumps(extracted, indent=2, ensure_ascii=False)
57
 
58
 
59
  def analyze_kyc_document(image):
60
+ """Main function to process uploaded KYC image."""
61
  text = extract_text_from_image(image)
62
  structured = extract_with_ner(text)
63
  return structured, text
 
78
  extract_btn.click(fn=analyze_kyc_document, inputs=doc_input, outputs=[json_output, text_output])
79
 
80
  if __name__ == "__main__":
81
+ demo.launch()
requirements.txt CHANGED
@@ -3,3 +3,4 @@ easyocr
3
  torch
4
  transformers
5
  Pillow
 
 
3
  torch
4
  transformers
5
  Pillow
6
+ numpy