okara chidera committed on
Commit
ffb1971
·
unverified ·
1 Parent(s): 9910622

feat: added ocr scanning

Browse files
Files changed (2) hide show
  1. app.py +68 -43
  2. requirements.txt +3 -3
app.py CHANGED
@@ -1,51 +1,76 @@
1
  import gradio as gr
2
- import pdfplumber
3
- from PIL import Image
4
- import pytesseract
5
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- def extract_text(file):
8
- if not file:
9
- return ""
10
- if file.name.endswith(".pdf"):
11
- text = ""
12
- with pdfplumber.open(file.name) as pdf:
13
- for page in pdf.pages:
14
- text += page.extract_text() or ""
15
- return text
16
- else:
17
- img = Image.open(file.name)
18
- return pytesseract.image_to_string(img)
19
-
20
- def analyze_kyc(name, id_doc, selfie_doc, address_doc):
21
- all_text = "\n".join([
22
- extract_text(f) for f in [id_doc, selfie_doc, address_doc] if f
23
- ])
24
- findings = {
25
- "Applicant Name": name or "N/A",
26
- "Email": ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", all_text)) or "N/A",
27
- "Phone": ", ".join(re.findall(r"\+?\d[\d\s-]{7,14}", all_text)) or "N/A",
28
- "Detected IDs": ", ".join(re.findall(r"\b[A-Z]{1,3}\d{6,10}\b", all_text)) or "N/A",
29
- "Possible DOBs": ", ".join(re.findall(r"\b\d{2,4}[-/.]\d{1,2}[-/.]\d{1,2}\b", all_text)) or "N/A",
30
- }
31
- summary = "\n".join([f"**{k}:** {v}" for k, v in findings.items()])
32
- return summary, all_text[:1000]
33
-
34
- demo = gr.Blocks(title="CreditCopilot — KYC Manager")
35
-
36
- with demo:
37
- gr.Markdown("## 🧠 CreditCopilot — KYC Manager\nExtract and summarize KYC documents for quick review.")
38
  with gr.Row():
39
  with gr.Column(scale=1):
40
- name = gr.Textbox(label="Applicant Name", placeholder="Enter full name")
41
- id_doc = gr.File(label="Government ID")
42
- selfie_doc = gr.File(label="Selfie / Liveness")
43
- address_doc = gr.File(label="Proof of Address")
44
- run_btn = gr.Button("Analyze Documents", variant="primary")
45
  with gr.Column(scale=2):
46
- summary = gr.Markdown()
47
- text_preview = gr.Textbox(label="Extracted Text (Preview)", lines=15)
48
- run_btn.click(analyze_kyc, [name, id_doc, selfie_doc, address_doc], [summary, text_preview])
 
49
 
50
  if __name__ == "__main__":
51
- demo.launch()
 
1
  import gradio as gr
2
+ import easyocr
3
+ from transformers import pipeline
 
4
  import re
5
+ import json
6
+ from PIL import Image
7
+
8
+ # ---------- INITIALIZE MODELS ----------
9
+ # OCR reader for image text
10
+ reader = easyocr.Reader(["en"], gpu=False)
11
+
12
+ # NER model (fine-tuned for named entity extraction)
13
+ ner_pipeline = pipeline("token-classification", model="Davlan/bert-base-multilingual-cased-ner-hrl", aggregation_strategy="simple")
14
+
15
+
16
+ # ---------- HELPERS ----------
17
def extract_text_from_image(image):
    """Extract text from an uploaded ID or document image using EasyOCR.

    Args:
        image: The uploaded document. May be a PIL image (what
            ``gr.Image(type="pil")`` delivers), or anything EasyOCR accepts
            natively (file path / URL string, raw bytes, NumPy array).
            ``None`` means nothing was uploaded.

    Returns:
        All recognised text fragments joined by single spaces, or an empty
        string when no image was supplied.
    """
    # Clicking the button without uploading a file passes None through.
    if image is None:
        return ""

    # EasyOCR rejects generic PIL images ("Invalid input type"), so hand it
    # an ndarray instead. It treats 3-channel arrays as BGR, hence the
    # channel reversal after normalising the mode to RGB.
    if hasattr(image, "convert"):
        import numpy as np

        rgb = np.asarray(image.convert("RGB"))
        image = np.ascontiguousarray(rgb[:, :, ::-1])

    # Each result row is (bounding_box, text, confidence); keep the text.
    result = reader.readtext(image)
    return " ".join(r[1] for r in result)
21
+
22
+
23
def extract_with_ner(text):
    """Extract key identity fields from OCR text and return them as JSON.

    Regex passes always fill ``Email``, ``Phone``, ``Date`` and
    ``Document Numbers`` (``None`` when nothing matches). The module-level
    ``ner_pipeline`` then contributes ``Full Name``, ``Issuing Authority``
    and ``Address``; each of those keys is present only when at least one
    matching entity was found.

    Args:
        text: Raw text to analyse. ``None`` is treated as an empty string.

    Returns:
        A JSON string (2-space indent, non-ASCII characters preserved)
        mapping field names to comma-separated values.
    """
    text = text or ""

    # Pre-fill with regex findings.
    extracted = {
        "Email": ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)) or None,
        "Phone": ", ".join(re.findall(r"\+?\d[\d\s\-]{7,14}", text)) or None,
        "Date": ", ".join(re.findall(r"\d{1,2}[\/\-.]\d{1,2}[\/\-.]\d{2,4}", text)) or None,
        "Document Numbers": ", ".join(re.findall(r"[A-Z]{1,3}\d{6,10}", text)) or None,
    }

    # Blank text has nothing to tag, so skip the model call entirely.
    entities = ner_pipeline(text) if text.strip() else []

    # Entity group emitted by the model -> output field.
    label_to_field = {
        "PER": "Full Name",
        "NAME": "Full Name",
        "ORG": "Issuing Authority",
        "GOVERNMENT": "Issuing Authority",
        "ID": "Issuing Authority",
        "LOC": "Address",
        "ADDRESS": "Address",
    }

    grouped = {}
    for ent in entities:
        field = label_to_field.get(ent["entity_group"])
        value = ent["word"].strip()
        if field is None or not value:
            continue
        # Dict keys de-duplicate like a set but keep first-seen order, so
        # the same input always yields the same output string.
        grouped.setdefault(field, {})[value] = None

    for field, values in grouped.items():
        extracted[field] = ", ".join(values)

    return json.dumps(extracted, indent=2, ensure_ascii=False)
52
+
53
+
54
def analyze_kyc_document(image):
    """Run OCR followed by field extraction on one uploaded KYC image.

    Args:
        image: The uploaded document, passed to
            ``extract_text_from_image`` unchanged.

    Returns:
        A ``(structured_json, raw_text)`` tuple: the JSON string produced by
        ``extract_with_ner`` and the OCR text it was derived from.
    """
    raw_text = extract_text_from_image(image)
    return extract_with_ner(raw_text), raw_text
59
+
60
+
61
+ # ---------- UI ----------
62
# Build the Blocks app first, then populate it inside its context.
demo = gr.Blocks(title="AI KYC Extractor")

with demo:
    gr.Markdown(
        "## 🧠 AI KYC Document Extractor\n"
        "Upload an ID, Passport, or Driver’s License to extract structured data with OCR + AI."
    )

    with gr.Row():
        # Left column: document upload and the trigger button.
        with gr.Column(scale=1):
            doc_input = gr.Image(type="pil", label="Upload Document")
            extract_btn = gr.Button("Run AI Extraction", variant="primary")

        # Right column: structured result above the raw OCR text.
        with gr.Column(scale=2):
            json_output = gr.Textbox(label="Extracted Information (JSON)", lines=12)
            text_output = gr.Textbox(label="Extracted Raw Text", lines=10)

    # One click runs OCR + extraction and fills both text boxes.
    extract_btn.click(analyze_kyc_document, doc_input, [json_output, text_output])


if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link for an app that
    # handles identity documents; confirm this exposure is intended.
    demo.launch(share=True)
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- # requirements.txt
2
  gradio==4.44.1
3
- pdfplumber
4
- pytesseract
 
5
  Pillow
 
 
1
  gradio==4.44.1
2
+ easyocr
3
+ torch
4
+ transformers
5
  Pillow