Spaces:

okarachidera
/

CreditCopilot

Sleeping

App Files Files Community

okara chidera committed on 16 days ago

Commit

ffb1971

unverified ·

1 Parent(s): 9910622

feat: added ocr scanning

Browse files

Files changed (2) hide show

app.py +68 -43
requirements.txt +3 -3

app.py CHANGED Viewed

@@ -1,51 +1,76 @@
 import gradio as gr
-import pdfplumber
-from PIL import Image
-import pytesseract
 import re
-def extract_text(file):
-    if not file:
-        return ""
-    if file.name.endswith(".pdf"):
-        text = ""
-        with pdfplumber.open(file.name) as pdf:
-            for page in pdf.pages:
-                text += page.extract_text() or ""
-        return text
-    else:
-        img = Image.open(file.name)
-        return pytesseract.image_to_string(img)
-def analyze_kyc(name, id_doc, selfie_doc, address_doc):
-    all_text = "\n".join([
-        extract_text(f) for f in [id_doc, selfie_doc, address_doc] if f
-    ])
-    findings = {
-        "Applicant Name": name or "N/A",
-        "Email": ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", all_text)) or "N/A",
-        "Phone": ", ".join(re.findall(r"\+?\d[\d\s-]{7,14}", all_text)) or "N/A",
-        "Detected IDs": ", ".join(re.findall(r"\b[A-Z]{1,3}\d{6,10}\b", all_text)) or "N/A",
-        "Possible DOBs": ", ".join(re.findall(r"\b\d{2,4}[-/.]\d{1,2}[-/.]\d{1,2}\b", all_text)) or "N/A",
-    }
-    summary = "\n".join([f"**{k}:** {v}" for k, v in findings.items()])
-    return summary, all_text[:1000]
-demo = gr.Blocks(title="CreditCopilot — KYC Manager")
-with demo:
-    gr.Markdown("## 🧠 CreditCopilot — KYC Manager\nExtract and summarize KYC documents for quick review.")
     with gr.Row():
         with gr.Column(scale=1):
-            name = gr.Textbox(label="Applicant Name", placeholder="Enter full name")
-            id_doc = gr.File(label="Government ID")
-            selfie_doc = gr.File(label="Selfie / Liveness")
-            address_doc = gr.File(label="Proof of Address")
-            run_btn = gr.Button("Analyze Documents", variant="primary")
         with gr.Column(scale=2):
-            summary = gr.Markdown()
-            text_preview = gr.Textbox(label="Extracted Text (Preview)", lines=15)
-    run_btn.click(analyze_kyc, [name, id_doc, selfie_doc, address_doc], [summary, text_preview])
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import easyocr
+from transformers import pipeline
 import re
+import json
+from PIL import Image
+# ---------- INITIALIZE MODELS ----------
+# OCR reader for image text
+reader = easyocr.Reader(["en"], gpu=False)
+# NER model (fine-tuned for named entity extraction)
+ner_pipeline = pipeline("token-classification", model="Davlan/bert-base-multilingual-cased-ner-hrl", aggregation_strategy="simple")
+# ---------- HELPERS ----------
+def extract_text_from_image(image):
+    """Extract text from an uploaded ID or document image using EasyOCR.
+
+    Args:
+        image: The uploaded image. The UI supplies a PIL image
+            (``gr.Image(type="pil")``) or ``None`` when nothing was uploaded.
+            Any other value (file path, bytes, numpy array) is handed to
+            EasyOCR unchanged.
+
+    Returns:
+        The recognized text fragments joined by single spaces, or ``""``
+        when no image was supplied.
+    """
+    if image is None:
+        return ""
+    if isinstance(image, Image.Image):
+        # EasyOCR's readtext() takes paths, bytes and numpy arrays, not
+        # arbitrary PIL images, so convert before handing the image over.
+        import numpy as np  # local import: installed as an EasyOCR dependency
+
+        image = np.array(image.convert("RGB"))
+    result = reader.readtext(image)
+    # Each entry is (bounding_box, text, confidence); keep only the text.
+    return " ".join(r[1] for r in result)
+def extract_with_ner(text):
+    """Extract key identity info using both regex and transformer-based NER.
+
+    Args:
+        text: Raw OCR text. ``None`` is treated as an empty string.
+
+    Returns:
+        A JSON string (indent 2, non-ASCII preserved). "Email", "Phone",
+        "Date" and "Document Numbers" are always present, each a
+        comma-separated string of regex matches or ``null`` when nothing
+        matched. "Full Name", "Issuing Authority" and "Address" appear only
+        when the NER model reports a matching entity; their values are
+        de-duplicated and listed in first-seen order.
+    """
+    text = text or ""
+    extracted = {}
+    # Pre-fill with regex findings
+    extracted["Email"] = ", ".join(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)) or None
+    extracted["Phone"] = ", ".join(re.findall(r"\+?\d[\d\s\-]{7,14}", text)) or None
+    extracted["Date"] = ", ".join(re.findall(r"\d{1,2}[\/\-.]\d{1,2}[\/\-.]\d{2,4}", text)) or None
+    extracted["Document Numbers"] = ", ".join(re.findall(r"[A-Z]{1,3}\d{6,10}", text)) or None
+    # Map NER entity groups onto the output fields they feed.
+    label_to_field = {
+        "PER": "Full Name",
+        "NAME": "Full Name",
+        "ORG": "Issuing Authority",
+        "GOVERNMENT": "Issuing Authority",
+        "ID": "Issuing Authority",
+        "LOC": "Address",
+        "ADDRESS": "Address",
+    }
+    # Blank text cannot contain entities, so skip the model call entirely.
+    entities = ner_pipeline(text) if text.strip() else []
+    # Add entities from transformer
+    for ent in entities:
+        field = label_to_field.get(ent["entity_group"])
+        value = ent["word"].strip()
+        if field is None or not value:
+            continue
+        # A dict used as an ordered set: de-duplicates while keeping
+        # first-seen order, so the output is stable across runs (joining a
+        # plain set gives an arbitrary order).
+        extracted.setdefault(field, {})[value] = None
+    # Convert the ordered sets to strings
+    for key, val in extracted.items():
+        if isinstance(val, dict):
+            extracted[key] = ", ".join(val)
+    return json.dumps(extracted, indent=2, ensure_ascii=False)
+def analyze_kyc_document(image):
+    """Process the uploaded KYC image: OCR first, then field extraction.
+
+    Args:
+        image: The image supplied by the upload component, or ``None`` when
+            the button is clicked before anything has been uploaded.
+
+    Returns:
+        A ``(structured, text)`` tuple: the JSON string produced by
+        ``extract_with_ner`` and the raw OCR text.
+
+    Raises:
+        gr.Error: If no image was uploaded.
+    """
+    if image is None:
+        # Show a readable message in the UI rather than an OCR traceback.
+        raise gr.Error("Please upload a document image before running extraction.")
+    text = extract_text_from_image(image)
+    structured = extract_with_ner(text)
+    return structured, text
+# ---------- UI ----------
+# Build the single-page UI: an upload column on the left and two read-only
+# result boxes on the right.
+with gr.Blocks(title="AI KYC Extractor") as demo:
+    gr.Markdown("## 🧠 AI KYC Document Extractor\nUpload an ID, Passport, or Driver’s License to extract structured data with OCR + AI.")
     with gr.Row():
         with gr.Column(scale=1):
+            # type="pil" means the callback receives a PIL image.
+            doc_input = gr.Image(type="pil", label="Upload Document")
+            extract_btn = gr.Button("Run AI Extraction", variant="primary")
         with gr.Column(scale=2):
+            json_output = gr.Textbox(label="Extracted Information (JSON)", lines=12)
+            text_output = gr.Textbox(label="Extracted Raw Text", lines=10)
+    # analyze_kyc_document returns (structured JSON, raw text), which map
+    # positionally onto the two output textboxes.
+    extract_btn.click(fn=analyze_kyc_document, inputs=doc_input, outputs=[json_output, text_output])
 if __name__ == "__main__":
+    # NOTE(review): share=True requests a public Gradio share link; confirm
+    # that is intended for an app that processes identity documents.
+    demo.launch(share=True)

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-# requirements.txt
 gradio==4.44.1
-pdfplumber
-pytesseract
 Pillow

 gradio==4.44.1
+easyocr
+torch
+transformers
 Pillow