Spaces:

okarachidera
/

CreditCopilot

Sleeping

App Files Files Community

okara chidera committed 29 days ago

Commit

2683024

unverified ·

1 Parent(s): b5b36ad

feat: credit copilot

Browse files

Files changed (3) hide show

README.md +24 -14
app.py +243 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,14 +1,24 @@
----
-title: CreditCopilotKYCManager
-emoji: 👁
-colorFrom: yellow
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: Automated KYC verification with AI.
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# CreditCopilot — KYC Manager
+CreditCopilot automates Know Your Customer (KYC) checks by analyzing uploaded documents and producing structured compliance reports.
+**Features**
+- Upload ID, selfie, and address proofs (PDF or images)
+- Extract text and data automatically using OCR
+- Parse name, DOB, ID number, expiry date, email, phone, and address
+- Validate document completeness and consistency
+- Highlight missing or expired documents
+- Generate both human-readable summaries and downloadable JSON reports
+- Optional AI refinement via Hugging Face Inference API
+**Built with:** Python · Gradio · pdfplumber · pytesseract · Hugging Face Hub
+### Optional Variables
+Add these under **Settings → Variables and secrets**:
+- `HF_TOKEN`: Hugging Face token for AI summarization
+- `HF_MODEL`: Optional model (default: `mistralai/Mistral-7B-Instruct-v0.2`)
+### Run locally
+```bash
+pip install -r requirements.txt
+python app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import os, re, json, base64
+import gradio as gr
+from datetime import datetime
+from dataclasses import dataclass, asdict
+from typing import List, Optional, Dict, Any
+# Optional LLM summary (set HF_TOKEN in Space secrets)
+# Both settings are read from the environment once, when this module is imported.
+HF_TOKEN = os.getenv("HF_TOKEN", "")
+HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
+# The LLM path is enabled only when a non-empty token is configured.
+USE_LLM = bool(HF_TOKEN)
+if USE_LLM:
+    # Imported only when the LLM path is enabled; `client` does not exist otherwise.
+    from huggingface_hub import InferenceClient
+    # NOTE(review): the client is built from the token alone; HF_MODEL is not passed here.
+    client = InferenceClient(token=HF_TOKEN)
+# PDF & image parsing
+import pdfplumber
+from PIL import Image
+try:
+    import pytesseract
+    OCR_AVAILABLE = True
+except Exception:
+    OCR_AVAILABLE = False
+# ---------- CONFIG ----------
+# Document categories used both as the UI checkbox choices and as the required checklist.
+REQUIRED_DOCS = ["Government ID", "Selfie / Liveness", "Proof of Address"]
+# Year-first dates only (19xx/20xx), separated by "-", "/" or ".", e.g. 1990-07-04.
+# Month is 1-12 and day is 1-31 with optional leading zeros; calendar validity is not checked.
+DATE_PAT = r"(?:\b(20\d{2}|19\d{2})[-/.](0?[1-9]|1[0-2])[-/.](0?[1-9]|[12]\d|3[01])\b)"
+# Simple local@domain.tld shape with a TLD of at least two letters.
+EMAIL_PAT = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
+# Optional "+" and 1-3 digit prefix, an optional space or hyphen, then 6-14 digits.
+# NOTE(review): there are no word boundaries, so any run of 6+ digits (including part
+# of an ID number) can match as a phone number.
+PHONE_PAT = r"(?:\+?\d{1,3})?[\s-]?\d{6,14}"
+# Either 1-3 uppercase letters followed by 6-12 digits, or a bare run of 8-14 digits.
+ID_PAT = r"\b([A-Z]{1,3}\d{6,12}|[0-9]{8,14})\b"
+@dataclass
+class KYCSummary:
+    """Structured result of one KYC run; serialised to JSON with ``asdict``."""
+    # Name parsed from the documents, or the name typed by the user when none was parsed.
+    applicant_name: Optional[str]
+    # Fields below are parsed from the extracted text and are None when not detected.
+    dob: Optional[str]
+    doc_id_number: Optional[str]
+    doc_type: Optional[str]
+    doc_expiry: Optional[str]
+    email: Optional[str]
+    phone: Optional[str]
+    # First text line that looked like a street address.
+    address_snippet: Optional[str]
+    # Checklist bookkeeping: what is required, what was declared, and the difference.
+    required_docs: List[str]
+    provided_docs: List[str]
+    missing_docs: List[str]
+    # Human-readable warnings produced by the rule evaluation.
+    red_flags: List[str]
+    # Leading slice of the combined extracted text, kept for inspection.
+    extracted_text_preview: str
+    # Timestamp string recorded when the summary was built.
+    generated_at: str
+# ---------- HELPERS ----------
+def read_pdf(file_path: str) -> str:
+    text_chunks = []
+    with pdfplumber.open(file_path) as pdf:
+        for page in pdf.pages:
+            t = page.extract_text() or ""
+            if not t and OCR_AVAILABLE:
+                img = page.to_image(resolution=200).original
+                t = pytesseract.image_to_string(img)
+            text_chunks.append(t)
+    return "\n".join(text_chunks)
+def read_image(file_path: str) -> str:
+    if not OCR_AVAILABLE:
+        return ""
+    img = Image.open(file_path).convert("RGB")
+    return pytesseract.image_to_string(img)
+def extract_text(file) -> str:
+    if not file:
+        return ""
+    _, ext = os.path.splitext(file.name)
+    ext = ext.lower()
+    try:
+        if ext == ".pdf":
+            return read_pdf(file.name)
+        elif ext in [".png", ".jpg", ".jpeg", ".webp"]:
+            return read_image(file.name)
+        return ""
+    except Exception:
+        return ""
+def parse_fields(text: str) -> Dict[str, Optional[str]]:
+    if not text:
+        return {k: None for k in ["name", "dob", "id_number", "doc_type", "doc_expiry", "email", "phone", "address"]}
+    name = None
+    for line in text.splitlines():
+        if "name" in line.lower() and ":" in line:
+            name = line.split(":", 1)[1].strip()
+            break
+    if not name:
+        m = re.search(r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){1,3})\b", text)
+        name = m.group(1) if m else None
+    dob = (re.search(DATE_PAT, text) or [None])
+    dob = dob.group(0) if hasattr(dob, "group") else None
+    email = (re.search(EMAIL_PAT, text) or [None])
+    email = email.group(0) if hasattr(email, "group") else None
+    phone = (re.search(PHONE_PAT, text) or [None])
+    phone = phone.group(0) if hasattr(phone, "group") else None
+    id_number = (re.search(ID_PAT, text) or [None])
+    id_number = id_number.group(1) if hasattr(id_number, "group") else None
+    doc_expiry = None
+    mexp = re.search(r"(exp.*?:?\s*)(%s)" % DATE_PAT, text, flags=re.I)
+    if mexp:
+        doc_expiry = mexp.group(2) or mexp.group(0).split()[-1]
+    address = None
+    for line in text.splitlines():
+        if re.search(r"\b(road|street|st\.|rd\.|avenue|estate|close)\b", line, re.I):
+            address = line.strip()
+            break
+    doc_type = None
+    for label in ["Passport", "Driver", "Voter", "National ID"]:
+        if re.search(label, text, re.I):
+            doc_type = label
+            break
+    return {
+        "name": name,
+        "dob": dob,
+        "id_number": id_number,
+        "doc_type": doc_type,
+        "doc_expiry": doc_expiry,
+        "email": email,
+        "phone": phone,
+        "address": address,
+    }
+def evaluate_rules(parsed: Dict[str, Optional[str]], expected_name: str, provided_docs: List[str]) -> Dict[str, Any]:
+    missing = [d for d in REQUIRED_DOCS if d not in provided_docs]
+    red_flags = []
+    if expected_name and parsed.get("name") and expected_name.lower() not in parsed["name"].lower():
+        red_flags.append("Name mismatch between input and document.")
+    if not parsed.get("dob"):
+        red_flags.append("Date of birth not found.")
+    if parsed.get("doc_expiry"):
+        try:
+            year = int(re.search(r"\d{4}", parsed["doc_expiry"]).group(0))
+            if year < datetime.now().year:
+                red_flags.append("Document may be expired.")
+        except Exception:
+            pass
+    if not parsed.get("address"):
+        red_flags.append("Address not detected.")
+    return {"missing_docs": missing, "red_flags": red_flags}
+def summarize(summary: KYCSummary) -> str:
+    lines = [
+        f"Applicant: **{summary.applicant_name or 'Unknown'}**",
+        f"DOB: {summary.dob or 'N/A'}",
+        f"ID: {summary.doc_type or 'Document'} — {summary.doc_id_number or 'N/A'}",
+        f"Expiry: {summary.doc_expiry or 'N/A'}",
+        f"Email: {summary.email or 'N/A'}",
+        f"Phone: {summary.phone or 'N/A'}",
+        f"Address: {summary.address_snippet or 'N/A'}",
+        f"Provided: {', '.join(summary.provided_docs) or 'None'}",
+        f"Missing: {', '.join(summary.missing_docs) or 'None'}",
+        f"Red Flags: {', '.join(summary.red_flags) or 'None'}",
+    ]
+    return "\n".join(lines)
+def llm_refine(text: str) -> str:
+    if not USE_LLM:
+        return text
+    try:
+        result = client.text_generation(
+            f"Rewrite this KYC review more clearly:\n\n{text}",
+            max_new_tokens=180,
+            temperature=0.2
+        )
+        return result.strip()
+    except Exception:
+        return text
+def run_kyc(name, provided_docs, id_doc, selfie_doc, address_doc):
+    docs = [id_doc, selfie_doc, address_doc]
+    text = "\n\n".join([extract_text(d) for d in docs if d])
+    parsed = parse_fields(text)
+    evals = evaluate_rules(parsed, name, provided_docs)
+    summary = KYCSummary(
+        applicant_name=parsed["name"] or name,
+        dob=parsed["dob"],
+        doc_id_number=parsed["id_number"],
+        doc_type=parsed["doc_type"],
+        doc_expiry=parsed["doc_expiry"],
+        email=parsed["email"],
+        phone=parsed["phone"],
+        address_snippet=parsed["address"],
+        required_docs=REQUIRED_DOCS,
+        provided_docs=provided_docs,
+        missing_docs=evals["missing_docs"],
+        red_flags=evals["red_flags"],
+        extracted_text_preview=text[:800],
+        generated_at=datetime.utcnow().isoformat() + "Z",
+    )
+    human = summarize(summary)
+    human_refined = llm_refine(human)
+    json_report = json.dumps(asdict(summary), indent=2)
+    download = f'<a href="data:application/json;base64,{base64.b64encode(json_report.encode()).decode()}" download="kyc_report.json">Download JSON report</a>'
+    return human_refined, json_report, download
+# ---------- UI ----------
+# Two-column layout: inputs on the left (scale 1), results on the right (scale 2).
+with gr.Blocks(title="CreditCopilot — KYC Manager") as demo:
+    gr.Markdown("## 🧠 CreditCopilot — KYC Management\nUpload KYC documents, extract details, and generate a compliance summary.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            name = gr.Textbox(label="Applicant Name", placeholder="Enter applicant full name")
+            # Checklist the user ticks by hand; "Government ID" is pre-selected.
+            provided_docs = gr.CheckboxGroup(
+                choices=REQUIRED_DOCS, value=["Government ID"], label="Provided Documents"
+            )
+            id_doc = gr.File(label="Government ID (PDF or image)")
+            selfie_doc = gr.File(label="Selfie / Liveness (optional)")
+            address_doc = gr.File(label="Proof of Address (optional)")
+            run_btn = gr.Button("Run KYC Analysis", variant="primary")
+        with gr.Column(scale=2):
+            # Outputs, in the same order as the tuple returned by run_kyc.
+            summary_out = gr.Markdown()
+            json_out = gr.Code(language="json")
+            download_link = gr.HTML()
+    # Input order here must match run_kyc's positional parameters.
+    run_btn.click(
+        fn=run_kyc,
+        inputs=[name, provided_docs, id_doc, selfie_doc, address_doc],
+        outputs=[summary_out, json_out, download_link],
+    )
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==4.44.0
+pdfplumber==0.11.4
+Pillow==10.4.0
+pytesseract==0.3.13
+huggingface_hub==0.24.6