Spaces:

okarachidera
/

CreditCopilot

Sleeping

App Files Files Community

okara chidera committed 15 days ago

Commit

4e36c6c

unverified ·

1 Parent(s): 777a487

chore: refactored code

Browse files

Files changed (15) hide show

__pycache__/app.cpython-313.pyc +0 -0
__pycache__/models.cpython-313.pyc +0 -0
__pycache__/pipelines.cpython-313.pyc +0 -0
__pycache__/policy.cpython-313.pyc +0 -0
__pycache__/rag_store.cpython-313.pyc +0 -0
__pycache__/text_utils.cpython-313.pyc +0 -0
__pycache__/ui.cpython-313.pyc +0 -0
app.py +5 -247
models.py +35 -0
pipelines.py +52 -0
policy.py +90 -0
rag_store.py +60 -0
requirements.txt +4 -0
text_utils.py +32 -0
ui.py +39 -0

__pycache__/app.cpython-313.pyc ADDED Viewed

Binary file (320 Bytes). View file

__pycache__/models.cpython-313.pyc ADDED Viewed

Binary file (1.37 kB). View file

__pycache__/pipelines.cpython-313.pyc ADDED Viewed

Binary file (2.81 kB). View file

__pycache__/policy.cpython-313.pyc ADDED Viewed

Binary file (4.54 kB). View file

__pycache__/rag_store.cpython-313.pyc ADDED Viewed

Binary file (3.57 kB). View file

__pycache__/text_utils.cpython-313.pyc ADDED Viewed

Binary file (1.91 kB). View file

__pycache__/ui.cpython-313.pyc ADDED Viewed

Binary file (2.73 kB). View file

app.py CHANGED Viewed

@@ -1,251 +1,9 @@
-import gradio as gr
-import pdfplumber, re, json, yaml, numpy as np
-from pathlib import Path
-from typing import List, Tuple
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
-from sentence_transformers import SentenceTransformer
-try:
-    import faiss  # type: ignore
-    FAISS_OK = True
-except Exception:
-    FAISS_OK = False
-# ---------------------------
-# Models (CPU-friendly)
-# ---------------------------
-EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-GEN_MODEL_NAME   = "google/flan-t5-base"
-_embed = SentenceTransformer(EMBED_MODEL_NAME)
-_tok   = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
-_gen   = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
-t2t    = pipeline("text2text-generation", model=_gen, tokenizer=_tok, device_map=None)
-# ---------------------------
-# Utils
-# ---------------------------
-def read_pdf_text(fobj) -> str:
-    text = []
-    with pdfplumber.open(fobj.name) as pdf:
-        for p in pdf.pages:
-            text.append(p.extract_text() or "")
-    return "\n".join(text)
-def chunk_text(text: str, max_chars=900, overlap=120) -> List[str]:
-    text = re.sub(r"\s+", " ", text).strip()
-    chunks, i = [], 0
-    while i < len(text):
-        j = min(i + max_chars, len(text))
-        # try to break on sentence end
-        if j < len(text):
-            k = text.rfind(".", i, j)
-            if k != -1 and k > i + 200:
-                j = k + 1
-        chunks.append(text[i:j].strip())
-        i = max(j - overlap, j)
-    return [c for c in chunks if c]
-def embed_texts(texts: List[str]) -> np.ndarray:
-    return _embed.encode(texts, batch_size=32, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
-def cosine_topk(query_vec: np.ndarray, mat: np.ndarray, k=5) -> List[int]:
-    sims = (mat @ query_vec)
-    return np.argsort(-sims)[:k].tolist()
-# ---------------------------
-# Tiny rule engine (YAML)
-# ---------------------------
-DEFAULT_POLICY = """\
-# Example policy rules
-min_credit_score: 620
-max_dti_ratio: 0.45   # debt-to-income
-max_ltv_ratio: 0.80   # loan-to-value
-required_keywords:
-  - "employment verification"
-  - "collateral"
-  - "interest rate"
-"""
-def parse_numeric(pattern: str, text: str, cast=float, scale=1.0):
-    m = re.search(pattern, text, re.I)
-    if not m: return None
-    try:
-        return cast(m.group(1)) * scale
-    except Exception:
-        return None
-def evaluate_policy(all_text: str, policy_yaml: str) -> dict:
-    try:
-        pol = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
-    except Exception:
-        return {"error": "Invalid YAML in policy rules."}
-    report = {"checks": [], "pass": True}
-    # Example numeric fields we try to parse from docs
-    credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
-    dti          = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
-    ltv          = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01) or \
-                   parse_numeric(r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01)
-    # Numeric checks
-    if "min_credit_score" in pol and credit_score is not None:
-        ok = credit_score >= pol["min_credit_score"]
-        report["checks"].append({"rule": f"credit_score ≥ {pol['min_credit_score']}", "observed": credit_score, "ok": ok})
-        report["pass"] &= ok
-    if "max_dti_ratio" in pol and dti is not None:
-        ok = dti <= pol["max_dti_ratio"]
-        report["checks"].append({"rule": f"dti ≤ {pol['max_dti_ratio']}", "observed": dti, "ok": ok})
-        report["pass"] &= ok
-    if "max_ltv_ratio" in pol and ltv is not None:
-        ok = ltv <= pol["max_ltv_ratio"]
-        report["checks"].append({"rule": f"ltv ≤ {pol['max_ltv_ratio']}", "observed": ltv, "ok": ok})
-        report["pass"] &= ok
-    # Keyword presence checks
-    for kw in pol.get("required_keywords", []):
-        present = bool(re.search(re.escape(kw), all_text, re.I))
-        report["checks"].append({"rule": f'require "{kw}"', "observed": "found" if present else "missing", "ok": present})
-        report["pass"] &= present
-    # Notes for missing observables
-    if "min_credit_score" in pol and credit_score is None:
-        report["checks"].append({"rule": "credit_score present", "observed": "not found", "ok": False})
-        report["pass"] = False
-    return report
-def next_actions(policy_report: dict) -> List[str]:
-    actions = []
-    if "error" in policy_report:
-        return ["Fix policy YAML (could not parse)."]
-    for c in policy_report["checks"]:
-        if not c["ok"]:
-            if "credit_score" in c["rule"]:
-                actions.append("Request updated bureau report or alternative credit data.")
-            elif "dti" in c["rule"]:
-                actions.append("Obtain income docs or reduce loan amount to meet DTI.")
-            elif "ltv" in c["rule"]:
-                actions.append("Ask for additional collateral or higher down payment.")
-            elif "require" in c["rule"]:
-                actions.append(f'Add documentation for "{c["rule"].split(chr(34))[1]}".')
-    if not actions:
-        actions.append("Move application to underwriting/approval queue.")
-    return sorted(set(actions))
-# ---------------------------
-# RAG store
-# ---------------------------
-class RAGStore:
-    def __init__(self):
-        self.docs: List[str] = []
-        self.doc_ids: List[Tuple[int,int]] = []  # (file_idx, chunk_idx)
-        self.embs: np.ndarray | None = None
-        self.index = None
-    def ingest(self, files: List[gr.File]) -> Tuple[int,int,str]:
-        self.docs, self.doc_ids = [], []
-        combined_text = []
-        for fi, f in enumerate(files or []):
-            text = read_pdf_text(f)
-            chunks = chunk_text(text)
-            self.docs.extend(chunks)
-            self.doc_ids.extend([(fi, ci) for ci in range(len(chunks))])
-            combined_text.append(text)
-        return len(files or []), len(self.docs), "\n".join(combined_text)
-    def build(self):
-        if not self.docs:
-            return 0
-        self.embs = embed_texts(self.docs).astype("float32")
-        if FAISS_OK:
-            dim = self.embs.shape[1]
-            self.index = faiss.IndexFlatIP(dim)
-            self.index.add(self.embs)
-        return len(self.docs)
-    def search(self, query: str, k=5) -> List[str]:
-        if not self.docs: return []
-        q = embed_texts([query]).astype("float32")[0]
-        if self.index is not None:
-            D, I = self.index.search(np.expand_dims(q,0), k)
-            idxs = I[0].tolist()
-        else:
-            idxs = cosine_topk(q, self.embs, k)
-        return [self.docs[i] for i in idxs if i is not None]
-RAG = RAGStore()
-# ---------------------------
-# Pipelines
-# ---------------------------
-def build_kb(files, policy_text):
-    n_files, n_chunks, all_text = RAG.ingest(files)
-    n_vecs = RAG.build()
-    pol = policy_text or DEFAULT_POLICY
-    return (
-        f"✅ Ingested {n_files} file(s), created {n_chunks} chunk(s), indexed {n_vecs} vector(s).",
-        pol
-    )
-def ask(question, policy_yaml):
-    if not question.strip():
-        return "Please enter a question.", "", ""
-    contexts = RAG.search(question, k=6)
-    context_block = "\n\n".join(contexts[:6]) if contexts else "No context found."
-    prompt = (
-        "You are a credit-analyst assistant. Using ONLY the provided context, "
-        "answer the question concisely and cite key terms. "
-        "Then provide a 3-bullet summary.\n\n"
-        f"Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
-    )
-    answer = t2t(prompt, max_new_tokens=256)[0]["generated_text"]
-    # Policy cross-check on the union of top chunks
-    combined = " ".join(contexts)
-    report = evaluate_policy(combined, policy_yaml or DEFAULT_POLICY)
-    actions = next_actions(report)
-    return answer, json.dumps(report, indent=2), "\n".join(f"• {a}" for a in actions)
-def summarize():
-    if not RAG.docs:
-        return "No documents indexed yet."
-    joined = " ".join(RAG.docs[:18])  # keep prompt small
-    prompt = (
-        "Summarize the loan/application documents: list borrower(s), purpose, amount, "
-        "rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
-        f"{joined}"
-    )
-    return t2t(prompt, max_new_tokens=220)[0]["generated_text"]
-# ---------------------------
-# UI
-# ---------------------------
-with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as demo:
-    gr.Markdown("# 🧠 CreditCopilot\nRetrieval-augmented assistant that summarizes loan documents, checks policy rules, and suggests next actions.")
-    with gr.Row():
-        with gr.Column(scale=1):
-            files = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
-            policy = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
-            build_btn = gr.Button("Build knowledge base", variant="primary")
-            build_status = gr.Markdown()
-            sum_btn = gr.Button("Quick summarize")
-            sum_out = gr.Textbox(label="Portfolio-ready summary", lines=8)
-        with gr.Column(scale=2):
-            q = gr.Textbox(label="Ask a question (e.g., What are the key risks and missing docs?)", lines=2)
-            ask_btn = gr.Button("Ask")
-            ans = gr.Markdown(label="Answer")
-            pol_report = gr.Code(label="Policy check report (JSON)")
-            actions = gr.Markdown(label="Suggested next actions")
-    build_btn.click(build_kb, [files, policy], [build_status, policy])
-    ask_btn.click(ask, [q, policy], [ans, pol_report, actions])
-    sum_btn.click(summarize, None, sum_out)
 if __name__ == "__main__":
-    demo.launch()

+from __future__ import annotations
+from ui import create_interface
+# Build the Gradio Blocks app at import time so `demo` exists as a module-level attribute.
+demo = create_interface()
 if __name__ == "__main__":
+    # Launch the app only when this file is executed as a script.
+    demo.launch()

models.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from __future__ import annotations
+from typing import List
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+# Model identifiers for the sentence embedder and the seq2seq text generator.
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+GEN_MODEL_NAME = "google/flan-t5-base"
+# The objects below are created once, as a side effect of importing this module.
+_embedder = SentenceTransformer(EMBED_MODEL_NAME)
+_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
+_generator_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
+# Shared text2text-generation pipeline wrapping the generator model and its tokenizer.
+_text2text = pipeline(
+    "text2text-generation",
+    model=_generator_model,
+    tokenizer=_tokenizer,
+    device_map=None,
+)
+def embed_texts(texts: List[str]) -> np.ndarray:
+    return _embedder.encode(
+        texts,
+        batch_size=32,
+        show_progress_bar=False,
+        convert_to_numpy=True,
+        normalize_embeddings=True,
+    )
+def generate_text(prompt: str, max_new_tokens: int = 256) -> str:
+    return _text2text(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]

pipelines.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from __future__ import annotations
+import json
+from typing import List, Tuple
+from models import generate_text
+from policy import DEFAULT_POLICY, evaluate_policy, next_actions
+from rag_store import RAG
+def build_kb(files, policy_text: str) -> Tuple[str, str]:
+    n_files, n_chunks, _ = RAG.ingest(files)
+    n_vectors = RAG.build()
+    policy_value = policy_text or DEFAULT_POLICY
+    status = f"✅ Ingested {n_files} file(s), created {n_chunks} chunk(s), indexed {n_vectors} vector(s)."
+    return status, policy_value
+def ask(question: str, policy_yaml: str):
+    if not question.strip():
+        return "Please enter a question.", "", ""
+    contexts = RAG.search(question, k=6)
+    context_block = "\n\n".join(contexts[:6]) if contexts else "No context found."
+    prompt = (
+        "You are a credit-analyst assistant. Using ONLY the provided context, "
+        "answer the question concisely and cite key terms. "
+        "Then provide a 3-bullet summary.\n\n"
+        f"Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
+    )
+    answer = generate_text(prompt, max_new_tokens=256)
+    combined = " ".join(contexts)
+    report = evaluate_policy(combined, policy_yaml or DEFAULT_POLICY)
+    actions = next_actions(report)
+    return answer, json.dumps(report, indent=2), "\n".join(f"• {item}" for item in actions)
+def summarize():
+    if not RAG.docs:
+        return "No documents indexed yet."
+    joined = " ".join(RAG.docs[:18])
+    prompt = (
+        "Summarize the loan/application documents: list borrower(s), purpose, amount, "
+        "rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
+        f"{joined}"
+    )
+    return generate_text(prompt, max_new_tokens=220)

policy.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from __future__ import annotations
+import re
+from typing import Dict, List
+import yaml
+# Default policy rules as YAML text; used whenever the caller supplies no policy of its own.
+DEFAULT_POLICY = """\
+# Example policy rules
+min_credit_score: 620
+max_dti_ratio: 0.45   # debt-to-income
+max_ltv_ratio: 0.80   # loan-to-value
+required_keywords:
+  - "employment verification"
+  - "collateral"
+  - "interest rate"
+"""
+def parse_numeric(pattern: str, text: str, cast=float, scale: float = 1.0):
+    match = re.search(pattern, text, re.I)
+    if not match:
+        return None
+    try:
+        return cast(match.group(1)) * scale
+    except Exception:
+        return None
+def evaluate_policy(all_text: str, policy_yaml: str) -> Dict:
+    try:
+        policy = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
+    except Exception:
+        return {"error": "Invalid YAML in policy rules."}
+    report = {"checks": [], "pass": True}
+    credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
+    dti = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
+    ltv = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01) or parse_numeric(
+        r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01
+    )
+    if "min_credit_score" in policy and credit_score is not None:
+        ok = credit_score >= policy["min_credit_score"]
+        report["checks"].append(
+            {"rule": f"credit_score ≥ {policy['min_credit_score']}", "observed": credit_score, "ok": ok}
+        )
+        report["pass"] &= ok
+    if "max_dti_ratio" in policy and dti is not None:
+        ok = dti <= policy["max_dti_ratio"]
+        report["checks"].append({"rule": f"dti ≤ {policy['max_dti_ratio']}", "observed": dti, "ok": ok})
+        report["pass"] &= ok
+    if "max_ltv_ratio" in policy and ltv is not None:
+        ok = ltv <= policy["max_ltv_ratio"]
+        report["checks"].append({"rule": f"ltv ≤ {policy['max_ltv_ratio']}", "observed": ltv, "ok": ok})
+        report["pass"] &= ok
+    for kw in policy.get("required_keywords", []):
+        present = bool(re.search(re.escape(kw), all_text, re.I))
+        report["checks"].append(
+            {"rule": f'require "{kw}"', "observed": "found" if present else "missing", "ok": present}
+        )
+        report["pass"] &= present
+    if "min_credit_score" in policy and credit_score is None:
+        report["checks"].append({"rule": "credit_score present", "observed": "not found", "ok": False})
+        report["pass"] = False
+    return report
+def next_actions(policy_report: Dict) -> List[str]:
+    actions: List[str] = []
+    if "error" in policy_report:
+        return ["Fix policy YAML (could not parse)."]
+    for check in policy_report["checks"]:
+        if check["ok"]:
+            continue
+        if "credit_score" in check["rule"]:
+            actions.append("Request updated bureau report or alternative credit data.")
+        elif "dti" in check["rule"]:
+            actions.append("Obtain income docs or reduce loan amount to meet DTI.")
+        elif "ltv" in check["rule"]:
+            actions.append("Ask for additional collateral or higher down payment.")
+        elif "require" in check["rule"]:
+            actions.append(f'Add documentation for "{check["rule"].split(chr(34))[1]}".')
+    if not actions:
+        actions.append("Move application to underwriting/approval queue.")
+    return sorted(set(actions))

rag_store.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from __future__ import annotations
+from typing import List, Tuple
+import numpy as np
+from models import embed_texts
+from text_utils import chunk_text, read_pdf_text
+# faiss is optional: FAISS_OK records whether the import succeeded so callers can fall back.
+try:
+    import faiss  # type: ignore
+    FAISS_OK = True
+except Exception:
+    # NOTE(review): presumably broad so a broken native install also triggers the fallback — confirm.
+    FAISS_OK = False
+class RAGStore:
+    def __init__(self):
+        self.docs: List[str] = []
+        self.doc_ids: List[Tuple[int, int]] = []
+        self.embs: np.ndarray | None = None
+        self.index = None
+    def ingest(self, files) -> Tuple[int, int, str]:
+        self.docs, self.doc_ids = [], []
+        combined_text: List[str] = []
+        for file_idx, file in enumerate(files or []):
+            text = read_pdf_text(file)
+            chunks = chunk_text(text)
+            self.docs.extend(chunks)
+            self.doc_ids.extend([(file_idx, chunk_idx) for chunk_idx in range(len(chunks))])
+            combined_text.append(text)
+        return len(files or []), len(self.docs), "\n".join(combined_text)
+    def build(self) -> int:
+        if not self.docs:
+            return 0
+        self.embs = embed_texts(self.docs).astype("float32")
+        if FAISS_OK:
+            dim = self.embs.shape[1]
+            self.index = faiss.IndexFlatIP(dim)
+            self.index.add(self.embs)
+        return len(self.docs)
+    def search(self, query: str, k: int = 5) -> List[str]:
+        if not self.docs:
+            return []
+        query_vec = embed_texts([query]).astype("float32")[0]
+        if self.index is not None:
+            _, indices = self.index.search(np.expand_dims(query_vec, 0), k)
+            ranked_indices = indices[0].tolist()
+        else:
+            sims = self.embs @ query_vec  # type: ignore[operator]
+            ranked_indices = np.argsort(-sims)[:k].tolist()
+        return [self.docs[idx] for idx in ranked_indices if idx is not None]
+RAG = RAGStore()

requirements.txt CHANGED Viewed

@@ -2,5 +2,9 @@ gradio==4.44.1
 easyocr==1.7.1
 torch==2.3.1
 transformers==4.44.2
 Pillow==10.4.0
 numpy==1.26.4

 easyocr==1.7.1
 torch==2.3.1
 transformers==4.44.2
+sentence-transformers==2.2.2
+pdfplumber==0.11.4
+PyYAML==6.0.2
+faiss-cpu==1.7.4
 Pillow==10.4.0
 numpy==1.26.4

text_utils.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from __future__ import annotations
+import re
+from typing import List
+import pdfplumber
+def read_pdf_text(pathlike) -> str:
+    """Return concatenated text from every page of the PDF."""
+    text: List[str] = []
+    with pdfplumber.open(pathlike.name) as pdf:
+        for page in pdf.pages:
+            text.append(page.extract_text() or "")
+    return "\n".join(text)
+def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
+    """Split text into overlapping chunks with light sentence-aware boundaries."""
+    text = re.sub(r"\s+", " ", text).strip()
+    chunks: List[str] = []
+    i = 0
+    while i < len(text):
+        j = min(i + max_chars, len(text))
+        if j < len(text):
+            candidate = text.rfind(".", i, j)
+            if candidate != -1 and candidate > i + 200:
+                j = candidate + 1
+        chunks.append(text[i:j].strip())
+        i = max(j - overlap, j)
+    return [chunk for chunk in chunks if chunk]

ui.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from __future__ import annotations
+import gradio as gr
+from pipelines import ask, build_kb, summarize
+from policy import DEFAULT_POLICY
+def create_interface() -> gr.Blocks:
+    """Build the CreditCopilot Gradio Blocks UI, wire its buttons, and return it (not launched)."""
+    with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as demo:
+        gr.Markdown(
+            "# 🧠 CreditCopilot\nRetrieval-augmented assistant that summarizes loan documents, checks policy rules, and suggests next actions."
+        )
+        with gr.Row():
+            # First column: PDF upload, policy editor, index build and quick summary.
+            with gr.Column(scale=1):
+                files = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
+                policy = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
+                build_btn = gr.Button("Build knowledge base", variant="primary")
+                build_status = gr.Markdown()
+                sum_btn = gr.Button("Quick summarize")
+                sum_out = gr.Textbox(label="Portfolio-ready summary", lines=8)
+            # Second column: question box plus the answer, policy report and next actions.
+            with gr.Column(scale=2):
+                question = gr.Textbox(
+                    label="Ask a question (e.g., What are the key risks and missing docs?)", lines=2
+                )
+                ask_btn = gr.Button("Ask")
+                answer = gr.Markdown(label="Answer")
+                policy_report = gr.Code(label="Policy check report (JSON)")
+                actions = gr.Markdown(label="Suggested next actions")
+        # build_kb returns (status, policy text), so the policy editor is refreshed as well.
+        build_btn.click(build_kb, [files, policy], [build_status, policy])
+        ask_btn.click(ask, [question, policy], [answer, policy_report, actions])
+        sum_btn.click(summarize, None, sum_out)
+    return demo