Spaces:
Sleeping
Sleeping
okara chidera
committed on
chore: refactored code
Browse files- __pycache__/app.cpython-313.pyc +0 -0
- __pycache__/models.cpython-313.pyc +0 -0
- __pycache__/pipelines.cpython-313.pyc +0 -0
- __pycache__/policy.cpython-313.pyc +0 -0
- __pycache__/rag_store.cpython-313.pyc +0 -0
- __pycache__/text_utils.cpython-313.pyc +0 -0
- __pycache__/ui.cpython-313.pyc +0 -0
- app.py +5 -247
- models.py +35 -0
- pipelines.py +52 -0
- policy.py +90 -0
- rag_store.py +60 -0
- requirements.txt +4 -0
- text_utils.py +32 -0
- ui.py +39 -0
__pycache__/app.cpython-313.pyc
ADDED
|
Binary file (320 Bytes). View file
|
|
|
__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (1.37 kB). View file
|
|
|
__pycache__/pipelines.cpython-313.pyc
ADDED
|
Binary file (2.81 kB). View file
|
|
|
__pycache__/policy.cpython-313.pyc
ADDED
|
Binary file (4.54 kB). View file
|
|
|
__pycache__/rag_store.cpython-313.pyc
ADDED
|
Binary file (3.57 kB). View file
|
|
|
__pycache__/text_utils.cpython-313.pyc
ADDED
|
Binary file (1.91 kB). View file
|
|
|
__pycache__/ui.cpython-313.pyc
ADDED
|
Binary file (2.73 kB). View file
|
|
|
app.py
CHANGED
|
@@ -1,251 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
import pdfplumber, re, json, yaml, numpy as np
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import List, Tuple
|
| 5 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 6 |
-
from sentence_transformers import SentenceTransformer
|
| 7 |
-
try:
|
| 8 |
-
import faiss # type: ignore
|
| 9 |
-
FAISS_OK = True
|
| 10 |
-
except Exception:
|
| 11 |
-
FAISS_OK = False
|
| 12 |
|
| 13 |
-
|
| 14 |
-
# Models (CPU-friendly)
|
| 15 |
-
# ---------------------------
|
| 16 |
-
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 17 |
-
GEN_MODEL_NAME = "google/flan-t5-base"
|
| 18 |
|
| 19 |
-
|
| 20 |
-
_tok = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
|
| 21 |
-
_gen = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
|
| 22 |
-
t2t = pipeline("text2text-generation", model=_gen, tokenizer=_tok, device_map=None)
|
| 23 |
-
|
| 24 |
-
# ---------------------------
|
| 25 |
-
# Utils
|
| 26 |
-
# ---------------------------
|
| 27 |
-
def read_pdf_text(fobj) -> str:
|
| 28 |
-
text = []
|
| 29 |
-
with pdfplumber.open(fobj.name) as pdf:
|
| 30 |
-
for p in pdf.pages:
|
| 31 |
-
text.append(p.extract_text() or "")
|
| 32 |
-
return "\n".join(text)
|
| 33 |
-
|
| 34 |
-
def chunk_text(text: str, max_chars=900, overlap=120) -> List[str]:
|
| 35 |
-
text = re.sub(r"\s+", " ", text).strip()
|
| 36 |
-
chunks, i = [], 0
|
| 37 |
-
while i < len(text):
|
| 38 |
-
j = min(i + max_chars, len(text))
|
| 39 |
-
# try to break on sentence end
|
| 40 |
-
if j < len(text):
|
| 41 |
-
k = text.rfind(".", i, j)
|
| 42 |
-
if k != -1 and k > i + 200:
|
| 43 |
-
j = k + 1
|
| 44 |
-
chunks.append(text[i:j].strip())
|
| 45 |
-
i = max(j - overlap, j)
|
| 46 |
-
return [c for c in chunks if c]
|
| 47 |
-
|
| 48 |
-
def embed_texts(texts: List[str]) -> np.ndarray:
|
| 49 |
-
return _embed.encode(texts, batch_size=32, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
|
| 50 |
-
|
| 51 |
-
def cosine_topk(query_vec: np.ndarray, mat: np.ndarray, k=5) -> List[int]:
|
| 52 |
-
sims = (mat @ query_vec)
|
| 53 |
-
return np.argsort(-sims)[:k].tolist()
|
| 54 |
-
|
| 55 |
-
# ---------------------------
|
| 56 |
-
# Tiny rule engine (YAML)
|
| 57 |
-
# ---------------------------
|
| 58 |
-
DEFAULT_POLICY = """\
|
| 59 |
-
# Example policy rules
|
| 60 |
-
min_credit_score: 620
|
| 61 |
-
max_dti_ratio: 0.45 # debt-to-income
|
| 62 |
-
max_ltv_ratio: 0.80 # loan-to-value
|
| 63 |
-
required_keywords:
|
| 64 |
-
- "employment verification"
|
| 65 |
-
- "collateral"
|
| 66 |
-
- "interest rate"
|
| 67 |
-
"""
|
| 68 |
-
|
| 69 |
-
def parse_numeric(pattern: str, text: str, cast=float, scale=1.0):
|
| 70 |
-
m = re.search(pattern, text, re.I)
|
| 71 |
-
if not m: return None
|
| 72 |
-
try:
|
| 73 |
-
return cast(m.group(1)) * scale
|
| 74 |
-
except Exception:
|
| 75 |
-
return None
|
| 76 |
-
|
| 77 |
-
def evaluate_policy(all_text: str, policy_yaml: str) -> dict:
|
| 78 |
-
try:
|
| 79 |
-
pol = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
|
| 80 |
-
except Exception:
|
| 81 |
-
return {"error": "Invalid YAML in policy rules."}
|
| 82 |
-
|
| 83 |
-
report = {"checks": [], "pass": True}
|
| 84 |
-
|
| 85 |
-
# Example numeric fields we try to parse from docs
|
| 86 |
-
credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
|
| 87 |
-
dti = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
|
| 88 |
-
ltv = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01) or \
|
| 89 |
-
parse_numeric(r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01)
|
| 90 |
-
|
| 91 |
-
# Numeric checks
|
| 92 |
-
if "min_credit_score" in pol and credit_score is not None:
|
| 93 |
-
ok = credit_score >= pol["min_credit_score"]
|
| 94 |
-
report["checks"].append({"rule": f"credit_score ≥ {pol['min_credit_score']}", "observed": credit_score, "ok": ok})
|
| 95 |
-
report["pass"] &= ok
|
| 96 |
-
if "max_dti_ratio" in pol and dti is not None:
|
| 97 |
-
ok = dti <= pol["max_dti_ratio"]
|
| 98 |
-
report["checks"].append({"rule": f"dti ≤ {pol['max_dti_ratio']}", "observed": dti, "ok": ok})
|
| 99 |
-
report["pass"] &= ok
|
| 100 |
-
if "max_ltv_ratio" in pol and ltv is not None:
|
| 101 |
-
ok = ltv <= pol["max_ltv_ratio"]
|
| 102 |
-
report["checks"].append({"rule": f"ltv ≤ {pol['max_ltv_ratio']}", "observed": ltv, "ok": ok})
|
| 103 |
-
report["pass"] &= ok
|
| 104 |
-
|
| 105 |
-
# Keyword presence checks
|
| 106 |
-
for kw in pol.get("required_keywords", []):
|
| 107 |
-
present = bool(re.search(re.escape(kw), all_text, re.I))
|
| 108 |
-
report["checks"].append({"rule": f'require "{kw}"', "observed": "found" if present else "missing", "ok": present})
|
| 109 |
-
report["pass"] &= present
|
| 110 |
-
|
| 111 |
-
# Notes for missing observables
|
| 112 |
-
if "min_credit_score" in pol and credit_score is None:
|
| 113 |
-
report["checks"].append({"rule": "credit_score present", "observed": "not found", "ok": False})
|
| 114 |
-
report["pass"] = False
|
| 115 |
-
|
| 116 |
-
return report
|
| 117 |
-
|
| 118 |
-
def next_actions(policy_report: dict) -> List[str]:
|
| 119 |
-
actions = []
|
| 120 |
-
if "error" in policy_report:
|
| 121 |
-
return ["Fix policy YAML (could not parse)."]
|
| 122 |
-
for c in policy_report["checks"]:
|
| 123 |
-
if not c["ok"]:
|
| 124 |
-
if "credit_score" in c["rule"]:
|
| 125 |
-
actions.append("Request updated bureau report or alternative credit data.")
|
| 126 |
-
elif "dti" in c["rule"]:
|
| 127 |
-
actions.append("Obtain income docs or reduce loan amount to meet DTI.")
|
| 128 |
-
elif "ltv" in c["rule"]:
|
| 129 |
-
actions.append("Ask for additional collateral or higher down payment.")
|
| 130 |
-
elif "require" in c["rule"]:
|
| 131 |
-
actions.append(f'Add documentation for "{c["rule"].split(chr(34))[1]}".')
|
| 132 |
-
if not actions:
|
| 133 |
-
actions.append("Move application to underwriting/approval queue.")
|
| 134 |
-
return sorted(set(actions))
|
| 135 |
-
|
| 136 |
-
# ---------------------------
|
| 137 |
-
# RAG store
|
| 138 |
-
# ---------------------------
|
| 139 |
-
class RAGStore:
|
| 140 |
-
def __init__(self):
|
| 141 |
-
self.docs: List[str] = []
|
| 142 |
-
self.doc_ids: List[Tuple[int,int]] = [] # (file_idx, chunk_idx)
|
| 143 |
-
self.embs: np.ndarray | None = None
|
| 144 |
-
self.index = None
|
| 145 |
-
|
| 146 |
-
def ingest(self, files: List[gr.File]) -> Tuple[int,int,str]:
|
| 147 |
-
self.docs, self.doc_ids = [], []
|
| 148 |
-
combined_text = []
|
| 149 |
-
for fi, f in enumerate(files or []):
|
| 150 |
-
text = read_pdf_text(f)
|
| 151 |
-
chunks = chunk_text(text)
|
| 152 |
-
self.docs.extend(chunks)
|
| 153 |
-
self.doc_ids.extend([(fi, ci) for ci in range(len(chunks))])
|
| 154 |
-
combined_text.append(text)
|
| 155 |
-
return len(files or []), len(self.docs), "\n".join(combined_text)
|
| 156 |
-
|
| 157 |
-
def build(self):
|
| 158 |
-
if not self.docs:
|
| 159 |
-
return 0
|
| 160 |
-
self.embs = embed_texts(self.docs).astype("float32")
|
| 161 |
-
if FAISS_OK:
|
| 162 |
-
dim = self.embs.shape[1]
|
| 163 |
-
self.index = faiss.IndexFlatIP(dim)
|
| 164 |
-
self.index.add(self.embs)
|
| 165 |
-
return len(self.docs)
|
| 166 |
-
|
| 167 |
-
def search(self, query: str, k=5) -> List[str]:
|
| 168 |
-
if not self.docs: return []
|
| 169 |
-
q = embed_texts([query]).astype("float32")[0]
|
| 170 |
-
if self.index is not None:
|
| 171 |
-
D, I = self.index.search(np.expand_dims(q,0), k)
|
| 172 |
-
idxs = I[0].tolist()
|
| 173 |
-
else:
|
| 174 |
-
idxs = cosine_topk(q, self.embs, k)
|
| 175 |
-
return [self.docs[i] for i in idxs if i is not None]
|
| 176 |
-
|
| 177 |
-
RAG = RAGStore()
|
| 178 |
-
|
| 179 |
-
# ---------------------------
|
| 180 |
-
# Pipelines
|
| 181 |
-
# ---------------------------
|
| 182 |
-
def build_kb(files, policy_text):
|
| 183 |
-
n_files, n_chunks, all_text = RAG.ingest(files)
|
| 184 |
-
n_vecs = RAG.build()
|
| 185 |
-
pol = policy_text or DEFAULT_POLICY
|
| 186 |
-
return (
|
| 187 |
-
f"✅ Ingested {n_files} file(s), created {n_chunks} chunk(s), indexed {n_vecs} vector(s).",
|
| 188 |
-
pol
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
def ask(question, policy_yaml):
|
| 192 |
-
if not question.strip():
|
| 193 |
-
return "Please enter a question.", "", ""
|
| 194 |
-
contexts = RAG.search(question, k=6)
|
| 195 |
-
context_block = "\n\n".join(contexts[:6]) if contexts else "No context found."
|
| 196 |
-
|
| 197 |
-
prompt = (
|
| 198 |
-
"You are a credit-analyst assistant. Using ONLY the provided context, "
|
| 199 |
-
"answer the question concisely and cite key terms. "
|
| 200 |
-
"Then provide a 3-bullet summary.\n\n"
|
| 201 |
-
f"Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
|
| 202 |
-
)
|
| 203 |
-
answer = t2t(prompt, max_new_tokens=256)[0]["generated_text"]
|
| 204 |
-
|
| 205 |
-
# Policy cross-check on the union of top chunks
|
| 206 |
-
combined = " ".join(contexts)
|
| 207 |
-
report = evaluate_policy(combined, policy_yaml or DEFAULT_POLICY)
|
| 208 |
-
actions = next_actions(report)
|
| 209 |
-
|
| 210 |
-
return answer, json.dumps(report, indent=2), "\n".join(f"• {a}" for a in actions)
|
| 211 |
-
|
| 212 |
-
def summarize():
|
| 213 |
-
if not RAG.docs:
|
| 214 |
-
return "No documents indexed yet."
|
| 215 |
-
joined = " ".join(RAG.docs[:18]) # keep prompt small
|
| 216 |
-
prompt = (
|
| 217 |
-
"Summarize the loan/application documents: list borrower(s), purpose, amount, "
|
| 218 |
-
"rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
|
| 219 |
-
f"{joined}"
|
| 220 |
-
)
|
| 221 |
-
return t2t(prompt, max_new_tokens=220)[0]["generated_text"]
|
| 222 |
-
|
| 223 |
-
# ---------------------------
|
| 224 |
-
# UI
|
| 225 |
-
# ---------------------------
|
| 226 |
-
with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as demo:
|
| 227 |
-
gr.Markdown("# 🧠 CreditCopilot\nRetrieval-augmented assistant that summarizes loan documents, checks policy rules, and suggests next actions.")
|
| 228 |
-
|
| 229 |
-
with gr.Row():
|
| 230 |
-
with gr.Column(scale=1):
|
| 231 |
-
files = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
|
| 232 |
-
policy = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
|
| 233 |
-
build_btn = gr.Button("Build knowledge base", variant="primary")
|
| 234 |
-
build_status = gr.Markdown()
|
| 235 |
-
|
| 236 |
-
sum_btn = gr.Button("Quick summarize")
|
| 237 |
-
sum_out = gr.Textbox(label="Portfolio-ready summary", lines=8)
|
| 238 |
-
|
| 239 |
-
with gr.Column(scale=2):
|
| 240 |
-
q = gr.Textbox(label="Ask a question (e.g., What are the key risks and missing docs?)", lines=2)
|
| 241 |
-
ask_btn = gr.Button("Ask")
|
| 242 |
-
ans = gr.Markdown(label="Answer")
|
| 243 |
-
pol_report = gr.Code(label="Policy check report (JSON)")
|
| 244 |
-
actions = gr.Markdown(label="Suggested next actions")
|
| 245 |
-
|
| 246 |
-
build_btn.click(build_kb, [files, policy], [build_status, policy])
|
| 247 |
-
ask_btn.click(ask, [q, policy], [ans, pol_report, actions])
|
| 248 |
-
sum_btn.click(summarize, None, sum_out)
|
| 249 |
|
| 250 |
if __name__ == "__main__":
|
| 251 |
-
demo.launch()
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from ui import create_interface
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
demo = create_interface()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
if __name__ == "__main__":
|
| 8 |
+
demo.launch()
|
| 9 |
+
|
models.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
|
| 8 |
+
|
| 9 |
+
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 10 |
+
GEN_MODEL_NAME = "google/flan-t5-base"
|
| 11 |
+
|
| 12 |
+
_embedder = SentenceTransformer(EMBED_MODEL_NAME)
|
| 13 |
+
_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
|
| 14 |
+
_generator_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
|
| 15 |
+
_text2text = pipeline(
|
| 16 |
+
"text2text-generation",
|
| 17 |
+
model=_generator_model,
|
| 18 |
+
tokenizer=_tokenizer,
|
| 19 |
+
device_map=None,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def embed_texts(texts: List[str]) -> np.ndarray:
    """Encode ``texts`` with the module-level sentence embedder.

    Encoding is requested in batches of 32, without a progress bar, as a
    NumPy array, with normalized embeddings.
    """
    encode_options = {
        "batch_size": 32,
        "show_progress_bar": False,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    vectors = _embedder.encode(texts, **encode_options)
    return vectors
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def generate_text(prompt: str, max_new_tokens: int = 256) -> str:
    """Run the text2text pipeline on ``prompt`` and return its first generated text."""
    outputs = _text2text(prompt, max_new_tokens=max_new_tokens)
    first_result = outputs[0]
    return first_result["generated_text"]
|
| 35 |
+
|
pipelines.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from typing import List, Tuple
|
| 5 |
+
|
| 6 |
+
from models import generate_text
|
| 7 |
+
from policy import DEFAULT_POLICY, evaluate_policy, next_actions
|
| 8 |
+
from rag_store import RAG
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_kb(files, policy_text: str) -> Tuple[str, str]:
    """Ingest the uploaded files, build the vector index, and report the result.

    Returns a ``(status, policy)`` pair: a one-line status message and the
    policy text to display, which is ``DEFAULT_POLICY`` when ``policy_text``
    is empty.
    """
    file_count, chunk_count, _ = RAG.ingest(files)
    vector_count = RAG.build()
    shown_policy = policy_text if policy_text else DEFAULT_POLICY
    summary = (
        f"✅ Ingested {file_count} file(s), created {chunk_count} chunk(s), "
        f"indexed {vector_count} vector(s)."
    )
    return summary, shown_policy
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def ask(question: str, policy_yaml: str):
    """Answer ``question`` from retrieved chunks and cross-check them against policy.

    Returns a 3-tuple: the generated answer, the policy report as indented
    JSON, and the suggested next actions as bullet lines. A blank question
    short-circuits with a prompt to enter one and two empty strings.
    """
    if question.strip() == "":
        return "Please enter a question.", "", ""

    retrieved = RAG.search(question, k=6)
    if retrieved:
        context_block = "\n\n".join(retrieved[:6])
    else:
        context_block = "No context found."

    instructions = (
        "You are a credit-analyst assistant. Using ONLY the provided context, "
        "answer the question concisely and cite key terms. "
        "Then provide a 3-bullet summary.\n\n"
    )
    prompt = f"{instructions}Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
    answer = generate_text(prompt, max_new_tokens=256)

    # The policy check runs on the same retrieved chunks, joined with spaces.
    report = evaluate_policy(" ".join(retrieved), policy_yaml or DEFAULT_POLICY)
    bullet_lines = [f"• {item}" for item in next_actions(report)]

    return answer, json.dumps(report, indent=2), "\n".join(bullet_lines)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def summarize():
    """Generate a summary from the first 18 indexed chunks.

    Returns a fixed notice when the store holds no chunks.
    """
    docs = RAG.docs
    if not docs:
        return "No documents indexed yet."

    instructions = (
        "Summarize the loan/application documents: list borrower(s), purpose, amount, "
        "rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
    )
    excerpt = " ".join(docs[:18])
    return generate_text(instructions + excerpt, max_new_tokens=220)
|
| 52 |
+
|
policy.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, List
|
| 5 |
+
|
| 6 |
+
import yaml
|
| 7 |
+
|
| 8 |
+
DEFAULT_POLICY = """\
|
| 9 |
+
# Example policy rules
|
| 10 |
+
min_credit_score: 620
|
| 11 |
+
max_dti_ratio: 0.45 # debt-to-income
|
| 12 |
+
max_ltv_ratio: 0.80 # loan-to-value
|
| 13 |
+
required_keywords:
|
| 14 |
+
- "employment verification"
|
| 15 |
+
- "collateral"
|
| 16 |
+
- "interest rate"
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def parse_numeric(pattern: str, text: str, cast=float, scale: float = 1.0):
    """Extract the first number matched by ``pattern`` in ``text``.

    The search is case-insensitive. Capture group 1 of the first match is
    converted with ``cast`` and multiplied by ``scale``.

    Args:
        pattern: Regular expression whose group 1 captures the number.
        text: Text to search.
        cast: Callable that converts the captured string (e.g. ``int``).
        scale: Multiplier applied to the converted value.

    Returns:
        The scaled value, or ``None`` when nothing matches or the captured
        text cannot be converted.
    """
    match = re.search(pattern, text, re.I)
    if match is None:
        return None
    try:
        return cast(match.group(1)) * scale
    except (IndexError, TypeError, ValueError, ArithmeticError):
        # IndexError: pattern has no group 1; TypeError: the group did not
        # participate in the match; ValueError/ArithmeticError: bad number.
        # Anything else is a programming error and is allowed to surface.
        return None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def evaluate_policy(all_text: str, policy_yaml: str) -> Dict:
    """Check ``all_text`` against the YAML policy rules and return a report.

    Args:
        all_text: Document text to scan for observed values and keywords.
        policy_yaml: Policy rules as YAML text; blank text means no rules.

    Returns:
        ``{"checks": [...], "pass": bool}``, where each check is a dict with
        ``rule``, ``observed`` and ``ok`` keys. When the YAML cannot be
        parsed, or its top level is not a mapping,
        ``{"error": "Invalid YAML in policy rules."}`` is returned instead.
    """
    try:
        policy = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
    except Exception:
        # Deliberately broad: the policy is user-typed text and any parse
        # failure should surface as a report error, not a crash.
        return {"error": "Invalid YAML in policy rules."}
    if policy is None:
        # A document containing only comments loads as None; treat as no rules.
        policy = {}
    if not isinstance(policy, dict):
        # A top-level list or scalar cannot be looked up by rule name.
        return {"error": "Invalid YAML in policy rules."}

    report = {"checks": [], "pass": True}

    def record(rule: str, observed, ok: bool) -> None:
        """Append one check and fold its outcome into the overall verdict."""
        report["checks"].append({"rule": rule, "observed": observed, "ok": ok})
        report["pass"] = report["pass"] and ok

    credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
    dti = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
    ltv = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
    if ltv is None:
        # Fall back to the spelled-out form only when the abbreviation is
        # absent; an `or` chain would also discard a legitimate 0.0.
        ltv = parse_numeric(
            r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01
        )

    if "min_credit_score" in policy and credit_score is not None:
        record(
            f"credit_score ≥ {policy['min_credit_score']}",
            credit_score,
            credit_score >= policy["min_credit_score"],
        )
    if "max_dti_ratio" in policy and dti is not None:
        record(f"dti ≤ {policy['max_dti_ratio']}", dti, dti <= policy["max_dti_ratio"])
    if "max_ltv_ratio" in policy and ltv is not None:
        record(f"ltv ≤ {policy['max_ltv_ratio']}", ltv, ltv <= policy["max_ltv_ratio"])

    # `required_keywords:` with no value loads as None, and a single scalar
    # would otherwise be iterated character by character.
    keywords = policy.get("required_keywords") or []
    if isinstance(keywords, str):
        keywords = [keywords]
    for kw in keywords:
        present = bool(re.search(re.escape(str(kw)), all_text, re.I))
        record(f'require "{kw}"', "found" if present else "missing", present)

    # A credit-score rule with no observable score fails the application.
    if "min_credit_score" in policy and credit_score is None:
        record("credit_score present", "not found", False)

    return report
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def next_actions(policy_report: Dict) -> List[str]:
    """Turn a policy report into a sorted, de-duplicated list of follow-ups.

    Args:
        policy_report: Either ``{"error": ...}`` or a report whose
            ``"checks"`` list holds dicts with ``rule`` and ``ok`` keys.

    Returns:
        One action per distinct kind of failed check, sorted alphabetically.
        A report with an ``"error"`` key yields a single fix-the-YAML action;
        a report with no failed checks yields a single move-to-underwriting
        action.
    """
    if "error" in policy_report:
        return ["Fix policy YAML (could not parse)."]

    keyword_prefix = 'require "'
    actions = set()
    for check in policy_report.get("checks", []):
        if check["ok"]:
            continue
        rule = check["rule"]
        if rule.startswith(keyword_prefix):
            # Keyword rules are recognised by their prefix first, so a keyword
            # that happens to contain "dti", "ltv" or "credit_score" is not
            # mistaken for a numeric rule. Slicing (rather than splitting on
            # quotes) keeps keywords that themselves contain a double quote.
            keyword = rule[len(keyword_prefix):]
            if keyword.endswith('"'):
                keyword = keyword[:-1]
            actions.add(f'Add documentation for "{keyword}".')
        elif "credit_score" in rule:
            actions.add("Request updated bureau report or alternative credit data.")
        elif "dti" in rule:
            actions.add("Obtain income docs or reduce loan amount to meet DTI.")
        elif "ltv" in rule:
            actions.add("Ask for additional collateral or higher down payment.")

    if not actions:
        return ["Move application to underwriting/approval queue."]
    return sorted(actions)
|
rag_store.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from models import embed_texts
|
| 8 |
+
from text_utils import chunk_text, read_pdf_text
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
import faiss # type: ignore
|
| 12 |
+
|
| 13 |
+
FAISS_OK = True
|
| 14 |
+
except Exception:
|
| 15 |
+
FAISS_OK = False
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RAGStore:
    """In-memory store of text chunks with embedding-based retrieval.

    Chunks are embedded with ``embed_texts``. When FAISS imported successfully
    an inner-product index is built over the vectors; otherwise search falls
    back to a NumPy dot product over the stored embedding matrix.
    """

    def __init__(self):
        self.docs: List[str] = []  # chunk texts, in ingest order
        self.doc_ids: List[Tuple[int, int]] = []  # (file index, chunk index) per chunk
        self.embs: np.ndarray | None = None  # float32 embedding matrix, set by build()
        self.index = None  # FAISS index, set by build() when FAISS is available

    def ingest(self, files) -> Tuple[int, int, str]:
        """Replace the stored chunks with those extracted from ``files``.

        Returns ``(number of files, number of chunks, combined raw text)``.
        ``files`` may be ``None`` or empty.
        """
        self.docs, self.doc_ids = [], []
        # Vectors from a previous corpus no longer line up with self.docs;
        # drop them so search() cannot use a stale index before build().
        self.embs, self.index = None, None
        combined_text: List[str] = []
        for file_idx, file in enumerate(files or []):
            text = read_pdf_text(file)
            chunks = chunk_text(text)
            self.docs.extend(chunks)
            self.doc_ids.extend((file_idx, chunk_idx) for chunk_idx in range(len(chunks)))
            combined_text.append(text)
        return len(files or []), len(self.docs), "\n".join(combined_text)

    def build(self) -> int:
        """Embed every stored chunk and (re)build the index.

        Returns the number of chunks indexed, 0 when the store is empty.
        """
        if not self.docs:
            self.embs, self.index = None, None
            return 0
        self.embs = embed_texts(self.docs).astype("float32")
        if FAISS_OK:
            dim = self.embs.shape[1]
            self.index = faiss.IndexFlatIP(dim)
            self.index.add(self.embs)
        return len(self.docs)

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return up to ``k`` chunks ranked by similarity to ``query``.

        Returns an empty list when nothing has been ingested and built, or
        when ``k`` is not positive.
        """
        if not self.docs or self.embs is None or k <= 0:
            return []
        query_vec = embed_texts([query]).astype("float32")[0]
        # Never ask for more neighbours than there are chunks.
        top_k = min(k, len(self.docs))
        if self.index is not None:
            _, indices = self.index.search(np.expand_dims(query_vec, 0), top_k)
            ranked_indices = indices[0].tolist()
        else:
            sims = self.embs @ query_vec
            ranked_indices = np.argsort(-sims)[:top_k].tolist()
        # FAISS reports missing neighbours as -1, which would silently index
        # the last chunk; keep only real positions.
        return [self.docs[idx] for idx in ranked_indices if 0 <= idx < len(self.docs)]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
RAG = RAGStore()
|
| 60 |
+
|
requirements.txt
CHANGED
|
@@ -2,5 +2,9 @@ gradio==4.44.1
|
|
| 2 |
easyocr==1.7.1
|
| 3 |
torch==2.3.1
|
| 4 |
transformers==4.44.2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
Pillow==10.4.0
|
| 6 |
numpy==1.26.4
|
|
|
|
| 2 |
easyocr==1.7.1
|
| 3 |
torch==2.3.1
|
| 4 |
transformers==4.44.2
|
| 5 |
+
# 2.2.2 imports huggingface_hub.cached_download, which current huggingface_hub releases no longer provide
sentence-transformers==3.0.1
|
| 6 |
+
pdfplumber==0.11.4
|
| 7 |
+
PyYAML==6.0.2
|
| 8 |
+
faiss-cpu==1.7.4
|
| 9 |
Pillow==10.4.0
|
| 10 |
numpy==1.26.4
|
text_utils.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import pdfplumber
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    Args:
        pathlike: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the file's path as ``.name`` (presumably the
            upload objects handed over by the UI; verify against the caller).

    Returns:
        The text of all pages joined with newlines; a page with no
        extractable text contributes an empty string.
    """
    import os  # local import keeps this change self-contained

    if isinstance(pathlike, (str, os.PathLike)):
        # A real path is used whole: str has no .name, and Path.name is only
        # the final path component.
        pdf_path = os.fspath(pathlike)
    else:
        pdf_path = pathlike.name
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Whitespace runs are collapsed to single spaces first. Each chunk is at
    most ``max_chars`` characters; when a chunk would end mid-text and a
    period lies more than 200 characters into it, the chunk ends just after
    the last such period. The next chunk starts ``overlap`` characters before
    the previous chunk's end.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length; must be positive.
        overlap: Characters shared between consecutive chunks; negative
            values are treated as 0.

    Returns:
        The non-empty chunks, in order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    overlap = max(overlap, 0)

    text = re.sub(r"\s+", " ", text).strip()
    total = len(text)
    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            period = text.rfind(".", start, end)
            if period != -1 and period > start + 200:
                end = period + 1
        chunks.append(text[start:end].strip())
        if end >= total:
            break
        # Step back by the overlap, but always advance at least one character
        # so a large overlap cannot stall the loop.
        start = max(end - overlap, start + 1)
    return [chunk for chunk in chunks if chunk]
|
| 32 |
+
|
ui.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from pipelines import ask, build_kb, summarize
|
| 6 |
+
from policy import DEFAULT_POLICY
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def create_interface() -> gr.Blocks:
    """Assemble the CreditCopilot Gradio UI and return the Blocks app.

    The left column holds the PDF upload, the editable YAML policy, the build
    button with its status line, and the quick-summary controls. The right
    column holds the question box and the answer, policy-report and
    next-actions outputs.
    """
    intro = (
        "# 🧠 CreditCopilot\nRetrieval-augmented assistant that summarizes loan documents, "
        "checks policy rules, and suggests next actions."
    )
    with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as app:
        gr.Markdown(intro)

        with gr.Row():
            # Left column: corpus and policy management.
            with gr.Column(scale=1):
                uploads = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
                policy_editor = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
                build_button = gr.Button("Build knowledge base", variant="primary")
                status_line = gr.Markdown()

                summary_button = gr.Button("Quick summarize")
                summary_box = gr.Textbox(label="Portfolio-ready summary", lines=8)

            # Right column: question answering and policy results.
            with gr.Column(scale=2):
                question_box = gr.Textbox(
                    label="Ask a question (e.g., What are the key risks and missing docs?)",
                    lines=2,
                )
                ask_button = gr.Button("Ask")
                answer_view = gr.Markdown(label="Answer")
                report_view = gr.Code(label="Policy check report (JSON)")
                actions_view = gr.Markdown(label="Suggested next actions")

        # Event wiring: each button calls one pipeline function.
        build_button.click(build_kb, [uploads, policy_editor], [status_line, policy_editor])
        ask_button.click(ask, [question_box, policy_editor], [answer_view, report_view, actions_view])
        summary_button.click(summarize, None, summary_box)

    return app
|
| 39 |
+
|