okara chidera committed on
Commit
4e36c6c
·
unverified ·
1 Parent(s): 777a487

chore: refactored code

Browse files
__pycache__/app.cpython-313.pyc ADDED
Binary file (320 Bytes). View file
 
__pycache__/models.cpython-313.pyc ADDED
Binary file (1.37 kB). View file
 
__pycache__/pipelines.cpython-313.pyc ADDED
Binary file (2.81 kB). View file
 
__pycache__/policy.cpython-313.pyc ADDED
Binary file (4.54 kB). View file
 
__pycache__/rag_store.cpython-313.pyc ADDED
Binary file (3.57 kB). View file
 
__pycache__/text_utils.cpython-313.pyc ADDED
Binary file (1.91 kB). View file
 
__pycache__/ui.cpython-313.pyc ADDED
Binary file (2.73 kB). View file
 
app.py CHANGED
@@ -1,251 +1,9 @@
1
- import gradio as gr
2
- import pdfplumber, re, json, yaml, numpy as np
3
- from pathlib import Path
4
- from typing import List, Tuple
5
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
6
- from sentence_transformers import SentenceTransformer
7
- try:
8
- import faiss # type: ignore
9
- FAISS_OK = True
10
- except Exception:
11
- FAISS_OK = False
12
 
13
- # ---------------------------
14
- # Models (CPU-friendly)
15
- # ---------------------------
16
- EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
17
- GEN_MODEL_NAME = "google/flan-t5-base"
18
 
19
- _embed = SentenceTransformer(EMBED_MODEL_NAME)
20
- _tok = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
21
- _gen = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
22
- t2t = pipeline("text2text-generation", model=_gen, tokenizer=_tok, device_map=None)
23
-
24
- # ---------------------------
25
- # Utils
26
- # ---------------------------
27
- def read_pdf_text(fobj) -> str:
28
- text = []
29
- with pdfplumber.open(fobj.name) as pdf:
30
- for p in pdf.pages:
31
- text.append(p.extract_text() or "")
32
- return "\n".join(text)
33
-
34
- def chunk_text(text: str, max_chars=900, overlap=120) -> List[str]:
35
- text = re.sub(r"\s+", " ", text).strip()
36
- chunks, i = [], 0
37
- while i < len(text):
38
- j = min(i + max_chars, len(text))
39
- # try to break on sentence end
40
- if j < len(text):
41
- k = text.rfind(".", i, j)
42
- if k != -1 and k > i + 200:
43
- j = k + 1
44
- chunks.append(text[i:j].strip())
45
- i = max(j - overlap, j)
46
- return [c for c in chunks if c]
47
-
48
- def embed_texts(texts: List[str]) -> np.ndarray:
49
- return _embed.encode(texts, batch_size=32, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
50
-
51
- def cosine_topk(query_vec: np.ndarray, mat: np.ndarray, k=5) -> List[int]:
52
- sims = (mat @ query_vec)
53
- return np.argsort(-sims)[:k].tolist()
54
-
55
- # ---------------------------
56
- # Tiny rule engine (YAML)
57
- # ---------------------------
58
- DEFAULT_POLICY = """\
59
- # Example policy rules
60
- min_credit_score: 620
61
- max_dti_ratio: 0.45 # debt-to-income
62
- max_ltv_ratio: 0.80 # loan-to-value
63
- required_keywords:
64
- - "employment verification"
65
- - "collateral"
66
- - "interest rate"
67
- """
68
-
69
- def parse_numeric(pattern: str, text: str, cast=float, scale=1.0):
70
- m = re.search(pattern, text, re.I)
71
- if not m: return None
72
- try:
73
- return cast(m.group(1)) * scale
74
- except Exception:
75
- return None
76
-
77
- def evaluate_policy(all_text: str, policy_yaml: str) -> dict:
78
- try:
79
- pol = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
80
- except Exception:
81
- return {"error": "Invalid YAML in policy rules."}
82
-
83
- report = {"checks": [], "pass": True}
84
-
85
- # Example numeric fields we try to parse from docs
86
- credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
87
- dti = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
88
- ltv = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01) or \
89
- parse_numeric(r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", float, float, 0.01)
90
-
91
- # Numeric checks
92
- if "min_credit_score" in pol and credit_score is not None:
93
- ok = credit_score >= pol["min_credit_score"]
94
- report["checks"].append({"rule": f"credit_score ≥ {pol['min_credit_score']}", "observed": credit_score, "ok": ok})
95
- report["pass"] &= ok
96
- if "max_dti_ratio" in pol and dti is not None:
97
- ok = dti <= pol["max_dti_ratio"]
98
- report["checks"].append({"rule": f"dti ≤ {pol['max_dti_ratio']}", "observed": dti, "ok": ok})
99
- report["pass"] &= ok
100
- if "max_ltv_ratio" in pol and ltv is not None:
101
- ok = ltv <= pol["max_ltv_ratio"]
102
- report["checks"].append({"rule": f"ltv ≤ {pol['max_ltv_ratio']}", "observed": ltv, "ok": ok})
103
- report["pass"] &= ok
104
-
105
- # Keyword presence checks
106
- for kw in pol.get("required_keywords", []):
107
- present = bool(re.search(re.escape(kw), all_text, re.I))
108
- report["checks"].append({"rule": f'require "{kw}"', "observed": "found" if present else "missing", "ok": present})
109
- report["pass"] &= present
110
-
111
- # Notes for missing observables
112
- if "min_credit_score" in pol and credit_score is None:
113
- report["checks"].append({"rule": "credit_score present", "observed": "not found", "ok": False})
114
- report["pass"] = False
115
-
116
- return report
117
-
118
- def next_actions(policy_report: dict) -> List[str]:
119
- actions = []
120
- if "error" in policy_report:
121
- return ["Fix policy YAML (could not parse)."]
122
- for c in policy_report["checks"]:
123
- if not c["ok"]:
124
- if "credit_score" in c["rule"]:
125
- actions.append("Request updated bureau report or alternative credit data.")
126
- elif "dti" in c["rule"]:
127
- actions.append("Obtain income docs or reduce loan amount to meet DTI.")
128
- elif "ltv" in c["rule"]:
129
- actions.append("Ask for additional collateral or higher down payment.")
130
- elif "require" in c["rule"]:
131
- actions.append(f'Add documentation for "{c["rule"].split(chr(34))[1]}".')
132
- if not actions:
133
- actions.append("Move application to underwriting/approval queue.")
134
- return sorted(set(actions))
135
-
136
- # ---------------------------
137
- # RAG store
138
- # ---------------------------
139
- class RAGStore:
140
- def __init__(self):
141
- self.docs: List[str] = []
142
- self.doc_ids: List[Tuple[int,int]] = [] # (file_idx, chunk_idx)
143
- self.embs: np.ndarray | None = None
144
- self.index = None
145
-
146
- def ingest(self, files: List[gr.File]) -> Tuple[int,int,str]:
147
- self.docs, self.doc_ids = [], []
148
- combined_text = []
149
- for fi, f in enumerate(files or []):
150
- text = read_pdf_text(f)
151
- chunks = chunk_text(text)
152
- self.docs.extend(chunks)
153
- self.doc_ids.extend([(fi, ci) for ci in range(len(chunks))])
154
- combined_text.append(text)
155
- return len(files or []), len(self.docs), "\n".join(combined_text)
156
-
157
- def build(self):
158
- if not self.docs:
159
- return 0
160
- self.embs = embed_texts(self.docs).astype("float32")
161
- if FAISS_OK:
162
- dim = self.embs.shape[1]
163
- self.index = faiss.IndexFlatIP(dim)
164
- self.index.add(self.embs)
165
- return len(self.docs)
166
-
167
- def search(self, query: str, k=5) -> List[str]:
168
- if not self.docs: return []
169
- q = embed_texts([query]).astype("float32")[0]
170
- if self.index is not None:
171
- D, I = self.index.search(np.expand_dims(q,0), k)
172
- idxs = I[0].tolist()
173
- else:
174
- idxs = cosine_topk(q, self.embs, k)
175
- return [self.docs[i] for i in idxs if i is not None]
176
-
177
- RAG = RAGStore()
178
-
179
- # ---------------------------
180
- # Pipelines
181
- # ---------------------------
182
- def build_kb(files, policy_text):
183
- n_files, n_chunks, all_text = RAG.ingest(files)
184
- n_vecs = RAG.build()
185
- pol = policy_text or DEFAULT_POLICY
186
- return (
187
- f"✅ Ingested {n_files} file(s), created {n_chunks} chunk(s), indexed {n_vecs} vector(s).",
188
- pol
189
- )
190
-
191
- def ask(question, policy_yaml):
192
- if not question.strip():
193
- return "Please enter a question.", "", ""
194
- contexts = RAG.search(question, k=6)
195
- context_block = "\n\n".join(contexts[:6]) if contexts else "No context found."
196
-
197
- prompt = (
198
- "You are a credit-analyst assistant. Using ONLY the provided context, "
199
- "answer the question concisely and cite key terms. "
200
- "Then provide a 3-bullet summary.\n\n"
201
- f"Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
202
- )
203
- answer = t2t(prompt, max_new_tokens=256)[0]["generated_text"]
204
-
205
- # Policy cross-check on the union of top chunks
206
- combined = " ".join(contexts)
207
- report = evaluate_policy(combined, policy_yaml or DEFAULT_POLICY)
208
- actions = next_actions(report)
209
-
210
- return answer, json.dumps(report, indent=2), "\n".join(f"• {a}" for a in actions)
211
-
212
- def summarize():
213
- if not RAG.docs:
214
- return "No documents indexed yet."
215
- joined = " ".join(RAG.docs[:18]) # keep prompt small
216
- prompt = (
217
- "Summarize the loan/application documents: list borrower(s), purpose, amount, "
218
- "rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
219
- f"{joined}"
220
- )
221
- return t2t(prompt, max_new_tokens=220)[0]["generated_text"]
222
-
223
- # ---------------------------
224
- # UI
225
- # ---------------------------
226
- with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as demo:
227
- gr.Markdown("# 🧠 CreditCopilot\nRetrieval-augmented assistant that summarizes loan documents, checks policy rules, and suggests next actions.")
228
-
229
- with gr.Row():
230
- with gr.Column(scale=1):
231
- files = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
232
- policy = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
233
- build_btn = gr.Button("Build knowledge base", variant="primary")
234
- build_status = gr.Markdown()
235
-
236
- sum_btn = gr.Button("Quick summarize")
237
- sum_out = gr.Textbox(label="Portfolio-ready summary", lines=8)
238
-
239
- with gr.Column(scale=2):
240
- q = gr.Textbox(label="Ask a question (e.g., What are the key risks and missing docs?)", lines=2)
241
- ask_btn = gr.Button("Ask")
242
- ans = gr.Markdown(label="Answer")
243
- pol_report = gr.Code(label="Policy check report (JSON)")
244
- actions = gr.Markdown(label="Suggested next actions")
245
-
246
- build_btn.click(build_kb, [files, policy], [build_status, policy])
247
- ask_btn.click(ask, [q, policy], [ans, pol_report, actions])
248
- sum_btn.click(summarize, None, sum_out)
249
 
250
  if __name__ == "__main__":
251
- demo.launch()
 
 
1
+ from __future__ import annotations
 
 
 
 
 
 
 
 
 
 
2
 
3
+ from ui import create_interface
 
 
 
 
4
 
5
+ demo = create_interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  if __name__ == "__main__":
8
+ demo.launch()
9
+
models.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
8
+
9
# Model identifiers passed to the loaders below.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_NAME = "google/flan-t5-base"

# These objects are created once, when the module is imported; embed_texts()
# uses _embedder and generate_text() uses _text2text.
_embedder = SentenceTransformer(EMBED_MODEL_NAME)
_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
_generator_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
# Text-to-text pipeline wrapping the seq2seq model and its tokenizer.
_text2text = pipeline(
    "text2text-generation",
    model=_generator_model,
    tokenizer=_tokenizer,
    device_map=None,
)
21
+
22
+
23
def embed_texts(texts: List[str]) -> np.ndarray:
    """Encode ``texts`` with the module-level sentence embedder.

    The encoder is asked for NumPy output with normalized embeddings, working
    in batches of 32 and without a progress bar.
    """
    encode_options = {
        "batch_size": 32,
        "show_progress_bar": False,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    return _embedder.encode(texts, **encode_options)
31
+
32
+
33
def generate_text(prompt: str, max_new_tokens: int = 256) -> str:
    """Run the text2text pipeline on ``prompt`` and return the first result's text."""
    outputs = _text2text(prompt, max_new_tokens=max_new_tokens)
    first_result = outputs[0]
    return first_result["generated_text"]
35
+
pipelines.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import List, Tuple
5
+
6
+ from models import generate_text
7
+ from policy import DEFAULT_POLICY, evaluate_policy, next_actions
8
+ from rag_store import RAG
9
+
10
+
11
def build_kb(files, policy_text: str) -> Tuple[str, str]:
    """Ingest the uploaded files, index their chunks and describe the result.

    Returns the status line together with the policy text to show; the
    default policy is substituted when ``policy_text`` is empty.
    """
    file_count, chunk_count, _unused_text = RAG.ingest(files)
    vector_count = RAG.build()
    summary = (
        f"✅ Ingested {file_count} file(s), created {chunk_count} chunk(s), "
        f"indexed {vector_count} vector(s)."
    )
    return summary, (policy_text if policy_text else DEFAULT_POLICY)
17
+
18
+
19
def ask(question: str, policy_yaml: str):
    """Answer ``question`` from retrieved context and cross-check policy rules.

    Returns a 3-tuple: the generated answer, the policy report serialized as
    indented JSON, and a bullet list of suggested next actions. A missing or
    blank question short-circuits with a prompt to enter one and two empty
    strings, without touching the store or the generator.
    """
    # Treat None like an empty question instead of raising AttributeError.
    if not (question or "").strip():
        return "Please enter a question.", "", ""

    # search() returns at most k chunks, so no further slicing is needed.
    contexts = RAG.search(question, k=6)
    context_block = "\n\n".join(contexts) if contexts else "No context found."

    prompt = (
        "You are a credit-analyst assistant. Using ONLY the provided context, "
        "answer the question concisely and cite key terms. "
        "Then provide a 3-bullet summary.\n\n"
        f"Context:\n{context_block}\n\nQuestion: {question}\nAnswer:"
    )
    answer = generate_text(prompt, max_new_tokens=256)

    # The policy check runs on the retrieved chunks only, not on every document.
    combined = " ".join(contexts)
    report = evaluate_policy(combined, policy_yaml or DEFAULT_POLICY)
    actions = next_actions(report)

    return answer, json.dumps(report, indent=2), "\n".join(f"• {item}" for item in actions)
39
+
40
+
41
def summarize():
    """Generate a short bullet summary from the first indexed chunks.

    Returns a fixed notice when the store holds no chunks; otherwise the
    first 18 chunks are joined and sent to the generator.
    """
    if RAG.docs:
        excerpt = " ".join(RAG.docs[:18])
        instructions = (
            "Summarize the loan/application documents: list borrower(s), purpose, amount, "
            "rate, tenor, collateral, covenants, key risks. Keep to 7 bullets.\n\n"
        )
        return generate_text(f"{instructions}{excerpt}", max_new_tokens=220)
    return "No documents indexed yet."
52
+
policy.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Dict, List
5
+
6
+ import yaml
7
+
8
+ DEFAULT_POLICY = """\
9
+ # Example policy rules
10
+ min_credit_score: 620
11
+ max_dti_ratio: 0.45 # debt-to-income
12
+ max_ltv_ratio: 0.80 # loan-to-value
13
+ required_keywords:
14
+ - "employment verification"
15
+ - "collateral"
16
+ - "interest rate"
17
+ """
18
+
19
+
20
def parse_numeric(pattern: str, text: str, cast=float, scale: float = 1.0):
    """Extract the first capture group of ``pattern`` from ``text`` as a number.

    The search is case-insensitive. The captured string is converted with
    ``cast`` and multiplied by ``scale``. ``None`` is returned when nothing
    matches or the conversion fails.
    """
    found = re.search(pattern, text, flags=re.IGNORECASE)
    if found is None:
        return None
    try:
        value = cast(found.group(1))
        return value * scale
    except Exception:
        return None
28
+
29
+
30
def evaluate_policy(all_text: str, policy_yaml: str) -> Dict:
    """Check ``all_text`` against the YAML policy and return a report dict.

    The report has the form ``{"checks": [...], "pass": bool}``. Each check
    records the rule, the observed value and whether it passed. When the
    policy cannot be parsed, or is not a mapping,
    ``{"error": "Invalid YAML in policy rules."}`` is returned instead.

    Numeric rules are only evaluated when the matching figure is found in the
    text. A missing credit score fails the report when ``min_credit_score``
    is set.
    """
    try:
        policy = yaml.safe_load(policy_yaml) if policy_yaml.strip() else {}
    except Exception:
        # Deliberately broad: besides YAML syntax errors, the loader can raise
        # plain ValueError for values such as impossible dates.
        return {"error": "Invalid YAML in policy rules."}

    # Comment-only YAML loads as None; a bare list or scalar is not a rule set.
    if policy is None:
        policy = {}
    if not isinstance(policy, dict):
        return {"error": "Invalid YAML in policy rules."}

    report = {"checks": [], "pass": True}

    credit_score = parse_numeric(r"credit score[^0-9]{0,10}(\d{3})", all_text, int)
    dti = parse_numeric(r"\bDTI[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
    ltv = parse_numeric(r"\bLTV[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01)
    if ltv is None:
        # Fall back to the spelled-out form only when the abbreviation is
        # absent; an ``or`` chain would also discard a legitimate 0.0.
        ltv = parse_numeric(
            r"\bloan[- ]to[- ]value[^0-9%]{0,10}(\d{1,2}(?:\.\d+)?)\s*%", all_text, float, 0.01
        )

    if "min_credit_score" in policy and credit_score is not None:
        ok = credit_score >= policy["min_credit_score"]
        report["checks"].append(
            {"rule": f"credit_score ≥ {policy['min_credit_score']}", "observed": credit_score, "ok": ok}
        )
        report["pass"] &= ok
    if "max_dti_ratio" in policy and dti is not None:
        ok = dti <= policy["max_dti_ratio"]
        report["checks"].append({"rule": f"dti ≤ {policy['max_dti_ratio']}", "observed": dti, "ok": ok})
        report["pass"] &= ok
    if "max_ltv_ratio" in policy and ltv is not None:
        ok = ltv <= policy["max_ltv_ratio"]
        report["checks"].append({"rule": f"ltv ≤ {policy['max_ltv_ratio']}", "observed": ltv, "ok": ok})
        report["pass"] &= ok

    # ``required_keywords:`` with no value loads as None, and a single keyword
    # may be written as a bare string rather than a list.
    keywords = policy.get("required_keywords") or []
    if isinstance(keywords, str):
        keywords = [keywords]
    for kw in keywords:
        # str() lets numeric list items (e.g. ``- 2024``) be searched as text.
        present = bool(re.search(re.escape(str(kw)), all_text, re.I))
        report["checks"].append(
            {"rule": f'require "{kw}"', "observed": "found" if present else "missing", "ok": present}
        )
        report["pass"] &= present

    if "min_credit_score" in policy and credit_score is None:
        report["checks"].append({"rule": "credit_score present", "observed": "not found", "ok": False})
        report["pass"] = False

    return report
71
+
72
+
73
def next_actions(policy_report: Dict) -> List[str]:
    """Translate failed policy checks into a sorted, de-duplicated action list.

    A report carrying an ``"error"`` key yields a single "fix the YAML"
    action. When no check failed, the only action is to move the application
    forward.
    """
    if "error" in policy_report:
        return ["Fix policy YAML (could not parse)."]

    keyword_prefix = 'require "'
    actions = set()
    for check in policy_report["checks"]:
        if check["ok"]:
            continue
        rule = check["rule"]
        # Keyword rules are matched first and by prefix: a keyword such as
        # "dti worksheet" would otherwise be caught by the substring tests
        # below and mapped to the wrong action.
        if rule.startswith(keyword_prefix):
            keyword = rule[len(keyword_prefix):]
            if keyword.endswith('"'):
                keyword = keyword[:-1]
            actions.add(f'Add documentation for "{keyword}".')
        elif "credit_score" in rule:
            actions.add("Request updated bureau report or alternative credit data.")
        elif "dti" in rule:
            actions.add("Obtain income docs or reduce loan amount to meet DTI.")
        elif "ltv" in rule:
            actions.add("Ask for additional collateral or higher down payment.")

    if not actions:
        actions.add("Move application to underwriting/approval queue.")
    return sorted(actions)
rag_store.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from models import embed_texts
8
+ from text_utils import chunk_text, read_pdf_text
9
+
10
+ try:
11
+ import faiss # type: ignore
12
+
13
+ FAISS_OK = True
14
+ except Exception:
15
+ FAISS_OK = False
16
+
17
+
18
class RAGStore:
    """In-memory store of text chunks with embedding-based retrieval.

    Chunks are embedded with ``embed_texts``. When FAISS imported
    successfully an inner-product index is used for search; otherwise the
    similarity is computed directly with NumPy.
    """

    def __init__(self):
        self.docs: List[str] = []
        self.doc_ids: List[Tuple[int, int]] = []  # (file_idx, chunk_idx) per chunk
        self.embs: np.ndarray | None = None
        self.index = None

    def ingest(self, files) -> Tuple[int, int, str]:
        """Replace the store's chunks with those extracted from ``files``.

        Returns ``(number of files, number of chunks, combined raw text)``.
        ``files`` may be ``None`` or empty.
        """
        self.docs, self.doc_ids = [], []
        # Vectors from an earlier ingest no longer line up with the new
        # chunks, so drop them until build() runs again.
        self.embs, self.index = None, None

        file_list = files or []
        combined_text: List[str] = []
        for file_idx, file in enumerate(file_list):
            text = read_pdf_text(file)
            chunks = chunk_text(text)
            self.docs.extend(chunks)
            self.doc_ids.extend((file_idx, chunk_idx) for chunk_idx in range(len(chunks)))
            combined_text.append(text)
        return len(file_list), len(self.docs), "\n".join(combined_text)

    def build(self) -> int:
        """Embed the current chunks and (re)build the index.

        Returns the number of chunks embedded, or 0 when the store is empty.
        """
        # Start clean so an empty store never keeps a stale index around.
        self.embs, self.index = None, None
        if not self.docs:
            return 0
        self.embs = embed_texts(self.docs).astype("float32")
        if FAISS_OK:
            index = faiss.IndexFlatIP(self.embs.shape[1])
            index.add(self.embs)
            self.index = index
        return len(self.docs)

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return up to ``k`` chunks ranked by similarity to ``query``.

        Returns an empty list when the store is empty, has not been built
        yet, or ``k`` is not positive.
        """
        if not self.docs or k <= 0:
            return []
        if self.index is None and self.embs is None:
            # ingest() ran without build(): there is nothing to rank against.
            return []

        query_vec = embed_texts([query]).astype("float32")[0]
        if self.index is not None:
            _, indices = self.index.search(np.expand_dims(query_vec, 0), k)
            ranked_indices = indices[0].tolist()
        else:
            sims = self.embs @ query_vec  # type: ignore[operator]
            ranked_indices = np.argsort(-sims)[:k].tolist()

        # FAISS pads the result with -1 when fewer than k vectors exist;
        # without the range check those would silently pick the last chunk.
        total = len(self.docs)
        return [self.docs[idx] for idx in ranked_indices if 0 <= idx < total]
57
+
58
+
59
+ RAG = RAGStore()
60
+
requirements.txt CHANGED
@@ -2,5 +2,9 @@ gradio==4.44.1
2
  easyocr==1.7.1
3
  torch==2.3.1
4
  transformers==4.44.2
 
 
 
 
5
  Pillow==10.4.0
6
  numpy==1.26.4
 
2
  easyocr==1.7.1
3
  torch==2.3.1
4
  transformers==4.44.2
5
+ sentence-transformers==2.2.2
6
+ pdfplumber==0.11.4
7
+ PyYAML==6.0.2
8
+ faiss-cpu==1.7.4
9
  Pillow==10.4.0
10
  numpy==1.26.4
text_utils.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import List
5
+
6
+ import pdfplumber
7
+
8
+
9
def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    ``pathlike`` may be a filesystem path (``str`` or ``os.PathLike``) or an
    object that exposes the path as ``.name``, such as an open file or an
    upload wrapper. Pages with no extractable text contribute an empty line.
    """
    import os  # local import: only needed for the PathLike check

    # A real path is used as-is; reading ``.name`` from a pathlib.Path would
    # return only the file's basename, and a plain str has no ``.name``.
    if isinstance(pathlike, (str, os.PathLike)):
        pdf_path = pathlike
    else:
        pdf_path = pathlike.name

    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join((page.extract_text() or "") for page in pdf.pages)
16
+
17
+
18
def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Whitespace runs are collapsed to single spaces first. Each chunk spans at
    most ``max_chars`` characters. When a chunk would be cut mid-text and a
    period lies more than 200 characters into it, the chunk ends just after
    the last such period. Consecutive chunks share ``overlap`` characters.

    Raises:
        ValueError: if ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    # Keep the overlap strictly below the chunk size so the window advances.
    overlap = max(0, min(overlap, max_chars - 1))

    text = re.sub(r"\s+", " ", text).strip()
    length = len(text)
    chunks: List[str] = []
    start = 0
    while start < length:
        end = min(start + max_chars, length)
        if end < length:
            period = text.rfind(".", start, end)
            if period != -1 and period > start + 200:
                end = period + 1
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        if end >= length:
            # The tail is covered; stepping back by the overlap again would
            # only re-emit a suffix of this chunk.
            break
        # Step back by the overlap, but always move forward by at least one
        # character so the loop terminates.
        start = max(end - overlap, start + 1)
    return chunks
32
+
ui.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gradio as gr
4
+
5
+ from pipelines import ask, build_kb, summarize
6
+ from policy import DEFAULT_POLICY
7
+
8
+
9
def create_interface() -> gr.Blocks:
    """Assemble the CreditCopilot Gradio app and return the Blocks object.

    The left column holds the upload, policy editor, build button and quick
    summary; the right column holds the question box and its three outputs.
    Button clicks are wired to ``build_kb``, ``ask`` and ``summarize``.
    """
    with gr.Blocks(title="CreditCopilot — RAG for Loan Docs") as app:
        gr.Markdown(
            "# 🧠 CreditCopilot\n"
            "Retrieval-augmented assistant that summarizes loan documents, "
            "checks policy rules, and suggests next actions."
        )

        with gr.Row():
            with gr.Column(scale=1):
                pdf_upload = gr.Files(label="Upload loan PDFs", file_count="multiple", file_types=[".pdf"])
                policy_editor = gr.Code(value=DEFAULT_POLICY, language="yaml", label="Policy rules (YAML)")
                build_button = gr.Button("Build knowledge base", variant="primary")
                build_message = gr.Markdown()

                summary_button = gr.Button("Quick summarize")
                summary_box = gr.Textbox(label="Portfolio-ready summary", lines=8)

            with gr.Column(scale=2):
                question_box = gr.Textbox(
                    label="Ask a question (e.g., What are the key risks and missing docs?)",
                    lines=2,
                )
                ask_button = gr.Button("Ask")
                answer_view = gr.Markdown(label="Answer")
                report_view = gr.Code(label="Policy check report (JSON)")
                actions_view = gr.Markdown(label="Suggested next actions")

        # Event wiring: (handler, inputs, outputs).
        build_button.click(build_kb, [pdf_upload, policy_editor], [build_message, policy_editor])
        ask_button.click(ask, [question_box, policy_editor], [answer_view, report_view, actions_view])
        summary_button.click(summarize, None, summary_box)

    return app
39
+