soupstick commited on
Commit
bc2b09a
·
1 Parent(s): ccb470a

chore: commit modular split + updates

Browse files
Files changed (12) hide show
  1. agent.py +27 -1
  2. app.py +172 -1
  3. llm_provider.py +101 -1
  4. mcp.py +40 -1
  5. modules/credit.py +45 -1
  6. modules/kyc.py +48 -1
  7. modules/sanctions.py +41 -1
  8. modules/transactions.py +61 -1
  9. threat_intel.py +17 -1
  10. tools.py +59 -1
  11. ttp_guard.py +67 -1
  12. validation.py +125 -1
agent.py CHANGED
@@ -1 +1,27 @@
1
- """TODO: implement agent.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import List
3
+ from langchain.agents import initialize_agent, AgentType
4
+ from llm_provider import CHAT_LLM, SUMMARY_NOTICE
5
+ from ttp_guard import TTPGuard, GuardDecision
6
+
7
+ AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
8
+ You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
9
+ If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
10
+ Be concise and actionable."""
11
+
12
+ def build_agent(tools: List, guard: TTPGuard):
13
+ if CHAT_LLM is None:
14
+ # Stub agent that returns notice
15
+ class Stub:
16
+ def invoke(self, prompt): return SUMMARY_NOTICE
17
+ return Stub()
18
+
19
+ # Wrap LLM invocation with a guard-aware tool-use policy by leveraging the system message.
20
+ return initialize_agent(
21
+ tools,
22
+ CHAT_LLM,
23
+ agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
24
+ verbose=False,
25
+ agent_kwargs={"system_message": AGENT_SYSTEM},
26
+ handle_parsing_errors=True,
27
+ )
app.py CHANGED
@@ -1 +1,172 @@
1
- """TODO: implement app.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import gradio as gr
3
+ import pandas as pd
4
+
5
+ from llm_provider import CHAT_LLM, SUMMARY_NOTICE
6
+ from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
7
+ from threat_intel import ThreatIntel
8
+ from ttp_guard import TTPGuard, GuardDecision, default_guard
9
+ from modules.transactions import prepare_transactions, detect_transactions
10
+ from modules.kyc import prepare_kyc, detect_kyc
11
+ from modules.sanctions import prepare_sanctions, detect_sanctions, DEMO_SANCTIONS
12
+ from modules.credit import prepare_credit, detect_credit
13
+ from agent import build_agent
14
+ from tools import build_tools
15
+ from langchain.schema import SystemMessage, HumanMessage
16
+
17
+ # ---------- Summarizer ----------
18
+ SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
19
+
20
+ def summarize_ai(context: str) -> str:
21
+ if CHAT_LLM is None:
22
+ return SUMMARY_NOTICE
23
+ # Guard summaries as well (low severity just annotate)
24
+ decision = default_guard.inspect_input(context)
25
+ if decision.action == GuardDecision.BLOCK:
26
+ return f"Blocked by TTP Guard: {decision.reason}"
27
+ try:
28
+ out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
29
+ return getattr(out, "content", str(out))
30
+ except Exception:
31
+ return SUMMARY_NOTICE
32
+
33
+ # ---------- TI + Guard singletons ----------
34
+ TI = ThreatIntel.load() # pulls MCP envs if set, else defaults
35
+ GUARD = default_guard
36
+
37
+ # ---------- Pipelines (tabs) ----------
38
+ def run_transactions(file):
39
+ try:
40
+ from validation import _read_csv_any
41
+ df = _read_csv_any(file)
42
+ clean, issues, quality, colmap = prepare_transactions(df)
43
+ mcc_list = mcp_fetch_high_risk_mcc() or TI.high_risk_mcc
44
+ flagged, stats = detect_transactions(clean, colmap, mcc_list)
45
+ ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
46
+ ai = summarize_ai(ctx)
47
+ return ai, stats, flagged, issues
48
+ except Exception as e:
49
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
50
+
51
+ def run_kyc(file):
52
+ try:
53
+ from validation import _read_csv_any
54
+ df = _read_csv_any(file)
55
+ clean, issues, quality, colmap = prepare_kyc(df)
56
+ flagged, stats = detect_kyc(clean, colmap)
57
+ ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
58
+ ai = summarize_ai(ctx)
59
+ return ai, stats, flagged, issues
60
+ except Exception as e:
61
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
62
+
63
+ def run_sanctions(customers_file, sanctions_file):
64
+ try:
65
+ from validation import _read_csv_any
66
+ df = _read_csv_any(customers_file)
67
+ clean, issues, quality, colmap = prepare_sanctions(df)
68
+ sanc_df = mcp_fetch_sanctions() or ( _read_csv_any(sanctions_file) if sanctions_file else None ) or TI.sanctions_df or DEMO_SANCTIONS
69
+ flagged, stats = detect_sanctions(clean, colmap, sanc_df)
70
+ ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
71
+ ai = summarize_ai(ctx)
72
+ return ai, stats, flagged, issues
73
+ except Exception as e:
74
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
75
+
76
+ def run_credit(file):
77
+ try:
78
+ from validation import _read_csv_any
79
+ df = _read_csv_any(file)
80
+ clean, issues, quality, colmap = prepare_credit(df)
81
+ flagged, stats = detect_credit(clean, colmap)
82
+ ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
83
+ ai = summarize_ai(ctx)
84
+ return ai, stats, flagged, issues
85
+ except Exception as e:
86
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
87
+
88
+ # ---------- Agent & tools ----------
89
+ TOOLS = build_tools()
90
+ AGENT = build_agent(TOOLS, GUARD)
91
+
92
+ def agent_reply(history, user_msg: str):
93
+ # Guard the incoming user message before tool routing
94
+ decision = GUARD.inspect_input(user_msg)
95
+ if decision.action == GuardDecision.BLOCK:
96
+ return f"❌ Blocked by TTP Guard: {decision.reason}"
97
+ try:
98
+ looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
99
+ prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
100
+ res = AGENT.invoke(prompt)
101
+ if isinstance(res, dict) and "output" in res:
102
+ return res["output"]
103
+ return str(res)
104
+ except Exception as e:
105
+ return f"Agent error: {e}"
106
+
107
+ # ---------- UI ----------
108
+ with gr.Blocks(title="Fraud Detector Analyst — LangChain + Fireworks + MCP") as demo:
109
+ gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + Fireworks + MCP")
110
+ with gr.Tabs():
111
+ with gr.Tab("Transactions"):
112
+ f = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
113
+ ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
114
+ st = gr.Textbox(label="Stats", lines=3)
115
+ flagged = gr.Dataframe(label="Flagged Transactions")
116
+ issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
117
+ f.upload(run_transactions, inputs=[f], outputs=[ai, st, flagged, issues])
118
+
119
+ with gr.Tab("KYC"):
120
+ f = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
121
+ ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
122
+ st = gr.Textbox(label="Stats", lines=3)
123
+ flagged = gr.Dataframe(label="Flagged KYC Rows")
124
+ issues = gr.Dataframe(label="Data Quality Issues")
125
+ f.upload(run_kyc, inputs=[f], outputs=[ai, st, flagged, issues])
126
+
127
+ with gr.Tab("Sanctions/PEP"):
128
+ cust = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
129
+ sanc = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
130
+ ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
131
+ st = gr.Textbox(label="Stats", lines=3)
132
+ flagged = gr.Dataframe(label="Matches")
133
+ issues = gr.Dataframe(label="Data Quality Issues")
134
+ cust.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
135
+ sanc.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
136
+
137
+ with gr.Tab("Credit Risk"):
138
+ f = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
139
+ ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
140
+ st = gr.Textbox(label="Stats", lines=3)
141
+ flagged = gr.Dataframe(label="Flagged Applicants")
142
+ issues = gr.Dataframe(label="Data Quality Issues")
143
+ f.upload(run_credit, inputs=[f], outputs=[ai, st, flagged, issues])
144
+
145
+ with gr.Tab("AI Consultant (Agent)"):
146
+ chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
147
+ user_in = gr.Textbox(label="Message or CSV snippet")
148
+ send_btn = gr.Button("Send")
149
+ def _chat_fn(history, msg):
150
+ reply = agent_reply(history, msg)
151
+ history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
152
+ return history, ""
153
+ send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
154
+
155
+ with gr.Tab("Security & TI"):
156
+ gr.Markdown("**TTP Guard policy & latest indicators**")
157
+ gr.JSON(value=GUARD.describe_policy())
158
+ gr.Dataframe(value=TI.sanctions_df.head(10) if TI.sanctions_df is not None else pd.DataFrame({"note":["demo sanctions used"]}),
159
+ label="Sanctions (sample)")
160
+ gr.Dataframe(value=pd.DataFrame({"high_risk_mcc": TI.high_risk_mcc}),
161
+ label="High-risk MCC (current)")
162
+
163
+ gr.Markdown(
164
+ "### ⚙️ Configure\n"
165
+ "- `FIREWORKS_API_KEY` **or** `HF_TOKEN` (provider routing to Fireworks)\n"
166
+ "- `FW_PRIMARY_MODEL` (default openai/gpt-oss-20b), `FW_SECONDARY_MODEL` (default Qwen/Qwen3-Coder-30B-A3B-Instruct)\n"
167
+ "- MCP (optional): `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`\n"
168
+ "- TTP guard thresholds: `TTP_BLOCK_LEVEL` (default 3)\n"
169
+ )
170
+
171
+ if __name__ == "__main__":
172
+ demo.launch(server_name="0.0.0.0", server_port=7860)
llm_provider.py CHANGED
@@ -1 +1,101 @@
1
- """TODO: implement llm_provider.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import os, logging
3
+ from dotenv import load_dotenv
4
+ from huggingface_hub import InferenceClient
5
+ from langchain_core.language_models.chat_models import BaseChatModel
6
+ from langchain.schema import HumanMessage, SystemMessage, AIMessage
7
+ from langchain_core.outputs import ChatGeneration, ChatResult
8
+
9
+ load_dotenv()
10
+ log = logging.getLogger("fraud-analyst")
11
+ logging.basicConfig(level=logging.INFO)
12
+
13
+ FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY") or os.getenv("HF_TOKEN")
14
+ FW_PRIMARY_MODEL = os.getenv("FW_PRIMARY_MODEL", "openai/gpt-oss-20b")
15
+ FW_SECONDARY_MODEL = os.getenv("FW_SECONDARY_MODEL", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
16
+
17
+ SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
18
+
19
+ class FireworksHFChat(BaseChatModel):
20
+ model: str
21
+ api_key: str | None = None
22
+ temperature: float = 0.2
23
+ max_new_tokens: int = 256
24
+ timeout: int = 60
25
+
26
+ def __init__(self, model: str, api_key: str | None):
27
+ super().__init__()
28
+ self.model = model
29
+ self.api_key = api_key
30
+ self._client = InferenceClient(provider="fireworks-ai", api_key=self.api_key)
31
+
32
+ @property
33
+ def _llm_type(self) -> str:
34
+ return "fireworks_hf_chat"
35
+
36
+ def _convert(self, messages):
37
+ out=[]
38
+ for m in messages:
39
+ if isinstance(m, SystemMessage):
40
+ out.append({"role":"system","content":m.content})
41
+ elif isinstance(m, HumanMessage):
42
+ out.append({"role":"user","content":m.content})
43
+ elif isinstance(m, AIMessage):
44
+ out.append({"role":"assistant","content":m.content})
45
+ else:
46
+ out.append({"role":"user","content":str(getattr(m,"content",m))})
47
+ return out
48
+
49
+ def _generate(self, messages, stop=None, run_manager=None, **kwargs) -> ChatResult:
50
+ if not self.api_key:
51
+ gen = ChatGeneration(message=AIMessage(content=""))
52
+ return ChatResult(generations=[gen], llm_output={"error": "no_api_key"})
53
+ try:
54
+ resp = self._client.chat.completions.create(
55
+ model=self.model,
56
+ messages=self._convert(messages),
57
+ stream=False,
58
+ max_tokens=kwargs.get("max_tokens", 256),
59
+ temperature=kwargs.get("temperature", 0.2),
60
+ )
61
+ text = ""
62
+ if hasattr(resp, "choices") and resp.choices:
63
+ ch = resp.choices[0]
64
+ if hasattr(ch, "message") and ch.message and getattr(ch.message, "content", None):
65
+ text = ch.message.content
66
+ elif hasattr(ch, "text") and ch.text:
67
+ text = ch.text
68
+ gen = ChatGeneration(message=AIMessage(content=text or ""))
69
+ return ChatResult(generations=[gen], llm_output={"model": self.model})
70
+ except Exception as e:
71
+ log.warning(f"Fireworks call failed for {self.model}: {type(e).__name__}: {str(e)[:200]}")
72
+ gen = ChatGeneration(message=AIMessage(content=""))
73
+ return ChatResult(generations=[gen], llm_output={"error": str(e)})
74
+
75
+ def _heartbeat(model_id: str) -> bool:
76
+ if not FIREWORKS_API_KEY: return False
77
+ try:
78
+ client = InferenceClient(provider="fireworks-ai", api_key=FIREWORKS_API_KEY)
79
+ _ = client.chat.completions.create(
80
+ model=model_id,
81
+ messages=[{"role":"user","content":"ping"}],
82
+ stream=False,
83
+ max_tokens=1,
84
+ )
85
+ return True
86
+ except Exception as e:
87
+ log.warning(f"Heartbeat failed for {model_id}: {type(e).__name__}: {str(e)[:160]}")
88
+ return False
89
+
90
+ def build_chat_llm():
91
+ log.info(f"Fireworks key present: {bool(FIREWORKS_API_KEY)} len={len(FIREWORKS_API_KEY) if FIREWORKS_API_KEY else 0}")
92
+ if FIREWORKS_API_KEY and _heartbeat(FW_PRIMARY_MODEL):
93
+ log.info(f"Using chat model: {FW_PRIMARY_MODEL}")
94
+ return FireworksHFChat(FW_PRIMARY_MODEL, FIREWORKS_API_KEY)
95
+ if FIREWORKS_API_KEY and _heartbeat(FW_SECONDARY_MODEL):
96
+ log.info(f"Using fallback chat model: {FW_SECONDARY_MODEL}")
97
+ return FireworksHFChat(FW_SECONDARY_MODEL, FIREWORKS_API_KEY)
98
+ log.warning("No working chat model; notice will be shown.")
99
+ return None
100
+
101
+ CHAT_LLM = build_chat_llm()
mcp.py CHANGED
@@ -1 +1,40 @@
1
- """TODO: implement mcp.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import os, json, logging
3
+ from typing import Optional, List
4
+ import pandas as pd
5
+ from urllib.request import Request, urlopen
6
+
7
+ log = logging.getLogger("fraud-analyst")
8
+
9
+ def _mcp_get_json(url: str, auth_header: Optional[str]):
10
+ try:
11
+ req = Request(url)
12
+ if auth_header:
13
+ k, v = auth_header.split(":", 1)
14
+ req.add_header(k.strip(), v.strip())
15
+ with urlopen(req, timeout=10) as r:
16
+ return json.loads(r.read().decode("utf-8"))
17
+ except Exception as e:
18
+ log.warning(f"MCP fetch failed: {e}")
19
+ return None
20
+
21
+ def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
22
+ if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
23
+ url = os.getenv("MCP_SANCTIONS_URL")
24
+ if not url: return None
25
+ data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
26
+ if not data: return None
27
+ if isinstance(data, list):
28
+ if all(isinstance(x, dict) for x in data):
29
+ rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
30
+ return pd.DataFrame(rows) if rows else None
31
+ if all(isinstance(x, str) for x in data):
32
+ return pd.DataFrame({"name": data})
33
+ return None
34
+
35
+ def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
36
+ if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
37
+ url = os.getenv("MCP_HIGH_RISK_MCC_URL")
38
+ if not url: return None
39
+ data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
40
+ return [str(x) for x in data] if isinstance(data, list) else None
modules/credit.py CHANGED
@@ -1 +1,45 @@
1
- """TODO: implement modules/credit.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import pandas as pd, numpy as np
3
+ from typing import Dict
4
+ from validation import _prepare_generic
5
+
6
+ CR_EXPECTED = {
7
+ "customer_id":["cust_id","user_id","client_id"],
8
+ "credit_score":["creditscore","score"],
9
+ "utilization":["util","credit_utilization","utilization_ratio"],
10
+ "dti":["debt_to_income","debt_to_income_ratio"],
11
+ "recent_defaults":["defaults","recentdefaults"],
12
+ "income":["annual_income","salary"]
13
+ }
14
+
15
+ def prepare_credit(df: pd.DataFrame):
16
+ return _prepare_generic(df, CR_EXPECTED)
17
+
18
+ def detect_credit(clean_df: pd.DataFrame, colmap: Dict[str,str]):
19
+ needed = ["credit_score","utilization","dti","recent_defaults","income"]
20
+ if not any(k in colmap for k in needed):
21
+ return pd.DataFrame(), "Required columns missing for Credit Risk."
22
+ df = clean_df.copy()
23
+ cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
24
+ util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
25
+ dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
26
+ rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
27
+ inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
28
+ out=[]
29
+ for i in range(len(df)):
30
+ hits=0; reasons=[]
31
+ if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
32
+ if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
33
+ if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
34
+ if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
35
+ if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
36
+ level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
37
+ out.append((hits, level, ", ".join(reasons)))
38
+ res = df.assign(
39
+ risk_score=[x[0] for x in out],
40
+ risk_level=[x[1] for x in out],
41
+ risk_reason=[x[2] for x in out]
42
+ )
43
+ flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
44
+ stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
45
+ return flagged, stats
modules/kyc.py CHANGED
@@ -1 +1,48 @@
1
- """TODO: implement modules/kyc.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import pandas as pd
3
+ from typing import Dict
4
+ from validation import _prepare_generic
5
+ import numpy as np
6
+
7
+ KYC_EXPECTED = {
8
+ "customer_id":["cust_id","user_id","client_id"],
9
+ "name":["full_name","customer_name"],
10
+ "email":["email_address","mail"],
11
+ "phone":["phone_number","mobile","contact"],
12
+ "dob":["date_of_birth","birthdate"]
13
+ }
14
+
15
+ def prepare_kyc(df: pd.DataFrame):
16
+ return _prepare_generic(df, KYC_EXPECTED)
17
+
18
+ def _age_years(dob: pd.Series) -> pd.Series:
19
+ now = pd.Timestamp.utcnow()
20
+ return (now - dob).dt.days / 365.25
21
+
22
+ def detect_kyc(clean_df: pd.DataFrame, colmap: Dict[str,str]):
23
+ if not all(k in colmap for k in ["customer_id","name"]):
24
+ return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
25
+ df = clean_df.copy()
26
+ reasons=[]
27
+ if "email" in colmap:
28
+ dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
29
+ reasons.append(dupe_email)
30
+ if "phone" in colmap:
31
+ dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
32
+ reasons.append(dupe_phone)
33
+ if "dob" in colmap:
34
+ age = _age_years(df[colmap["dob"]])
35
+ invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
36
+ reasons.append(invalid)
37
+ if "name" in colmap:
38
+ name = df[colmap["name"]].astype(str)
39
+ susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
40
+ reasons.append(susp)
41
+ mask=None
42
+ for m in reasons:
43
+ mask = m if mask is None else (mask | m)
44
+ flagged = df[mask] if mask is not None else pd.DataFrame()
45
+ if not flagged.empty:
46
+ flagged = flagged.assign(risk_reason="kyc_rule_hit")
47
+ stats = f"KYC flagged: {len(flagged)} of {len(df)}."
48
+ return flagged, stats
modules/sanctions.py CHANGED
@@ -1 +1,41 @@
1
- """TODO: implement modules/sanctions.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re, pandas as pd
3
+ from typing import Optional, Dict
4
+ from validation import _prepare_generic, _standardize_df
5
+
6
+ SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
7
+
8
+ def prepare_sanctions(df: pd.DataFrame):
9
+ return _prepare_generic(df, SAN_EXPECTED)
10
+
11
+ DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
12
+
13
+ def token_overlap(a: str, b: str) -> int:
14
+ at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
15
+ bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
16
+ return len(at & bt)
17
+
18
+ def detect_sanctions(clean_df: pd.DataFrame, colmap: Dict[str,str], sanctions_df: Optional[pd.DataFrame]=None):
19
+ if "name" not in colmap:
20
+ return pd.DataFrame(), "Required column missing for Sanctions (need name)."
21
+ df = clean_df.copy()
22
+ sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
23
+ sanc = _standardize_df(sanc)
24
+ if "name" not in sanc.columns:
25
+ for c in sanc.columns:
26
+ if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
27
+ sanc_names = sanc["name"].dropna().astype(str).tolist()
28
+ matches=[]
29
+ for idx, row in df.iterrows():
30
+ nm = str(row[colmap["name"]] or "").strip()
31
+ if not nm: continue
32
+ if any(nm.lower()==s.lower() for s in sanc_names):
33
+ matches.append((idx,"exact")); continue
34
+ if any(token_overlap(nm, s) >= 2 for s in sanc_names):
35
+ matches.append((idx,"fuzzy"))
36
+ flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
37
+ if not flagged.empty:
38
+ mt = {i:t for i,t in matches}
39
+ flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
40
+ stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
41
+ return flagged, stats
modules/transactions.py CHANGED
@@ -1 +1,61 @@
1
- """TODO: implement modules/transactions.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import pandas as pd
3
+ from typing import Optional, List, Dict
4
+ from validation import _prepare_generic, _nfkc
5
+
6
+ TX_EXPECTED = {
7
+ "transaction_id":["txn_id","transactionid","id","tx_id"],
8
+ "customer_id":["cust_id","user_id","client_id"],
9
+ "amount":["amt","amount_inr","value"],
10
+ "timestamp":["date","event_time","created_at","tx_time"],
11
+ "merchant_category":["mcc","merchant_cat","category"]
12
+ }
13
+
14
+ def prepare_transactions(df: pd.DataFrame):
15
+ return _prepare_generic(df, TX_EXPECTED)
16
+
17
+ def detect_transactions(clean_df: pd.DataFrame, colmap: Dict[str,str], high_risk_mcc: Optional[List[str]] = None):
18
+ high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
19
+ if high_risk_mcc:
20
+ high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
21
+ if not all(k in colmap for k in ["customer_id","amount"]):
22
+ return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
23
+ df = clean_df.copy()
24
+ reasons = []
25
+ amtcol = colmap.get("amount")
26
+ if amtcol:
27
+ reasons.append(df[amtcol] > 10000) # large
28
+ reasons.append(df[amtcol] < 0) # negative
29
+ if "merchant_category" in colmap:
30
+ mcc = colmap["merchant_category"]
31
+ high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
32
+ reasons.append(high)
33
+ if all(k in colmap for k in ["customer_id","timestamp","amount"]):
34
+ cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
35
+ daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
36
+ reasons.append(daily > 50000)
37
+ mask = None
38
+ for m in reasons:
39
+ mask = m if mask is None else (mask | m)
40
+ flagged = df[mask] if mask is not None else pd.DataFrame()
41
+ if not flagged.empty:
42
+ rr=[]
43
+ for _, row in flagged.iterrows():
44
+ hits=[]
45
+ if amtcol:
46
+ a=row[amtcol]
47
+ if pd.notna(a) and a>10000: hits.append("large_amount")
48
+ if pd.notna(a) and a<0: hits.append("negative_amount")
49
+ if "merchant_category" in colmap:
50
+ val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
51
+ if val in high_risk: hits.append("mcc_high_risk")
52
+ try:
53
+ if all(k in colmap for k in ["customer_id","timestamp","amount"]):
54
+ sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
55
+ (df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
56
+ if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
57
+ except Exception: pass
58
+ rr.append(", ".join(sorted(set(hits))) or "rule_hit")
59
+ flagged = flagged.assign(risk_reason=rr)
60
+ stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
61
+ return flagged, stats
threat_intel.py CHANGED
@@ -1 +1,17 @@
1
- """TODO: implement threat_intel.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Optional, List
4
+ import pandas as pd
5
+ from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
6
+ from modules.sanctions import DEMO_SANCTIONS
7
+
8
+ @dataclass
9
+ class ThreatIntel:
10
+ sanctions_df: Optional[pd.DataFrame]
11
+ high_risk_mcc: List[str]
12
+
13
+ @staticmethod
14
+ def load() -> "ThreatIntel":
15
+ sanc = mcp_fetch_sanctions()
16
+ mcc = mcp_fetch_high_risk_mcc() or ["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"]
17
+ return ThreatIntel(sanctions_df=sanc or DEMO_SANCTIONS, high_risk_mcc=mcc)
tools.py CHANGED
@@ -1 +1,59 @@
1
- """TODO: implement tools.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import io, pandas as pd
3
+ from pydantic import BaseModel, Field
4
+ from langchain.tools import tool
5
+
6
+ from modules.transactions import prepare_transactions, detect_transactions
7
+ from modules.kyc import prepare_kyc, detect_kyc
8
+ from modules.sanctions import prepare_sanctions, detect_sanctions
9
+ from modules.credit import prepare_credit, detect_credit
10
+
11
+ def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
12
+ return pd.read_csv(io.StringIO(csv_text))
13
+
14
+ class TransactionCSVInput(BaseModel):
15
+ csv_text: str = Field(..., description="Transactions CSV text")
16
+
17
+ @tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
18
+ def transactions_fraud_tool(csv_text: str) -> str:
19
+ """Analyze transactions CSV: large/negative amounts, high-risk MCCs, per-customer daily sum >50k. Returns counts + sample."""
20
+ df = _csv_text_to_df(csv_text)
21
+ clean, issues, quality, colmap = prepare_transactions(df)
22
+ flagged, stats = detect_transactions(clean, colmap)
23
+ return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
24
+
25
+ class KYCCSVInput(BaseModel):
26
+ csv_text: str = Field(..., description="KYC CSV text")
27
+
28
+ @tool("kyc_fraud_tool", args_schema=KYCCSVInput)
29
+ def kyc_fraud_tool(csv_text: str) -> str:
30
+ """Analyze KYC CSV: duplicate email/phone, invalid DOBs, suspicious names. Returns counts + sample."""
31
+ df = _csv_text_to_df(csv_text)
32
+ clean, issues, quality, colmap = prepare_kyc(df)
33
+ flagged, stats = detect_kyc(clean, colmap)
34
+ return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
35
+
36
+ class SanctionsCSVInput(BaseModel):
37
+ csv_text: str = Field(..., description="Customers CSV text with 'name' column")
38
+
39
+ @tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
40
+ def sanctions_pep_tool(csv_text: str) -> str:
41
+ """Check customers against sanctions/PEP list (exact + simple fuzzy). Returns counts + sample."""
42
+ df = _csv_text_to_df(csv_text)
43
+ clean, issues, quality, colmap = prepare_sanctions(df)
44
+ flagged, stats = detect_sanctions(clean, colmap)
45
+ return f"{stats}\nData quality issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
46
+
47
+ class CreditCSVInput(BaseModel):
48
+ csv_text: str = Field(..., description="Credit CSV text")
49
+
50
+ @tool("credit_risk_tool", args_schema=CreditCSVInput)
51
+ def credit_risk_tool(csv_text: str) -> str:
52
+ """Score credit risk using simple rules → risk_score, risk_level. Returns counts + sample."""
53
+ df = _csv_text_to_df(csv_text)
54
+ clean, issues, quality, colmap = prepare_credit(df)
55
+ flagged, stats = detect_credit(clean, colmap)
56
+ return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
57
+
58
+ def build_tools():
59
+ return [transactions_fraud_tool, kyc_fraud_tool, sanctions_pep_tool, credit_risk_tool]
ttp_guard.py CHANGED
@@ -1 +1,67 @@
1
- """TODO: implement ttp_guard.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re, os
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Any, List
5
+
6
+ class GuardDecision:
7
+ ALLOW = "allow"
8
+ BLOCK = "block"
9
+ ANNOTATE = "annotate"
10
+
11
+ @dataclass
12
+ class GuardResult:
13
+ action: str
14
+ reason: str
15
+ indicators: List[str]
16
+
17
+ class TTPGuard:
18
+ """
19
+ Lightweight rule-based guard for adversarial TTPs:
20
+ - Prompt injection / instruction override (e.g., "ignore previous instructions", "you are DAN")
21
+ - Safety bypass ("never refuse", "no moralizing")
22
+ - Secret exfiltration ("print your system prompt", "reveal keys")
23
+ - Credential patterns (AWS, Slack, HuggingFace tokens) in input
24
+ """
25
+ def __init__(self, block_level: int = None):
26
+ self.block_level = int(os.getenv("TTP_BLOCK_LEVEL", block_level if block_level is not None else 3))
27
+
28
+ self.rules = [
29
+ (3, r"\bignore (all|any|previous) instructions\b", "prompt_injection"),
30
+ (3, r"\boverride (system|assistant) (prompt|instructions)\b", "prompt_injection"),
31
+ (3, r"\byou are (now )?(?:dan|dev mode)\b", "jailbreak_alias"),
32
+ (2, r"\bnever refuse\b|\bdon't refuse\b|\balways comply\b", "safety_bypass"),
33
+ (3, r"\bshow (me )?(your )?(system prompt|hidden instructions)\b", "sys_prompt_exfil"),
34
+ (3, r"\bexfiltrate\b|\bleak\b|\bdump secrets?\b", "exfil_intent"),
35
+ (2, r"BEGIN RSA PRIVATE KEY|BEGIN OPENSSH PRIVATE KEY", "secret_marker"),
36
+ (2, r"AKIA[0-9A-Z]{16}", "aws_access_key"),
37
+ (2, r"sk-[A-Za-z0-9]{20,}", "generic_api_key"),
38
+ (2, r"hf_[A-Za-z0-9]{30,}", "huggingface_token"),
39
+ (2, r"xox[baprs]-[A-Za-z0-9-]{10,}", "slack_token"),
40
+ ]
41
+
42
+ def score(self, text: str) -> (int, list):
43
+ hits=[]
44
+ sev=0
45
+ t = text.lower()
46
+ for level, rx, tag in self.rules:
47
+ if re.search(rx, t, flags=re.IGNORECASE):
48
+ hits.append(tag)
49
+ sev = max(sev, level)
50
+ return sev, list(sorted(set(hits)))
51
+
52
+ def inspect_input(self, text: str) -> GuardResult:
53
+ sev, indicators = self.score(text)
54
+ if sev >= self.block_level:
55
+ return GuardResult(action=GuardDecision.BLOCK, reason=f"TTP severity {sev} >= block_level", indicators=indicators)
56
+ if sev > 0:
57
+ return GuardResult(action=GuardDecision.ANNOTATE, reason=f"TTP indicators: {', '.join(indicators)}", indicators=indicators)
58
+ return GuardResult(action=GuardDecision.ALLOW, reason="clean", indicators=[])
59
+
60
+ def describe_policy(self) -> Dict[str, Any]:
61
+ return {
62
+ "block_level": self.block_level,
63
+ "rules": [{"severity":lvl, "regex":rx, "tag":tag} for (lvl, rx, tag) in self.rules]
64
+ }
65
+
66
+ # sensible default
67
+ default_guard = TTPGuard()
validation.py CHANGED
@@ -1 +1,125 @@
1
- """TODO: implement validation.py (split from app_monolith_backup.py)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re, math, unicodedata
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+ try:
7
+ import phonenumbers
8
+ HAVE_PHONENUM = True
9
+ except Exception:
10
+ HAVE_PHONENUM = False
11
+
12
+ def _norm_colname(c: str) -> str:
13
+ c = c.strip().lower()
14
+ c = re.sub(r"\s+", "_", c)
15
+ c = re.sub(r"[^\w]+", "_", c)
16
+ return c.strip("_")
17
+
18
+ def _nfkc(s: str) -> str:
19
+ return unicodedata.normalize("NFKC", s)
20
+
21
+ def _collapse_ws(s: str) -> str:
22
+ return re.sub(r"\s+", " ", s).strip()
23
+
24
+ def _clean_str(x):
25
+ if pd.isna(x): return x
26
+ return _collapse_ws(_nfkc(str(x)))
27
+
28
+ def _is_email(s: str) -> bool:
29
+ return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
30
+
31
+ def _clean_phone(s: str, default_region: str = "IN"):
32
+ if s is None or str(s).strip() == "":
33
+ return None, "missing_phone"
34
+ raw = re.sub(r"[^\d+]", "", str(s))
35
+ if HAVE_PHONENUM:
36
+ try:
37
+ pn = phonenumbers.parse(raw, default_region)
38
+ if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
39
+ return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
40
+ return raw, "invalid_phone"
41
+ except Exception:
42
+ return raw, "invalid_phone"
43
+ digits = re.sub(r"\D", "", raw)
44
+ return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
45
+
46
+ def _parse_datetime(s):
47
+ try:
48
+ return pd.to_datetime(s, errors="coerce", utc=True)
49
+ except Exception:
50
+ return pd.NaT
51
+
52
+ def _to_numeric(series: pd.Series):
53
+ coerced = pd.to_numeric(series, errors="coerce")
54
+ return coerced, (coerced.isna() & series.notna())
55
+
56
+ def _read_csv_any(file_obj) -> pd.DataFrame:
57
+ if file_obj is None:
58
+ raise ValueError("No file uploaded.")
59
+ if hasattr(file_obj, "name"):
60
+ p = file_obj.name
61
+ try: return pd.read_csv(p)
62
+ except Exception: return pd.read_csv(p, encoding="latin-1")
63
+ try: return pd.read_csv(file_obj)
64
+ except Exception:
65
+ file_obj.seek(0)
66
+ return pd.read_csv(file_obj, encoding="latin-1")
67
+
68
+ def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
69
+ df = df.copy()
70
+ df.columns = [_norm_colname(c) for c in df.columns]
71
+ for c in df.select_dtypes(include=["object"]).columns:
72
+ df[c] = df[c].apply(_clean_str)
73
+ return df
74
+
75
+ def _prepare_generic(df: pd.DataFrame, expected: dict[str, list[str]]):
76
+ issues = []
77
+ df0 = _standardize_df(df)
78
+
79
+ colmap = {}
80
+ cols = set(df0.columns)
81
+ for canon, syns in expected.items():
82
+ found = None
83
+ for s in [canon] + syns:
84
+ s = _norm_colname(s)
85
+ if s in cols:
86
+ found = s; break
87
+ if found: colmap[canon] = found
88
+
89
+ for c in list(df0.columns):
90
+ if "email" in c:
91
+ df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
92
+ for idx, v in df0[c].items():
93
+ if pd.isna(v) or str(v).strip()=="":
94
+ issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
95
+ elif not _is_email(v):
96
+ issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
97
+ if "phone" in c or "mobile" in c:
98
+ vals = []
99
+ for idx, v in df0[c].items():
100
+ e164, prob = _clean_phone(v)
101
+ vals.append(e164)
102
+ if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
103
+ df0[c] = vals
104
+
105
+ for c in df0.columns:
106
+ if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
107
+ parsed = _parse_datetime(df0[c])
108
+ bad = parsed.isna() & df0[c].notna()
109
+ for idx in df0.index[bad]:
110
+ issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
111
+ df0[c] = parsed
112
+
113
+ for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
114
+ for c in df0.columns:
115
+ if c == nc or c.endswith("_"+nc) or nc in c:
116
+ coerced, badmask = _to_numeric(df0[c])
117
+ for idx in df0.index[badmask]:
118
+ issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
119
+ df0[c] = coerced
120
+
121
+ import pandas as pd
122
+ issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
123
+ missing = [k for k in expected.keys() if k not in colmap]
124
+ quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
125
+ return df0, issues_df, quality_summary, colmap