soupstick
commited on
Commit
·
bc2b09a
1
Parent(s):
ccb470a
chore: commit modular split + updates
Browse files- agent.py +27 -1
- app.py +172 -1
- llm_provider.py +101 -1
- mcp.py +40 -1
- modules/credit.py +45 -1
- modules/kyc.py +48 -1
- modules/sanctions.py +41 -1
- modules/transactions.py +61 -1
- threat_intel.py +17 -1
- tools.py +59 -1
- ttp_guard.py +67 -1
- validation.py +125 -1
agent.py
CHANGED
@@ -1 +1,27 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
from typing import List
|
3 |
+
from langchain.agents import initialize_agent, AgentType
|
4 |
+
from llm_provider import CHAT_LLM, SUMMARY_NOTICE
|
5 |
+
from ttp_guard import TTPGuard, GuardDecision
|
6 |
+
|
7 |
+
AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
|
8 |
+
You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
|
9 |
+
If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
|
10 |
+
Be concise and actionable."""
|
11 |
+
|
12 |
+
def build_agent(tools: List, guard: TTPGuard):
|
13 |
+
if CHAT_LLM is None:
|
14 |
+
# Stub agent that returns notice
|
15 |
+
class Stub:
|
16 |
+
def invoke(self, prompt): return SUMMARY_NOTICE
|
17 |
+
return Stub()
|
18 |
+
|
19 |
+
# Wrap LLM invocation with a guard-aware tool-use policy by leveraging the system message.
|
20 |
+
return initialize_agent(
|
21 |
+
tools,
|
22 |
+
CHAT_LLM,
|
23 |
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
24 |
+
verbose=False,
|
25 |
+
agent_kwargs={"system_message": AGENT_SYSTEM},
|
26 |
+
handle_parsing_errors=True,
|
27 |
+
)
|
app.py
CHANGED
@@ -1 +1,172 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import gradio as gr
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from llm_provider import CHAT_LLM, SUMMARY_NOTICE
|
6 |
+
from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
|
7 |
+
from threat_intel import ThreatIntel
|
8 |
+
from ttp_guard import TTPGuard, GuardDecision, default_guard
|
9 |
+
from modules.transactions import prepare_transactions, detect_transactions
|
10 |
+
from modules.kyc import prepare_kyc, detect_kyc
|
11 |
+
from modules.sanctions import prepare_sanctions, detect_sanctions, DEMO_SANCTIONS
|
12 |
+
from modules.credit import prepare_credit, detect_credit
|
13 |
+
from agent import build_agent
|
14 |
+
from tools import build_tools
|
15 |
+
from langchain.schema import SystemMessage, HumanMessage
|
16 |
+
|
17 |
+
# ---------- Summarizer ----------
|
18 |
+
SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
|
19 |
+
|
20 |
+
def summarize_ai(context: str) -> str:
|
21 |
+
if CHAT_LLM is None:
|
22 |
+
return SUMMARY_NOTICE
|
23 |
+
# Guard summaries as well (low severity just annotate)
|
24 |
+
decision = default_guard.inspect_input(context)
|
25 |
+
if decision.action == GuardDecision.BLOCK:
|
26 |
+
return f"Blocked by TTP Guard: {decision.reason}"
|
27 |
+
try:
|
28 |
+
out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
|
29 |
+
return getattr(out, "content", str(out))
|
30 |
+
except Exception:
|
31 |
+
return SUMMARY_NOTICE
|
32 |
+
|
33 |
+
# ---------- TI + Guard singletons ----------
|
34 |
+
TI = ThreatIntel.load() # pulls MCP envs if set, else defaults
|
35 |
+
GUARD = default_guard
|
36 |
+
|
37 |
+
# ---------- Pipelines (tabs) ----------
|
38 |
+
def run_transactions(file):
|
39 |
+
try:
|
40 |
+
from validation import _read_csv_any
|
41 |
+
df = _read_csv_any(file)
|
42 |
+
clean, issues, quality, colmap = prepare_transactions(df)
|
43 |
+
mcc_list = mcp_fetch_high_risk_mcc() or TI.high_risk_mcc
|
44 |
+
flagged, stats = detect_transactions(clean, colmap, mcc_list)
|
45 |
+
ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
46 |
+
ai = summarize_ai(ctx)
|
47 |
+
return ai, stats, flagged, issues
|
48 |
+
except Exception as e:
|
49 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
50 |
+
|
51 |
+
def run_kyc(file):
|
52 |
+
try:
|
53 |
+
from validation import _read_csv_any
|
54 |
+
df = _read_csv_any(file)
|
55 |
+
clean, issues, quality, colmap = prepare_kyc(df)
|
56 |
+
flagged, stats = detect_kyc(clean, colmap)
|
57 |
+
ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
58 |
+
ai = summarize_ai(ctx)
|
59 |
+
return ai, stats, flagged, issues
|
60 |
+
except Exception as e:
|
61 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
62 |
+
|
63 |
+
def run_sanctions(customers_file, sanctions_file):
|
64 |
+
try:
|
65 |
+
from validation import _read_csv_any
|
66 |
+
df = _read_csv_any(customers_file)
|
67 |
+
clean, issues, quality, colmap = prepare_sanctions(df)
|
68 |
+
sanc_df = mcp_fetch_sanctions() or ( _read_csv_any(sanctions_file) if sanctions_file else None ) or TI.sanctions_df or DEMO_SANCTIONS
|
69 |
+
flagged, stats = detect_sanctions(clean, colmap, sanc_df)
|
70 |
+
ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
|
71 |
+
ai = summarize_ai(ctx)
|
72 |
+
return ai, stats, flagged, issues
|
73 |
+
except Exception as e:
|
74 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
75 |
+
|
76 |
+
def run_credit(file):
|
77 |
+
try:
|
78 |
+
from validation import _read_csv_any
|
79 |
+
df = _read_csv_any(file)
|
80 |
+
clean, issues, quality, colmap = prepare_credit(df)
|
81 |
+
flagged, stats = detect_credit(clean, colmap)
|
82 |
+
ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
83 |
+
ai = summarize_ai(ctx)
|
84 |
+
return ai, stats, flagged, issues
|
85 |
+
except Exception as e:
|
86 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
87 |
+
|
88 |
+
# ---------- Agent & tools ----------
|
89 |
+
TOOLS = build_tools()
|
90 |
+
AGENT = build_agent(TOOLS, GUARD)
|
91 |
+
|
92 |
+
def agent_reply(history, user_msg: str):
|
93 |
+
# Guard the incoming user message before tool routing
|
94 |
+
decision = GUARD.inspect_input(user_msg)
|
95 |
+
if decision.action == GuardDecision.BLOCK:
|
96 |
+
return f"❌ Blocked by TTP Guard: {decision.reason}"
|
97 |
+
try:
|
98 |
+
looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
|
99 |
+
prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
|
100 |
+
res = AGENT.invoke(prompt)
|
101 |
+
if isinstance(res, dict) and "output" in res:
|
102 |
+
return res["output"]
|
103 |
+
return str(res)
|
104 |
+
except Exception as e:
|
105 |
+
return f"Agent error: {e}"
|
106 |
+
|
107 |
+
# ---------- UI ----------
|
108 |
+
with gr.Blocks(title="Fraud Detector Analyst — LangChain + Fireworks + MCP") as demo:
|
109 |
+
gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + Fireworks + MCP")
|
110 |
+
with gr.Tabs():
|
111 |
+
with gr.Tab("Transactions"):
|
112 |
+
f = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
|
113 |
+
ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
114 |
+
st = gr.Textbox(label="Stats", lines=3)
|
115 |
+
flagged = gr.Dataframe(label="Flagged Transactions")
|
116 |
+
issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
|
117 |
+
f.upload(run_transactions, inputs=[f], outputs=[ai, st, flagged, issues])
|
118 |
+
|
119 |
+
with gr.Tab("KYC"):
|
120 |
+
f = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
|
121 |
+
ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
122 |
+
st = gr.Textbox(label="Stats", lines=3)
|
123 |
+
flagged = gr.Dataframe(label="Flagged KYC Rows")
|
124 |
+
issues = gr.Dataframe(label="Data Quality Issues")
|
125 |
+
f.upload(run_kyc, inputs=[f], outputs=[ai, st, flagged, issues])
|
126 |
+
|
127 |
+
with gr.Tab("Sanctions/PEP"):
|
128 |
+
cust = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
|
129 |
+
sanc = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
|
130 |
+
ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
131 |
+
st = gr.Textbox(label="Stats", lines=3)
|
132 |
+
flagged = gr.Dataframe(label="Matches")
|
133 |
+
issues = gr.Dataframe(label="Data Quality Issues")
|
134 |
+
cust.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
|
135 |
+
sanc.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
|
136 |
+
|
137 |
+
with gr.Tab("Credit Risk"):
|
138 |
+
f = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
|
139 |
+
ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
140 |
+
st = gr.Textbox(label="Stats", lines=3)
|
141 |
+
flagged = gr.Dataframe(label="Flagged Applicants")
|
142 |
+
issues = gr.Dataframe(label="Data Quality Issues")
|
143 |
+
f.upload(run_credit, inputs=[f], outputs=[ai, st, flagged, issues])
|
144 |
+
|
145 |
+
with gr.Tab("AI Consultant (Agent)"):
|
146 |
+
chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
|
147 |
+
user_in = gr.Textbox(label="Message or CSV snippet")
|
148 |
+
send_btn = gr.Button("Send")
|
149 |
+
def _chat_fn(history, msg):
|
150 |
+
reply = agent_reply(history, msg)
|
151 |
+
history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
|
152 |
+
return history, ""
|
153 |
+
send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
|
154 |
+
|
155 |
+
with gr.Tab("Security & TI"):
|
156 |
+
gr.Markdown("**TTP Guard policy & latest indicators**")
|
157 |
+
gr.JSON(value=GUARD.describe_policy())
|
158 |
+
gr.Dataframe(value=TI.sanctions_df.head(10) if TI.sanctions_df is not None else pd.DataFrame({"note":["demo sanctions used"]}),
|
159 |
+
label="Sanctions (sample)")
|
160 |
+
gr.Dataframe(value=pd.DataFrame({"high_risk_mcc": TI.high_risk_mcc}),
|
161 |
+
label="High-risk MCC (current)")
|
162 |
+
|
163 |
+
gr.Markdown(
|
164 |
+
"### ⚙️ Configure\n"
|
165 |
+
"- `FIREWORKS_API_KEY` **or** `HF_TOKEN` (provider routing to Fireworks)\n"
|
166 |
+
"- `FW_PRIMARY_MODEL` (default openai/gpt-oss-20b), `FW_SECONDARY_MODEL` (default Qwen/Qwen3-Coder-30B-A3B-Instruct)\n"
|
167 |
+
"- MCP (optional): `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`\n"
|
168 |
+
"- TTP guard thresholds: `TTP_BLOCK_LEVEL` (default 3)\n"
|
169 |
+
)
|
170 |
+
|
171 |
+
if __name__ == "__main__":
|
172 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
llm_provider.py
CHANGED
@@ -1 +1,101 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import os, logging
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from huggingface_hub import InferenceClient
|
5 |
+
from langchain_core.language_models.chat_models import BaseChatModel
|
6 |
+
from langchain.schema import HumanMessage, SystemMessage, AIMessage
|
7 |
+
from langchain_core.outputs import ChatGeneration, ChatResult
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
log = logging.getLogger("fraud-analyst")
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
|
13 |
+
FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY") or os.getenv("HF_TOKEN")
|
14 |
+
FW_PRIMARY_MODEL = os.getenv("FW_PRIMARY_MODEL", "openai/gpt-oss-20b")
|
15 |
+
FW_SECONDARY_MODEL = os.getenv("FW_SECONDARY_MODEL", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
|
16 |
+
|
17 |
+
SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
|
18 |
+
|
19 |
+
class FireworksHFChat(BaseChatModel):
|
20 |
+
model: str
|
21 |
+
api_key: str | None = None
|
22 |
+
temperature: float = 0.2
|
23 |
+
max_new_tokens: int = 256
|
24 |
+
timeout: int = 60
|
25 |
+
|
26 |
+
def __init__(self, model: str, api_key: str | None):
|
27 |
+
super().__init__()
|
28 |
+
self.model = model
|
29 |
+
self.api_key = api_key
|
30 |
+
self._client = InferenceClient(provider="fireworks-ai", api_key=self.api_key)
|
31 |
+
|
32 |
+
@property
|
33 |
+
def _llm_type(self) -> str:
|
34 |
+
return "fireworks_hf_chat"
|
35 |
+
|
36 |
+
def _convert(self, messages):
|
37 |
+
out=[]
|
38 |
+
for m in messages:
|
39 |
+
if isinstance(m, SystemMessage):
|
40 |
+
out.append({"role":"system","content":m.content})
|
41 |
+
elif isinstance(m, HumanMessage):
|
42 |
+
out.append({"role":"user","content":m.content})
|
43 |
+
elif isinstance(m, AIMessage):
|
44 |
+
out.append({"role":"assistant","content":m.content})
|
45 |
+
else:
|
46 |
+
out.append({"role":"user","content":str(getattr(m,"content",m))})
|
47 |
+
return out
|
48 |
+
|
49 |
+
def _generate(self, messages, stop=None, run_manager=None, **kwargs) -> ChatResult:
|
50 |
+
if not self.api_key:
|
51 |
+
gen = ChatGeneration(message=AIMessage(content=""))
|
52 |
+
return ChatResult(generations=[gen], llm_output={"error": "no_api_key"})
|
53 |
+
try:
|
54 |
+
resp = self._client.chat.completions.create(
|
55 |
+
model=self.model,
|
56 |
+
messages=self._convert(messages),
|
57 |
+
stream=False,
|
58 |
+
max_tokens=kwargs.get("max_tokens", 256),
|
59 |
+
temperature=kwargs.get("temperature", 0.2),
|
60 |
+
)
|
61 |
+
text = ""
|
62 |
+
if hasattr(resp, "choices") and resp.choices:
|
63 |
+
ch = resp.choices[0]
|
64 |
+
if hasattr(ch, "message") and ch.message and getattr(ch.message, "content", None):
|
65 |
+
text = ch.message.content
|
66 |
+
elif hasattr(ch, "text") and ch.text:
|
67 |
+
text = ch.text
|
68 |
+
gen = ChatGeneration(message=AIMessage(content=text or ""))
|
69 |
+
return ChatResult(generations=[gen], llm_output={"model": self.model})
|
70 |
+
except Exception as e:
|
71 |
+
log.warning(f"Fireworks call failed for {self.model}: {type(e).__name__}: {str(e)[:200]}")
|
72 |
+
gen = ChatGeneration(message=AIMessage(content=""))
|
73 |
+
return ChatResult(generations=[gen], llm_output={"error": str(e)})
|
74 |
+
|
75 |
+
def _heartbeat(model_id: str) -> bool:
|
76 |
+
if not FIREWORKS_API_KEY: return False
|
77 |
+
try:
|
78 |
+
client = InferenceClient(provider="fireworks-ai", api_key=FIREWORKS_API_KEY)
|
79 |
+
_ = client.chat.completions.create(
|
80 |
+
model=model_id,
|
81 |
+
messages=[{"role":"user","content":"ping"}],
|
82 |
+
stream=False,
|
83 |
+
max_tokens=1,
|
84 |
+
)
|
85 |
+
return True
|
86 |
+
except Exception as e:
|
87 |
+
log.warning(f"Heartbeat failed for {model_id}: {type(e).__name__}: {str(e)[:160]}")
|
88 |
+
return False
|
89 |
+
|
90 |
+
def build_chat_llm():
|
91 |
+
log.info(f"Fireworks key present: {bool(FIREWORKS_API_KEY)} len={len(FIREWORKS_API_KEY) if FIREWORKS_API_KEY else 0}")
|
92 |
+
if FIREWORKS_API_KEY and _heartbeat(FW_PRIMARY_MODEL):
|
93 |
+
log.info(f"Using chat model: {FW_PRIMARY_MODEL}")
|
94 |
+
return FireworksHFChat(FW_PRIMARY_MODEL, FIREWORKS_API_KEY)
|
95 |
+
if FIREWORKS_API_KEY and _heartbeat(FW_SECONDARY_MODEL):
|
96 |
+
log.info(f"Using fallback chat model: {FW_SECONDARY_MODEL}")
|
97 |
+
return FireworksHFChat(FW_SECONDARY_MODEL, FIREWORKS_API_KEY)
|
98 |
+
log.warning("No working chat model; notice will be shown.")
|
99 |
+
return None
|
100 |
+
|
101 |
+
CHAT_LLM = build_chat_llm()
|
mcp.py
CHANGED
@@ -1 +1,40 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import os, json, logging
|
3 |
+
from typing import Optional, List
|
4 |
+
import pandas as pd
|
5 |
+
from urllib.request import Request, urlopen
|
6 |
+
|
7 |
+
log = logging.getLogger("fraud-analyst")
|
8 |
+
|
9 |
+
def _mcp_get_json(url: str, auth_header: Optional[str]):
|
10 |
+
try:
|
11 |
+
req = Request(url)
|
12 |
+
if auth_header:
|
13 |
+
k, v = auth_header.split(":", 1)
|
14 |
+
req.add_header(k.strip(), v.strip())
|
15 |
+
with urlopen(req, timeout=10) as r:
|
16 |
+
return json.loads(r.read().decode("utf-8"))
|
17 |
+
except Exception as e:
|
18 |
+
log.warning(f"MCP fetch failed: {e}")
|
19 |
+
return None
|
20 |
+
|
21 |
+
def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
|
22 |
+
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
23 |
+
url = os.getenv("MCP_SANCTIONS_URL")
|
24 |
+
if not url: return None
|
25 |
+
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
26 |
+
if not data: return None
|
27 |
+
if isinstance(data, list):
|
28 |
+
if all(isinstance(x, dict) for x in data):
|
29 |
+
rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
|
30 |
+
return pd.DataFrame(rows) if rows else None
|
31 |
+
if all(isinstance(x, str) for x in data):
|
32 |
+
return pd.DataFrame({"name": data})
|
33 |
+
return None
|
34 |
+
|
35 |
+
def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
|
36 |
+
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
37 |
+
url = os.getenv("MCP_HIGH_RISK_MCC_URL")
|
38 |
+
if not url: return None
|
39 |
+
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
40 |
+
return [str(x) for x in data] if isinstance(data, list) else None
|
modules/credit.py
CHANGED
@@ -1 +1,45 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import pandas as pd, numpy as np
|
3 |
+
from typing import Dict
|
4 |
+
from validation import _prepare_generic
|
5 |
+
|
6 |
+
CR_EXPECTED = {
|
7 |
+
"customer_id":["cust_id","user_id","client_id"],
|
8 |
+
"credit_score":["creditscore","score"],
|
9 |
+
"utilization":["util","credit_utilization","utilization_ratio"],
|
10 |
+
"dti":["debt_to_income","debt_to_income_ratio"],
|
11 |
+
"recent_defaults":["defaults","recentdefaults"],
|
12 |
+
"income":["annual_income","salary"]
|
13 |
+
}
|
14 |
+
|
15 |
+
def prepare_credit(df: pd.DataFrame):
|
16 |
+
return _prepare_generic(df, CR_EXPECTED)
|
17 |
+
|
18 |
+
def detect_credit(clean_df: pd.DataFrame, colmap: Dict[str,str]):
|
19 |
+
needed = ["credit_score","utilization","dti","recent_defaults","income"]
|
20 |
+
if not any(k in colmap for k in needed):
|
21 |
+
return pd.DataFrame(), "Required columns missing for Credit Risk."
|
22 |
+
df = clean_df.copy()
|
23 |
+
cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
|
24 |
+
util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
|
25 |
+
dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
|
26 |
+
rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
|
27 |
+
inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
|
28 |
+
out=[]
|
29 |
+
for i in range(len(df)):
|
30 |
+
hits=0; reasons=[]
|
31 |
+
if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
|
32 |
+
if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
|
33 |
+
if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
|
34 |
+
if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
|
35 |
+
if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
|
36 |
+
level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
|
37 |
+
out.append((hits, level, ", ".join(reasons)))
|
38 |
+
res = df.assign(
|
39 |
+
risk_score=[x[0] for x in out],
|
40 |
+
risk_level=[x[1] for x in out],
|
41 |
+
risk_reason=[x[2] for x in out]
|
42 |
+
)
|
43 |
+
flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
|
44 |
+
stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
|
45 |
+
return flagged, stats
|
modules/kyc.py
CHANGED
@@ -1 +1,48 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import pandas as pd
|
3 |
+
from typing import Dict
|
4 |
+
from validation import _prepare_generic
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
KYC_EXPECTED = {
|
8 |
+
"customer_id":["cust_id","user_id","client_id"],
|
9 |
+
"name":["full_name","customer_name"],
|
10 |
+
"email":["email_address","mail"],
|
11 |
+
"phone":["phone_number","mobile","contact"],
|
12 |
+
"dob":["date_of_birth","birthdate"]
|
13 |
+
}
|
14 |
+
|
15 |
+
def prepare_kyc(df: pd.DataFrame):
|
16 |
+
return _prepare_generic(df, KYC_EXPECTED)
|
17 |
+
|
18 |
+
def _age_years(dob: pd.Series) -> pd.Series:
|
19 |
+
now = pd.Timestamp.utcnow()
|
20 |
+
return (now - dob).dt.days / 365.25
|
21 |
+
|
22 |
+
def detect_kyc(clean_df: pd.DataFrame, colmap: Dict[str,str]):
|
23 |
+
if not all(k in colmap for k in ["customer_id","name"]):
|
24 |
+
return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
|
25 |
+
df = clean_df.copy()
|
26 |
+
reasons=[]
|
27 |
+
if "email" in colmap:
|
28 |
+
dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
|
29 |
+
reasons.append(dupe_email)
|
30 |
+
if "phone" in colmap:
|
31 |
+
dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
|
32 |
+
reasons.append(dupe_phone)
|
33 |
+
if "dob" in colmap:
|
34 |
+
age = _age_years(df[colmap["dob"]])
|
35 |
+
invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
|
36 |
+
reasons.append(invalid)
|
37 |
+
if "name" in colmap:
|
38 |
+
name = df[colmap["name"]].astype(str)
|
39 |
+
susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
|
40 |
+
reasons.append(susp)
|
41 |
+
mask=None
|
42 |
+
for m in reasons:
|
43 |
+
mask = m if mask is None else (mask | m)
|
44 |
+
flagged = df[mask] if mask is not None else pd.DataFrame()
|
45 |
+
if not flagged.empty:
|
46 |
+
flagged = flagged.assign(risk_reason="kyc_rule_hit")
|
47 |
+
stats = f"KYC flagged: {len(flagged)} of {len(df)}."
|
48 |
+
return flagged, stats
|
modules/sanctions.py
CHANGED
@@ -1 +1,41 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import re, pandas as pd
|
3 |
+
from typing import Optional, Dict
|
4 |
+
from validation import _prepare_generic, _standardize_df
|
5 |
+
|
6 |
+
SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
|
7 |
+
|
8 |
+
def prepare_sanctions(df: pd.DataFrame):
|
9 |
+
return _prepare_generic(df, SAN_EXPECTED)
|
10 |
+
|
11 |
+
DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
|
12 |
+
|
13 |
+
def token_overlap(a: str, b: str) -> int:
|
14 |
+
at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
|
15 |
+
bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
|
16 |
+
return len(at & bt)
|
17 |
+
|
18 |
+
def detect_sanctions(clean_df: pd.DataFrame, colmap: Dict[str,str], sanctions_df: Optional[pd.DataFrame]=None):
|
19 |
+
if "name" not in colmap:
|
20 |
+
return pd.DataFrame(), "Required column missing for Sanctions (need name)."
|
21 |
+
df = clean_df.copy()
|
22 |
+
sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
|
23 |
+
sanc = _standardize_df(sanc)
|
24 |
+
if "name" not in sanc.columns:
|
25 |
+
for c in sanc.columns:
|
26 |
+
if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
|
27 |
+
sanc_names = sanc["name"].dropna().astype(str).tolist()
|
28 |
+
matches=[]
|
29 |
+
for idx, row in df.iterrows():
|
30 |
+
nm = str(row[colmap["name"]] or "").strip()
|
31 |
+
if not nm: continue
|
32 |
+
if any(nm.lower()==s.lower() for s in sanc_names):
|
33 |
+
matches.append((idx,"exact")); continue
|
34 |
+
if any(token_overlap(nm, s) >= 2 for s in sanc_names):
|
35 |
+
matches.append((idx,"fuzzy"))
|
36 |
+
flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
|
37 |
+
if not flagged.empty:
|
38 |
+
mt = {i:t for i,t in matches}
|
39 |
+
flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
|
40 |
+
stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
|
41 |
+
return flagged, stats
|
modules/transactions.py
CHANGED
@@ -1 +1,61 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import pandas as pd
|
3 |
+
from typing import Optional, List, Dict
|
4 |
+
from validation import _prepare_generic, _nfkc
|
5 |
+
|
6 |
+
TX_EXPECTED = {
|
7 |
+
"transaction_id":["txn_id","transactionid","id","tx_id"],
|
8 |
+
"customer_id":["cust_id","user_id","client_id"],
|
9 |
+
"amount":["amt","amount_inr","value"],
|
10 |
+
"timestamp":["date","event_time","created_at","tx_time"],
|
11 |
+
"merchant_category":["mcc","merchant_cat","category"]
|
12 |
+
}
|
13 |
+
|
14 |
+
def prepare_transactions(df: pd.DataFrame):
|
15 |
+
return _prepare_generic(df, TX_EXPECTED)
|
16 |
+
|
17 |
+
def detect_transactions(clean_df: pd.DataFrame, colmap: Dict[str,str], high_risk_mcc: Optional[List[str]] = None):
|
18 |
+
high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
|
19 |
+
if high_risk_mcc:
|
20 |
+
high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
|
21 |
+
if not all(k in colmap for k in ["customer_id","amount"]):
|
22 |
+
return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
|
23 |
+
df = clean_df.copy()
|
24 |
+
reasons = []
|
25 |
+
amtcol = colmap.get("amount")
|
26 |
+
if amtcol:
|
27 |
+
reasons.append(df[amtcol] > 10000) # large
|
28 |
+
reasons.append(df[amtcol] < 0) # negative
|
29 |
+
if "merchant_category" in colmap:
|
30 |
+
mcc = colmap["merchant_category"]
|
31 |
+
high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
|
32 |
+
reasons.append(high)
|
33 |
+
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
34 |
+
cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
|
35 |
+
daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
|
36 |
+
reasons.append(daily > 50000)
|
37 |
+
mask = None
|
38 |
+
for m in reasons:
|
39 |
+
mask = m if mask is None else (mask | m)
|
40 |
+
flagged = df[mask] if mask is not None else pd.DataFrame()
|
41 |
+
if not flagged.empty:
|
42 |
+
rr=[]
|
43 |
+
for _, row in flagged.iterrows():
|
44 |
+
hits=[]
|
45 |
+
if amtcol:
|
46 |
+
a=row[amtcol]
|
47 |
+
if pd.notna(a) and a>10000: hits.append("large_amount")
|
48 |
+
if pd.notna(a) and a<0: hits.append("negative_amount")
|
49 |
+
if "merchant_category" in colmap:
|
50 |
+
val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
|
51 |
+
if val in high_risk: hits.append("mcc_high_risk")
|
52 |
+
try:
|
53 |
+
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
54 |
+
sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
|
55 |
+
(df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
|
56 |
+
if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
|
57 |
+
except Exception: pass
|
58 |
+
rr.append(", ".join(sorted(set(hits))) or "rule_hit")
|
59 |
+
flagged = flagged.assign(risk_reason=rr)
|
60 |
+
stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
|
61 |
+
return flagged, stats
|
threat_intel.py
CHANGED
@@ -1 +1,17 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from typing import Optional, List
|
4 |
+
import pandas as pd
|
5 |
+
from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
|
6 |
+
from modules.sanctions import DEMO_SANCTIONS
|
7 |
+
|
8 |
+
@dataclass
|
9 |
+
class ThreatIntel:
|
10 |
+
sanctions_df: Optional[pd.DataFrame]
|
11 |
+
high_risk_mcc: List[str]
|
12 |
+
|
13 |
+
@staticmethod
|
14 |
+
def load() -> "ThreatIntel":
|
15 |
+
sanc = mcp_fetch_sanctions()
|
16 |
+
mcc = mcp_fetch_high_risk_mcc() or ["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"]
|
17 |
+
return ThreatIntel(sanctions_df=sanc or DEMO_SANCTIONS, high_risk_mcc=mcc)
|
tools.py
CHANGED
@@ -1 +1,59 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import io, pandas as pd
|
3 |
+
from pydantic import BaseModel, Field
|
4 |
+
from langchain.tools import tool
|
5 |
+
|
6 |
+
from modules.transactions import prepare_transactions, detect_transactions
|
7 |
+
from modules.kyc import prepare_kyc, detect_kyc
|
8 |
+
from modules.sanctions import prepare_sanctions, detect_sanctions
|
9 |
+
from modules.credit import prepare_credit, detect_credit
|
10 |
+
|
11 |
+
def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
|
12 |
+
return pd.read_csv(io.StringIO(csv_text))
|
13 |
+
|
14 |
+
class TransactionCSVInput(BaseModel):
|
15 |
+
csv_text: str = Field(..., description="Transactions CSV text")
|
16 |
+
|
17 |
+
@tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
|
18 |
+
def transactions_fraud_tool(csv_text: str) -> str:
|
19 |
+
"""Analyze transactions CSV: large/negative amounts, high-risk MCCs, per-customer daily sum >50k. Returns counts + sample."""
|
20 |
+
df = _csv_text_to_df(csv_text)
|
21 |
+
clean, issues, quality, colmap = prepare_transactions(df)
|
22 |
+
flagged, stats = detect_transactions(clean, colmap)
|
23 |
+
return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
24 |
+
|
25 |
+
class KYCCSVInput(BaseModel):
|
26 |
+
csv_text: str = Field(..., description="KYC CSV text")
|
27 |
+
|
28 |
+
@tool("kyc_fraud_tool", args_schema=KYCCSVInput)
|
29 |
+
def kyc_fraud_tool(csv_text: str) -> str:
|
30 |
+
"""Analyze KYC CSV: duplicate email/phone, invalid DOBs, suspicious names. Returns counts + sample."""
|
31 |
+
df = _csv_text_to_df(csv_text)
|
32 |
+
clean, issues, quality, colmap = prepare_kyc(df)
|
33 |
+
flagged, stats = detect_kyc(clean, colmap)
|
34 |
+
return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
35 |
+
|
36 |
+
class SanctionsCSVInput(BaseModel):
|
37 |
+
csv_text: str = Field(..., description="Customers CSV text with 'name' column")
|
38 |
+
|
39 |
+
@tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
|
40 |
+
def sanctions_pep_tool(csv_text: str) -> str:
|
41 |
+
"""Check customers against sanctions/PEP list (exact + simple fuzzy). Returns counts + sample."""
|
42 |
+
df = _csv_text_to_df(csv_text)
|
43 |
+
clean, issues, quality, colmap = prepare_sanctions(df)
|
44 |
+
flagged, stats = detect_sanctions(clean, colmap)
|
45 |
+
return f"{stats}\nData quality issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
46 |
+
|
47 |
+
class CreditCSVInput(BaseModel):
|
48 |
+
csv_text: str = Field(..., description="Credit CSV text")
|
49 |
+
|
50 |
+
@tool("credit_risk_tool", args_schema=CreditCSVInput)
|
51 |
+
def credit_risk_tool(csv_text: str) -> str:
|
52 |
+
"""Score credit risk using simple rules → risk_score, risk_level. Returns counts + sample."""
|
53 |
+
df = _csv_text_to_df(csv_text)
|
54 |
+
clean, issues, quality, colmap = prepare_credit(df)
|
55 |
+
flagged, stats = detect_credit(clean, colmap)
|
56 |
+
return f"{stats}\nData quality issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
57 |
+
|
58 |
+
def build_tools():
|
59 |
+
return [transactions_fraud_tool, kyc_fraud_tool, sanctions_pep_tool, credit_risk_tool]
|
ttp_guard.py
CHANGED
@@ -1 +1,67 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import re, os
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import Dict, Any, List
|
5 |
+
|
6 |
+
class GuardDecision:
|
7 |
+
ALLOW = "allow"
|
8 |
+
BLOCK = "block"
|
9 |
+
ANNOTATE = "annotate"
|
10 |
+
|
11 |
+
@dataclass
|
12 |
+
class GuardResult:
|
13 |
+
action: str
|
14 |
+
reason: str
|
15 |
+
indicators: List[str]
|
16 |
+
|
17 |
+
class TTPGuard:
|
18 |
+
"""
|
19 |
+
Lightweight rule-based guard for adversarial TTPs:
|
20 |
+
- Prompt injection / instruction override (e.g., "ignore previous instructions", "you are DAN")
|
21 |
+
- Safety bypass ("never refuse", "no moralizing")
|
22 |
+
- Secret exfiltration ("print your system prompt", "reveal keys")
|
23 |
+
- Credential patterns (AWS, Slack, HuggingFace tokens) in input
|
24 |
+
"""
|
25 |
+
def __init__(self, block_level: int = None):
|
26 |
+
self.block_level = int(os.getenv("TTP_BLOCK_LEVEL", block_level if block_level is not None else 3))
|
27 |
+
|
28 |
+
self.rules = [
|
29 |
+
(3, r"\bignore (all|any|previous) instructions\b", "prompt_injection"),
|
30 |
+
(3, r"\boverride (system|assistant) (prompt|instructions)\b", "prompt_injection"),
|
31 |
+
(3, r"\byou are (now )?(?:dan|dev mode)\b", "jailbreak_alias"),
|
32 |
+
(2, r"\bnever refuse\b|\bdon't refuse\b|\balways comply\b", "safety_bypass"),
|
33 |
+
(3, r"\bshow (me )?(your )?(system prompt|hidden instructions)\b", "sys_prompt_exfil"),
|
34 |
+
(3, r"\bexfiltrate\b|\bleak\b|\bdump secrets?\b", "exfil_intent"),
|
35 |
+
(2, r"BEGIN RSA PRIVATE KEY|BEGIN OPENSSH PRIVATE KEY", "secret_marker"),
|
36 |
+
(2, r"AKIA[0-9A-Z]{16}", "aws_access_key"),
|
37 |
+
(2, r"sk-[A-Za-z0-9]{20,}", "generic_api_key"),
|
38 |
+
(2, r"hf_[A-Za-z0-9]{30,}", "huggingface_token"),
|
39 |
+
(2, r"xox[baprs]-[A-Za-z0-9-]{10,}", "slack_token"),
|
40 |
+
]
|
41 |
+
|
42 |
+
def score(self, text: str) -> (int, list):
|
43 |
+
hits=[]
|
44 |
+
sev=0
|
45 |
+
t = text.lower()
|
46 |
+
for level, rx, tag in self.rules:
|
47 |
+
if re.search(rx, t, flags=re.IGNORECASE):
|
48 |
+
hits.append(tag)
|
49 |
+
sev = max(sev, level)
|
50 |
+
return sev, list(sorted(set(hits)))
|
51 |
+
|
52 |
+
def inspect_input(self, text: str) -> GuardResult:
|
53 |
+
sev, indicators = self.score(text)
|
54 |
+
if sev >= self.block_level:
|
55 |
+
return GuardResult(action=GuardDecision.BLOCK, reason=f"TTP severity {sev} >= block_level", indicators=indicators)
|
56 |
+
if sev > 0:
|
57 |
+
return GuardResult(action=GuardDecision.ANNOTATE, reason=f"TTP indicators: {', '.join(indicators)}", indicators=indicators)
|
58 |
+
return GuardResult(action=GuardDecision.ALLOW, reason="clean", indicators=[])
|
59 |
+
|
60 |
+
def describe_policy(self) -> Dict[str, Any]:
|
61 |
+
return {
|
62 |
+
"block_level": self.block_level,
|
63 |
+
"rules": [{"severity":lvl, "regex":rx, "tag":tag} for (lvl, rx, tag) in self.rules]
|
64 |
+
}
|
65 |
+
|
66 |
+
# sensible default
|
67 |
+
default_guard = TTPGuard()
|
validation.py
CHANGED
@@ -1 +1,125 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import re, math, unicodedata
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
try:
|
7 |
+
import phonenumbers
|
8 |
+
HAVE_PHONENUM = True
|
9 |
+
except Exception:
|
10 |
+
HAVE_PHONENUM = False
|
11 |
+
|
12 |
+
def _norm_colname(c: str) -> str:
|
13 |
+
c = c.strip().lower()
|
14 |
+
c = re.sub(r"\s+", "_", c)
|
15 |
+
c = re.sub(r"[^\w]+", "_", c)
|
16 |
+
return c.strip("_")
|
17 |
+
|
18 |
+
def _nfkc(s: str) -> str:
|
19 |
+
return unicodedata.normalize("NFKC", s)
|
20 |
+
|
21 |
+
def _collapse_ws(s: str) -> str:
|
22 |
+
return re.sub(r"\s+", " ", s).strip()
|
23 |
+
|
24 |
+
def _clean_str(x):
|
25 |
+
if pd.isna(x): return x
|
26 |
+
return _collapse_ws(_nfkc(str(x)))
|
27 |
+
|
28 |
+
def _is_email(s: str) -> bool:
|
29 |
+
return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
|
30 |
+
|
31 |
+
def _clean_phone(s: str, default_region: str = "IN"):
|
32 |
+
if s is None or str(s).strip() == "":
|
33 |
+
return None, "missing_phone"
|
34 |
+
raw = re.sub(r"[^\d+]", "", str(s))
|
35 |
+
if HAVE_PHONENUM:
|
36 |
+
try:
|
37 |
+
pn = phonenumbers.parse(raw, default_region)
|
38 |
+
if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
|
39 |
+
return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
|
40 |
+
return raw, "invalid_phone"
|
41 |
+
except Exception:
|
42 |
+
return raw, "invalid_phone"
|
43 |
+
digits = re.sub(r"\D", "", raw)
|
44 |
+
return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
|
45 |
+
|
46 |
+
def _parse_datetime(s):
|
47 |
+
try:
|
48 |
+
return pd.to_datetime(s, errors="coerce", utc=True)
|
49 |
+
except Exception:
|
50 |
+
return pd.NaT
|
51 |
+
|
52 |
+
def _to_numeric(series: pd.Series):
|
53 |
+
coerced = pd.to_numeric(series, errors="coerce")
|
54 |
+
return coerced, (coerced.isna() & series.notna())
|
55 |
+
|
56 |
+
def _read_csv_any(file_obj) -> pd.DataFrame:
|
57 |
+
if file_obj is None:
|
58 |
+
raise ValueError("No file uploaded.")
|
59 |
+
if hasattr(file_obj, "name"):
|
60 |
+
p = file_obj.name
|
61 |
+
try: return pd.read_csv(p)
|
62 |
+
except Exception: return pd.read_csv(p, encoding="latin-1")
|
63 |
+
try: return pd.read_csv(file_obj)
|
64 |
+
except Exception:
|
65 |
+
file_obj.seek(0)
|
66 |
+
return pd.read_csv(file_obj, encoding="latin-1")
|
67 |
+
|
68 |
+
def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
|
69 |
+
df = df.copy()
|
70 |
+
df.columns = [_norm_colname(c) for c in df.columns]
|
71 |
+
for c in df.select_dtypes(include=["object"]).columns:
|
72 |
+
df[c] = df[c].apply(_clean_str)
|
73 |
+
return df
|
74 |
+
|
75 |
+
def _prepare_generic(df: pd.DataFrame, expected: dict[str, list[str]]):
|
76 |
+
issues = []
|
77 |
+
df0 = _standardize_df(df)
|
78 |
+
|
79 |
+
colmap = {}
|
80 |
+
cols = set(df0.columns)
|
81 |
+
for canon, syns in expected.items():
|
82 |
+
found = None
|
83 |
+
for s in [canon] + syns:
|
84 |
+
s = _norm_colname(s)
|
85 |
+
if s in cols:
|
86 |
+
found = s; break
|
87 |
+
if found: colmap[canon] = found
|
88 |
+
|
89 |
+
for c in list(df0.columns):
|
90 |
+
if "email" in c:
|
91 |
+
df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
|
92 |
+
for idx, v in df0[c].items():
|
93 |
+
if pd.isna(v) or str(v).strip()=="":
|
94 |
+
issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
|
95 |
+
elif not _is_email(v):
|
96 |
+
issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
|
97 |
+
if "phone" in c or "mobile" in c:
|
98 |
+
vals = []
|
99 |
+
for idx, v in df0[c].items():
|
100 |
+
e164, prob = _clean_phone(v)
|
101 |
+
vals.append(e164)
|
102 |
+
if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
|
103 |
+
df0[c] = vals
|
104 |
+
|
105 |
+
for c in df0.columns:
|
106 |
+
if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
|
107 |
+
parsed = _parse_datetime(df0[c])
|
108 |
+
bad = parsed.isna() & df0[c].notna()
|
109 |
+
for idx in df0.index[bad]:
|
110 |
+
issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
|
111 |
+
df0[c] = parsed
|
112 |
+
|
113 |
+
for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
|
114 |
+
for c in df0.columns:
|
115 |
+
if c == nc or c.endswith("_"+nc) or nc in c:
|
116 |
+
coerced, badmask = _to_numeric(df0[c])
|
117 |
+
for idx in df0.index[badmask]:
|
118 |
+
issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
|
119 |
+
df0[c] = coerced
|
120 |
+
|
121 |
+
import pandas as pd
|
122 |
+
issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
|
123 |
+
missing = [k for k in expected.keys() if k not in colmap]
|
124 |
+
quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
|
125 |
+
return df0, issues_df, quality_summary, colmap
|