Spaces:

soupstick
/

advanced-fraud-analyst

Running

App Files Files Community

soupstick committed on Aug 30

Commit

bc2b09a

1 Parent(s): ccb470a

chore: commit modular split + updates

Browse files

Files changed (12) hide show

agent.py +27 -1
app.py +172 -1
llm_provider.py +101 -1
mcp.py +40 -1
modules/credit.py +45 -1
modules/kyc.py +48 -1
modules/sanctions.py +41 -1
modules/transactions.py +61 -1
threat_intel.py +17 -1
tools.py +59 -1
ttp_guard.py +67 -1
validation.py +125 -1

agent.py CHANGED Viewed

	@@ -1 +1,27 @@
1	- """TODO: implement agent.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+from typing import List
+from langchain.agents import initialize_agent, AgentType
+from llm_provider import CHAT_LLM, SUMMARY_NOTICE
+from ttp_guard import TTPGuard, GuardDecision
+# Role prompt for the chat agent: names the four tool domains and asks for concise, actionable answers.
+AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
+You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
+If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
+Be concise and actionable."""
+def build_agent(tools: List, guard: TTPGuard):
+    """Build the object that answers chat messages via ``invoke(prompt)``.
+    Args:
+        tools: LangChain tools the agent may call.
+        guard: TTP guard instance. Kept for interface compatibility; it is not consulted here.
+    Returns:
+        A LangChain agent executor when ``CHAT_LLM`` is available; otherwise a stub whose
+        ``invoke`` returns ``SUMMARY_NOTICE`` for every prompt.
+    """
+    if CHAT_LLM is None:
+        # No inference backend: answer every prompt with the "connect an inference point" notice.
+        class Stub:
+            def invoke(self, prompt): return SUMMARY_NOTICE
+        return Stub()
+    # The zero-shot ReAct agent builds its prompt from `prefix` / `format_instructions` / `suffix`;
+    # it has no `system_message` option, so the role text must travel in `prefix`. The trailing
+    # sentence mirrors the stock prefix, which introduces the tool list rendered right after it.
+    return initialize_agent(
+        tools,
+        CHAT_LLM,
+        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+        verbose=False,
+        agent_kwargs={"prefix": f"{AGENT_SYSTEM}\n\nYou have access to the following tools:"},
+        handle_parsing_errors=True,
+    )

app.py CHANGED Viewed

	@@ -1 +1,172 @@
1	- """TODO: implement app.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import gradio as gr
+import pandas as pd
+from llm_provider import CHAT_LLM, SUMMARY_NOTICE
+from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
+from threat_intel import ThreatIntel
+from ttp_guard import TTPGuard, GuardDecision, default_guard
+from modules.transactions import prepare_transactions, detect_transactions
+from modules.kyc import prepare_kyc, detect_kyc
+from modules.sanctions import prepare_sanctions, detect_sanctions, DEMO_SANCTIONS
+from modules.credit import prepare_credit, detect_credit
+from agent import build_agent
+from tools import build_tools
+from langchain.schema import SystemMessage, HumanMessage
+# ---------- Summarizer ----------
+SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
+def summarize_ai(context: str) -> str:
+    """Ask the chat model for a short analyst summary of ``context``.
+    Returns ``SUMMARY_NOTICE`` when no model is configured, when the call raises, or when the
+    model answers with empty text; returns a "Blocked by TTP Guard" message when the guard
+    rates the context at or above its block level. Only the first 4000 characters are sent.
+    """
+    if CHAT_LLM is None:
+        return SUMMARY_NOTICE
+    # Guard summaries as well; only a BLOCK decision changes the outcome here.
+    decision = default_guard.inspect_input(context)
+    if decision.action == GuardDecision.BLOCK:
+        return f"Blocked by TTP Guard: {decision.reason}"
+    try:
+        out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
+        text = getattr(out, "content", str(out))
+    except Exception:
+        return SUMMARY_NOTICE
+    # The provider wrapper reports failures as an empty assistant message rather than raising,
+    # so an empty answer is shown as the notice instead of a blank textbox.
+    return text if text else SUMMARY_NOTICE
+# ---------- TI + Guard singletons ----------
+# Threat-intel snapshot taken once at import time via ThreatIntel.load().
+TI = ThreatIntel.load()   # pulls MCP envs if set, else defaults
+# Guard instance shared by the agent entry point and the "Security & TI" tab.
+GUARD = default_guard
+# ---------- Pipelines (tabs) ----------
+def run_transactions(file):
+    """Handler for the Transactions tab.
+    Reads the uploaded CSV, validates it, runs the transaction rules and asks for an AI summary.
+    Returns ``(ai_summary, stats, flagged_df, issues_df)``; on any exception returns an error
+    string, "Validation failed." and two empty DataFrames.
+    """
+    try:
+        from validation import _read_csv_any
+        raw = _read_csv_any(file)
+        cleaned, issue_rows, quality, columns = prepare_transactions(raw)
+        # A live MCP list wins; a None/empty answer falls back to the snapshot held in TI.
+        risky_mcc = mcp_fetch_high_risk_mcc()
+        if not risky_mcc:
+            risky_mcc = TI.high_risk_mcc
+        hits, stats = detect_transactions(cleaned, columns, risky_mcc)
+        head_csv = cleaned.head(5).to_csv(index=False)
+        hits_csv = hits.head(5).to_csv(index=False)
+        ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{head_csv}\nFlagged:\n{hits_csv}"
+        return summarize_ai(ctx), stats, hits, issue_rows
+    except Exception as exc:
+        return f"Error: {exc}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
+def run_kyc(file):
+    """Handler for the KYC tab.
+    Reads the uploaded CSV, validates it, runs the KYC rules and asks for an AI summary.
+    Returns ``(ai_summary, stats, flagged_df, issues_df)``; on any exception returns an error
+    string, "Validation failed." and two empty DataFrames.
+    """
+    try:
+        from validation import _read_csv_any
+        raw = _read_csv_any(file)
+        cleaned, issue_rows, quality, columns = prepare_kyc(raw)
+        hits, stats = detect_kyc(cleaned, columns)
+        head_csv = cleaned.head(5).to_csv(index=False)
+        hits_csv = hits.head(5).to_csv(index=False)
+        ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{head_csv}\nFlagged:\n{hits_csv}"
+        return summarize_ai(ctx), stats, hits, issue_rows
+    except Exception as exc:
+        return f"Error: {exc}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
+def run_sanctions(customers_file, sanctions_file):
+    """Handler for the Sanctions/PEP tab.
+    Screens the uploaded customers CSV against a sanctions list chosen in this order:
+    live MCP list, uploaded sanctions CSV, the threat-intel snapshot, the built-in demo list.
+    Returns ``(ai_summary, stats, matches_df, issues_df)``; on any exception returns an error
+    string, "Validation failed." and two empty DataFrames.
+    """
+    try:
+        from validation import _read_csv_any
+        df = _read_csv_any(customers_file)
+        clean, issues, quality, colmap = prepare_sanctions(df)
+        # Sources are DataFrames, so they cannot be chained with `or` (bool(DataFrame) raises
+        # ValueError); pick the first source that is not None instead.
+        sanc_df = mcp_fetch_sanctions()
+        if sanc_df is None and sanctions_file:
+            sanc_df = _read_csv_any(sanctions_file)
+        if sanc_df is None:
+            sanc_df = TI.sanctions_df
+        if sanc_df is None:
+            sanc_df = DEMO_SANCTIONS
+        flagged, stats = detect_sanctions(clean, colmap, sanc_df)
+        ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
+        ai = summarize_ai(ctx)
+        return ai, stats, flagged, issues
+    except Exception as e:
+        return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
+def run_credit(file):
+    """Handler for the Credit Risk tab.
+    Reads the uploaded CSV, validates it, scores credit risk and asks for an AI summary.
+    Returns ``(ai_summary, stats, flagged_df, issues_df)``; on any exception returns an error
+    string, "Validation failed." and two empty DataFrames.
+    """
+    try:
+        from validation import _read_csv_any
+        raw = _read_csv_any(file)
+        cleaned, issue_rows, quality, columns = prepare_credit(raw)
+        hits, stats = detect_credit(cleaned, columns)
+        head_csv = cleaned.head(5).to_csv(index=False)
+        hits_csv = hits.head(5).to_csv(index=False)
+        ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{head_csv}\nFlagged:\n{hits_csv}"
+        return summarize_ai(ctx), stats, hits, issue_rows
+    except Exception as exc:
+        return f"Error: {exc}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
+# ---------- Agent & tools ----------
+# Tool list and agent are built once at import time and reused for every chat turn.
+TOOLS = build_tools()
+AGENT = build_agent(TOOLS, GUARD)
+def agent_reply(history, user_msg: str):
+    """Answer one chat message and return the reply text.
+    The message is screened by the guard first; a BLOCK decision short-circuits with a
+    "Blocked by TTP Guard" message. Text containing a newline and at least two commas is
+    treated as a CSV snippet and wrapped in a routing hint. ``history`` is accepted but unused.
+    Any exception from the agent is returned as an "Agent error: ..." string.
+    """
+    verdict = GUARD.inspect_input(user_msg)
+    if verdict.action == GuardDecision.BLOCK:
+        return f"❌ Blocked by TTP Guard: {verdict.reason}"
+    try:
+        # Two or more commas already implies a comma is present, so only the newline test remains.
+        csv_like = user_msg.count(",") >= 2 and "\n" in user_msg
+        if csv_like:
+            prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}"
+        else:
+            prompt = user_msg
+        result = AGENT.invoke(prompt)
+        # Agent executors answer with a dict carrying "output"; the stub answers with a plain string.
+        if isinstance(result, dict) and "output" in result:
+            return result["output"]
+        return str(result)
+    except Exception as exc:
+        return f"Agent error: {exc}"
+# ---------- UI ----------
+# Gradio layout: four upload-driven analysis tabs, one chat tab and one read-only policy/TI tab.
+# Each analysis tab binds its own File/Textbox/Dataframe components; the short names (f, ai, st,
+# flagged, issues) are rebound per tab after the previous tab's event wiring has captured them.
+with gr.Blocks(title="Fraud Detector Analyst — LangChain + Fireworks + MCP") as demo:
+    gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + Fireworks + MCP")
+    with gr.Tabs():
+        with gr.Tab("Transactions"):
+            f = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
+            ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+            st = gr.Textbox(label="Stats", lines=3)
+            flagged = gr.Dataframe(label="Flagged Transactions")
+            issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
+            # Run the pipeline as soon as a file is uploaded.
+            f.upload(run_transactions, inputs=[f], outputs=[ai, st, flagged, issues])
+        with gr.Tab("KYC"):
+            f = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
+            ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+            st = gr.Textbox(label="Stats", lines=3)
+            flagged = gr.Dataframe(label="Flagged KYC Rows")
+            issues = gr.Dataframe(label="Data Quality Issues")
+            f.upload(run_kyc, inputs=[f], outputs=[ai, st, flagged, issues])
+        with gr.Tab("Sanctions/PEP"):
+            cust = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
+            sanc = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
+            ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+            st = gr.Textbox(label="Stats", lines=3)
+            flagged = gr.Dataframe(label="Matches")
+            issues = gr.Dataframe(label="Data Quality Issues")
+            # Either upload re-runs the screening with the current value of both file inputs.
+            cust.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
+            sanc.upload(run_sanctions, inputs=[cust, sanc], outputs=[ai, st, flagged, issues])
+        with gr.Tab("Credit Risk"):
+            f = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
+            ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+            st = gr.Textbox(label="Stats", lines=3)
+            flagged = gr.Dataframe(label="Flagged Applicants")
+            issues = gr.Dataframe(label="Data Quality Issues")
+            f.upload(run_credit, inputs=[f], outputs=[ai, st, flagged, issues])
+        with gr.Tab("AI Consultant (Agent)"):
+            chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
+            user_in = gr.Textbox(label="Message or CSV snippet")
+            send_btn = gr.Button("Send")
+            def _chat_fn(history, msg):
+                # Append the user turn and the agent's reply in role/content dict form, then
+                # return "" as the second output to clear the input box.
+                reply = agent_reply(history, msg)
+                history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
+                return history, ""
+            send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
+        with gr.Tab("Security & TI"):
+            gr.Markdown("**TTP Guard policy & latest indicators**")
+            # Values below are computed once while the layout is built.
+            gr.JSON(value=GUARD.describe_policy())
+            gr.Dataframe(value=TI.sanctions_df.head(10) if TI.sanctions_df is not None else pd.DataFrame({"note":["demo sanctions used"]}),
+                         label="Sanctions (sample)")
+            gr.Dataframe(value=pd.DataFrame({"high_risk_mcc": TI.high_risk_mcc}),
+                         label="High-risk MCC (current)")
+    # Footer listing the environment variables read by the other modules.
+    gr.Markdown(
+        "### ⚙️ Configure\n"
+        "- `FIREWORKS_API_KEY` **or** `HF_TOKEN` (provider routing to Fireworks)\n"
+        "- `FW_PRIMARY_MODEL` (default openai/gpt-oss-20b), `FW_SECONDARY_MODEL` (default Qwen/Qwen3-Coder-30B-A3B-Instruct)\n"
+        "- MCP (optional): `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`\n"
+        "- TTP guard thresholds: `TTP_BLOCK_LEVEL` (default 3)\n"
+    )
+# Script entry point: listen on all interfaces, port 7860.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

llm_provider.py CHANGED Viewed

	@@ -1 +1,101 @@
1	- """TODO: implement llm_provider.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import os, logging
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain.schema import HumanMessage, SystemMessage, AIMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+# Load variables from a .env file (if any) before the environment is read below.
+load_dotenv()
+log = logging.getLogger("fraud-analyst")
+logging.basicConfig(level=logging.INFO)
+# FIREWORKS_API_KEY is preferred; HF_TOKEN is used when it is unset or empty.
+FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY") or os.getenv("HF_TOKEN")
+# Primary model and the fallback tried when the primary fails its heartbeat.
+FW_PRIMARY_MODEL   = os.getenv("FW_PRIMARY_MODEL",   "openai/gpt-oss-20b")
+FW_SECONDARY_MODEL = os.getenv("FW_SECONDARY_MODEL", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
+# Placeholder text returned wherever an AI answer cannot be produced.
+SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
+class FireworksHFChat(BaseChatModel):
+    """LangChain chat model backed by ``huggingface_hub.InferenceClient`` (provider "fireworks-ai").
+    ``_generate`` never raises: a missing key or a provider error yields an empty assistant
+    message, with the error text placed in ``llm_output["error"]``.
+    """
+    model: str
+    api_key: str | None = None
+    temperature: float = 0.2
+    max_new_tokens: int = 256
+    timeout: int = 60
+    def __init__(self, model: str, api_key: str | None):
+        # The base class is a pydantic model and `model` has no default, so the declared fields
+        # must be handed to the base initializer rather than assigned afterwards.
+        super().__init__(model=model, api_key=api_key)
+        # `_client` is not a declared field; set it on the instance directly so pydantic's
+        # field-checking __setattr__ is not involved.
+        object.__setattr__(
+            self,
+            "_client",
+            InferenceClient(provider="fireworks-ai", api_key=api_key, timeout=self.timeout),
+        )
+    @property
+    def _llm_type(self) -> str:
+        return "fireworks_hf_chat"
+    def _convert(self, messages):
+        """Map LangChain messages to role/content dicts; unknown message types are sent as "user"."""
+        out=[]
+        for m in messages:
+            if isinstance(m, SystemMessage):
+                out.append({"role":"system","content":m.content})
+            elif isinstance(m, HumanMessage):
+                out.append({"role":"user","content":m.content})
+            elif isinstance(m, AIMessage):
+                out.append({"role":"assistant","content":m.content})
+            else:
+                out.append({"role":"user","content":str(getattr(m,"content",m))})
+        return out
+    def _generate(self, messages, stop=None, run_manager=None, **kwargs) -> ChatResult:
+        """Run one non-streaming chat completion.
+        ``max_tokens`` / ``temperature`` in ``kwargs`` override the instance settings, and
+        ``stop`` sequences are forwarded so agent loops can cut generation where they expect.
+        """
+        if not self.api_key:
+            gen = ChatGeneration(message=AIMessage(content=""))
+            return ChatResult(generations=[gen], llm_output={"error": "no_api_key"})
+        try:
+            resp = self._client.chat.completions.create(
+                model=self.model,
+                messages=self._convert(messages),
+                stream=False,
+                max_tokens=kwargs.get("max_tokens", self.max_new_tokens),
+                temperature=kwargs.get("temperature", self.temperature),
+                stop=stop,
+            )
+            # Accept both chat-style (`message.content`) and completion-style (`text`) choices.
+            text = ""
+            if hasattr(resp, "choices") and resp.choices:
+                ch = resp.choices[0]
+                if hasattr(ch, "message") and ch.message and getattr(ch.message, "content", None):
+                    text = ch.message.content
+                elif hasattr(ch, "text") and ch.text:
+                    text = ch.text
+            gen = ChatGeneration(message=AIMessage(content=text or ""))
+            return ChatResult(generations=[gen], llm_output={"model": self.model})
+        except Exception as e:
+            # Best effort by design: log a truncated error and hand back an empty message.
+            log.warning(f"Fireworks call failed for {self.model}: {type(e).__name__}: {str(e)[:200]}")
+            gen = ChatGeneration(message=AIMessage(content=""))
+            return ChatResult(generations=[gen], llm_output={"error": str(e)})
+def _heartbeat(model_id: str) -> bool:
+    """Return True when a one-token "ping" chat completion against ``model_id`` succeeds.
+    Returns False without any network call when no API key is configured, and False (with a
+    warning logged) when the request raises.
+    """
+    if not FIREWORKS_API_KEY:
+        return False
+    probe = [{"role":"user","content":"ping"}]
+    try:
+        InferenceClient(provider="fireworks-ai", api_key=FIREWORKS_API_KEY).chat.completions.create(
+            model=model_id,
+            messages=probe,
+            stream=False,
+            max_tokens=1,
+        )
+    except Exception as e:
+        log.warning(f"Heartbeat failed for {model_id}: {type(e).__name__}: {str(e)[:160]}")
+        return False
+    return True
+def build_chat_llm():
+    """Return a chat model for the first configured model that passes its heartbeat, else None.
+    The primary model is probed first, then the secondary; the outcome is logged either way.
+    """
+    log.info(f"Fireworks key present: {bool(FIREWORKS_API_KEY)} len={len(FIREWORKS_API_KEY) if FIREWORKS_API_KEY else 0}")
+    candidates = (
+        (FW_PRIMARY_MODEL, "Using chat model"),
+        (FW_SECONDARY_MODEL, "Using fallback chat model"),
+    )
+    for model_id, label in candidates:
+        if FIREWORKS_API_KEY and _heartbeat(model_id):
+            log.info(f"{label}: {model_id}")
+            return FireworksHFChat(model_id, FIREWORKS_API_KEY)
+    log.warning("No working chat model; notice will be shown.")
+    return None
+# Resolved once at import time; None means callers fall back to SUMMARY_NOTICE.
+CHAT_LLM = build_chat_llm()

mcp.py CHANGED Viewed

	@@ -1 +1,40 @@
1	- """TODO: implement mcp.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import os, json, logging
+from typing import Optional, List
+import pandas as pd
+from urllib.request import Request, urlopen
+log = logging.getLogger("fraud-analyst")
+def _mcp_get_json(url: str, auth_header: Optional[str]):
+    """GET ``url`` (10 s timeout) and return the UTF-8 body parsed as JSON, or None on any failure.
+    ``auth_header`` is an optional "Name: value" string, split at the first colon, that is
+    attached to the request as a header. Failures are logged as warnings.
+    """
+    try:
+        request = Request(url)
+        if auth_header:
+            header_name, header_value = auth_header.split(":", 1)
+            request.add_header(header_name.strip(), header_value.strip())
+        with urlopen(request, timeout=10) as response:
+            body = response.read()
+            return json.loads(body.decode("utf-8"))
+    except Exception as e:
+        log.warning(f"MCP fetch failed: {e}")
+        return None
+def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
+    """Fetch a sanctions list over MCP as a one-column ("name") DataFrame, or None.
+    None is returned when ENABLE_MCP is not "1"/"true"/"TRUE", when MCP_SANCTIONS_URL is unset,
+    when the fetch fails or is empty, or when the payload is neither a list of dicts with a
+    "name"/"Name" key nor a list of strings.
+    """
+    enabled = os.getenv("ENABLE_MCP","0") in ("1","true","TRUE")
+    url = os.getenv("MCP_SANCTIONS_URL") if enabled else None
+    if not url:
+        return None
+    payload = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
+    if not payload or not isinstance(payload, list):
+        return None
+    if all(isinstance(item, dict) for item in payload):
+        # Keep only records that carry a non-empty name under either spelling of the key.
+        names = [item.get("name") or item.get("Name") for item in payload]
+        rows = [{"name": value} for value in names if value]
+        if not rows:
+            return None
+        return pd.DataFrame(rows)
+    if all(isinstance(item, str) for item in payload):
+        return pd.DataFrame({"name": payload})
+    return None
+def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
+    """Fetch the high-risk MCC list over MCP as a list of strings, or None.
+    None is returned when ENABLE_MCP is not "1"/"true"/"TRUE", when MCP_HIGH_RISK_MCC_URL is
+    unset, or when the payload is not a JSON list.
+    """
+    if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"):
+        return None
+    url = os.getenv("MCP_HIGH_RISK_MCC_URL")
+    if not url:
+        return None
+    payload = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
+    if not isinstance(payload, list):
+        return None
+    return list(map(str, payload))

modules/credit.py CHANGED Viewed

	@@ -1 +1,45 @@
1	- """TODO: implement modules/credit.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import pandas as pd, numpy as np
+from typing import Dict
+from validation import _prepare_generic
+# Column spec handed to _prepare_generic: canonical credit field -> list of other header spellings.
+# NOTE(review): presumably the lists are accepted aliases; confirm against validation._prepare_generic.
+CR_EXPECTED = {
+    "customer_id":["cust_id","user_id","client_id"],
+    "credit_score":["creditscore","score"],
+    "utilization":["util","credit_utilization","utilization_ratio"],
+    "dti":["debt_to_income","debt_to_income_ratio"],
+    "recent_defaults":["defaults","recentdefaults"],
+    "income":["annual_income","salary"]
+}
+def prepare_credit(df: pd.DataFrame):
+    """Run the shared validation step with the credit column spec.
+    Callers unpack the result as ``(clean_df, issues, quality, colmap)``.
+    """
+    return _prepare_generic(df, CR_EXPECTED)
+def detect_credit(clean_df: pd.DataFrame, colmap: Dict[str,str]):
+    """Score each row against five credit-risk rules and return the rows that hit at least one.
+    Rules: credit_score < 600, utilization > 0.8, dti > 0.4, recent_defaults > 0, income < 30000.
+    A rule whose column is not in ``colmap`` is skipped; missing or non-numeric cells never fire.
+    Args:
+        clean_df: validated input frame.
+        colmap: canonical field name -> actual column name in ``clean_df``.
+    Returns:
+        ``(flagged, stats)``: ``flagged`` is ``clean_df`` plus ``risk_score`` (number of rules
+        hit), ``risk_level`` (High for 3+, Medium for 2, Low for 1) and ``risk_reason``,
+        restricted to rows with at least one hit; ``stats`` is a one-line summary. When none of
+        the rule columns is mapped, an empty frame and an explanatory message are returned.
+    """
+    rules = (
+        ("credit_score", lambda s: s < 600, "credit_score<600"),
+        ("utilization", lambda s: s > 0.8, "utilization>0.8"),
+        ("dti", lambda s: s > 0.4, "DTI>0.4"),
+        ("recent_defaults", lambda s: s > 0, "recent_defaults>0"),
+        ("income", lambda s: s < 30000, "income<30000"),
+    )
+    if not any(key in colmap for key, _, _ in rules):
+        return pd.DataFrame(), "Required columns missing for Credit Risk."
+    df = clean_df.copy()
+    fired = []
+    for key, predicate, label in rules:
+        if key not in colmap:
+            continue
+        # Coerce so that numeric text ("0.95") is scored and junk becomes NaN instead of raising.
+        values = pd.to_numeric(df[colmap[key]], errors="coerce")
+        fired.append((label, predicate(values).fillna(False).to_numpy(dtype=bool)))
+    hits = np.zeros(len(df), dtype=int)
+    for _, mask in fired:
+        hits = hits + mask  # booleans add as 0/1
+    levels = np.select([hits >= 3, hits == 2, hits == 1], ["High", "Medium", "Low"], default="None")
+    reasons = [", ".join(label for label, mask in fired if mask[i]) for i in range(len(df))]
+    res = df.assign(risk_score=hits, risk_level=levels, risk_reason=reasons)
+    flagged = res[res["risk_level"] != "None"]
+    stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
+    return flagged, stats

modules/kyc.py CHANGED Viewed

	@@ -1 +1,48 @@
1	- """TODO: implement modules/kyc.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import pandas as pd
+from typing import Dict
+from validation import _prepare_generic
+import numpy as np
+# Column spec handed to _prepare_generic: canonical KYC field -> list of other header spellings.
+# NOTE(review): presumably the lists are accepted aliases; confirm against validation._prepare_generic.
+KYC_EXPECTED = {
+    "customer_id":["cust_id","user_id","client_id"],
+    "name":["full_name","customer_name"],
+    "email":["email_address","mail"],
+    "phone":["phone_number","mobile","contact"],
+    "dob":["date_of_birth","birthdate"]
+}
+def prepare_kyc(df: pd.DataFrame):
+    """Run the shared validation step with the KYC column spec.
+    Callers unpack the result as ``(clean_df, issues, quality, colmap)``.
+    """
+    return _prepare_generic(df, KYC_EXPECTED)
+def _age_years(dob: pd.Series) -> pd.Series:
+    """Return each date of birth's age in years (elapsed days / 365.25) relative to now, in UTC.
+    Values are normalized to timezone-aware UTC first, so tz-naive input (treated as UTC) and
+    tz-aware input both work; unparseable values yield NaN.
+    """
+    now = pd.Timestamp.now(tz="UTC")
+    dob_utc = pd.to_datetime(dob, errors="coerce", utc=True)
+    return (now - dob_utc).dt.days / 365.25
+def detect_kyc(clean_df: pd.DataFrame, colmap: Dict[str,str]):
+    """Flag KYC rows that trip any of the rules whose columns are mapped.
+    Rules: duplicated non-null email; duplicated non-null phone; date of birth missing,
+    unparseable, in the future or more than 120 years ago; name that is all upper-case,
+    contains a digit or is shorter than 3 characters.
+    Args:
+        clean_df: validated input frame.
+        colmap: canonical field name -> actual column name; ``customer_id`` and ``name`` are required.
+    Returns:
+        ``(flagged, stats)``: flagged rows with ``risk_reason="kyc_rule_hit"`` and a one-line
+        summary, or an empty frame and an explanatory message when required columns are missing.
+    """
+    if not all(k in colmap for k in ["customer_id","name"]):
+        return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
+    df = clean_df.copy()
+    reasons=[]
+    if "email" in colmap:
+        dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
+        reasons.append(dupe_email)
+    if "phone" in colmap:
+        dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
+        reasons.append(dupe_phone)
+    if "dob" in colmap:
+        # Compare like with like: a tz-aware "now" cannot be compared with tz-naive dates, so
+        # the column is normalized to UTC once and used for every DOB check.
+        dob = pd.to_datetime(df[colmap["dob"]], errors="coerce", utc=True)
+        now = pd.Timestamp.now(tz="UTC")
+        invalid = dob.isna() | (dob > now) | (_age_years(dob) > 120)
+        reasons.append(invalid)
+    if "name" in colmap:
+        name = df[colmap["name"]].astype(str)
+        susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
+        reasons.append(susp)
+    mask=None
+    for m in reasons:
+        mask = m if mask is None else (mask | m)
+    flagged = df[mask] if mask is not None else pd.DataFrame()
+    if not flagged.empty:
+        flagged = flagged.assign(risk_reason="kyc_rule_hit")
+    stats = f"KYC flagged: {len(flagged)} of {len(df)}."
+    return flagged, stats

modules/sanctions.py CHANGED Viewed

	@@ -1 +1,41 @@
1	- """TODO: implement modules/sanctions.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import re, pandas as pd
+from typing import Optional, Dict
+from validation import _prepare_generic, _standardize_df
+# Column spec handed to _prepare_generic: canonical field -> list of other header spellings.
+# NOTE(review): presumably the lists are accepted aliases; confirm against validation._prepare_generic.
+SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
+def prepare_sanctions(df: pd.DataFrame):
+    """Run the shared validation step with the sanctions column spec.
+    Callers unpack the result as ``(clean_df, issues, quality, colmap)``.
+    """
+    return _prepare_generic(df, SAN_EXPECTED)
+# Built-in sample list used when no other sanctions source is supplied.
+DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
+def token_overlap(a: str, b: str) -> int:
+    """Count the distinct ASCII alphanumeric tokens that ``a`` and ``b`` share, ignoring case."""
+    pattern = re.compile(r"[A-Za-z0-9]+")
+    left, right = (set(pattern.findall(text.lower())) for text in (a, b))
+    return len(left.intersection(right))
+def detect_sanctions(clean_df: pd.DataFrame, colmap: Dict[str,str], sanctions_df: Optional[pd.DataFrame]=None):
+    """Screen customer names against a sanctions/PEP list.
+    A name is an "exact" match when it equals a listed name ignoring case, otherwise a "fuzzy"
+    match when it shares at least two tokens (see ``token_overlap``) with any listed name.
+    Args:
+        clean_df: validated customers frame.
+        colmap: canonical field name -> actual column name; ``name`` is required.
+        sanctions_df: list to screen against; ``DEMO_SANCTIONS`` is used when None. If it has
+            no ``name`` column after standardization, the first column whose label contains
+            "name" is used.
+    Returns:
+        ``(flagged, stats)``: matching rows plus a ``match_type`` column and a one-line summary;
+        an empty frame and an explanatory message when either side lacks a name column.
+    """
+    if "name" not in colmap:
+        return pd.DataFrame(), "Required column missing for Sanctions (need name)."
+    df = clean_df.copy()
+    sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
+    sanc = _standardize_df(sanc)
+    if "name" not in sanc.columns:
+        for c in sanc.columns:
+            if "name" in str(c):
+                sanc = sanc.rename(columns={c:"name"})
+                break
+    if "name" not in sanc.columns:
+        # Report an unusable list instead of failing with a bare KeyError.
+        return pd.DataFrame(), "Sanctions list has no usable name column."
+    sanc_names = sanc["name"].dropna().astype(str).tolist()
+    exact_names = {s.lower() for s in sanc_names}  # lower-cased once, not once per comparison
+    positions=[]
+    kinds=[]
+    for pos, raw in enumerate(df[colmap["name"]].tolist()):
+        # Missing names (None / NaN / NA) are not screened.
+        if raw is None or (not isinstance(raw, str) and pd.isna(raw)):
+            continue
+        nm = str(raw).strip()
+        if not nm:
+            continue
+        if nm.lower() in exact_names:
+            kind = "exact"
+        elif any(token_overlap(nm, s) >= 2 for s in sanc_names):
+            kind = "fuzzy"
+        else:
+            continue
+        positions.append(pos)
+        kinds.append(kind)
+    # Select by position so a non-unique index cannot duplicate rows or mislabel match types.
+    flagged = df.iloc[positions].copy() if positions else pd.DataFrame()
+    if not flagged.empty:
+        flagged = flagged.assign(match_type=kinds)
+    stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
+    return flagged, stats

modules/transactions.py CHANGED Viewed

	@@ -1 +1,61 @@
1	- """TODO: implement modules/transactions.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import pandas as pd
+from typing import Optional, List, Dict
+from validation import _prepare_generic, _nfkc
+# Column spec handed to _prepare_generic: canonical transaction field -> list of other header spellings.
+# NOTE(review): presumably the lists are accepted aliases; confirm against validation._prepare_generic.
+TX_EXPECTED = {
+    "transaction_id":["txn_id","transactionid","id","tx_id"],
+    "customer_id":["cust_id","user_id","client_id"],
+    "amount":["amt","amount_inr","value"],
+    "timestamp":["date","event_time","created_at","tx_time"],
+    "merchant_category":["mcc","merchant_cat","category"]
+}
+def prepare_transactions(df: pd.DataFrame):
+    """Run the shared validation step with the transactions column spec.
+    Callers unpack the result as ``(clean_df, issues, quality, colmap)``.
+    """
+    return _prepare_generic(df, TX_EXPECTED)
+def detect_transactions(clean_df: pd.DataFrame, colmap: Dict[str,str], high_risk_mcc: Optional[List[str]] = None):
+    """Flag transactions that trip any rule and label each flagged row with the rules it hit.
+    Rules: amount > 10000 (large_amount); amount < 0 (negative_amount); merchant category in
+    the high-risk set (mcc_high_risk, only if the column is mapped); per-customer per-day amount
+    total > 50000 (daily_sum>50k, only if a timestamp column is mapped).
+    Args:
+        clean_df: validated input frame. The timestamp column is used through ``.dt``.
+        colmap: canonical field name -> actual column name; ``customer_id`` and ``amount`` are required.
+        high_risk_mcc: extra category names merged into the built-in high-risk set after
+            normalization (strip, upper-case, spaces to underscores).
+    Returns:
+        ``(flagged, stats)``: flagged rows plus ``risk_reason`` (comma-joined rule tags in
+        alphabetical order) and a one-line summary; an empty frame and an explanatory message
+        when required columns are missing.
+    """
+    high_risk = {"HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"}
+    if high_risk_mcc:
+        high_risk.update(_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc)
+    if not all(k in colmap for k in ["customer_id","amount"]):
+        return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
+    df = clean_df.copy()
+    amt = colmap["amount"]
+    rule_masks = {
+        "large_amount": df[amt] > 10000,
+        "negative_amount": df[amt] < 0,
+    }
+    if "merchant_category" in colmap:
+        category = df[colmap["merchant_category"]].astype(str).str.upper().str.replace(" ","_", regex=False)
+        rule_masks["mcc_high_risk"] = category.isin(high_risk)
+    if "timestamp" in colmap:
+        cid, ts = colmap["customer_id"], colmap["timestamp"]
+        daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
+        rule_masks["daily_sum>50k"] = daily > 50000
+    # Missing comparisons count as "rule not hit".
+    masks = {tag: m.fillna(False).astype(bool) for tag, m in rule_masks.items()}
+    combined = None
+    for m in masks.values():
+        combined = m if combined is None else (combined | m)
+    flagged = df[combined]
+    if not flagged.empty:
+        # Reasons come from the same masks that flagged the rows, so the two always agree and
+        # the daily total is not recomputed per row.
+        tags = sorted(masks)
+        picked = {tag: masks[tag][combined].to_numpy() for tag in tags}
+        rr = [", ".join(tag for tag in tags if picked[tag][i]) or "rule_hit" for i in range(len(flagged))]
+        flagged = flagged.assign(risk_reason=rr)
+    stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
+    return flagged, stats

threat_intel.py CHANGED Viewed

	@@ -1 +1,17 @@
1	- """TODO: implement threat_intel.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, List
+import pandas as pd
+from mcp import mcp_fetch_sanctions, mcp_fetch_high_risk_mcc
+from modules.sanctions import DEMO_SANCTIONS
+@dataclass
+class ThreatIntel:
+    """Snapshot of the threat-intel inputs used by the detectors."""
+    # Sanctions list; load() always fills it (MCP result, else the demo list).
+    sanctions_df: Optional[pd.DataFrame]
+    # High-risk merchant category names.
+    high_risk_mcc: List[str]
+    @staticmethod
+    def load() -> "ThreatIntel":
+        """Build a snapshot from MCP when it returns data, otherwise from built-in defaults."""
+        sanc = mcp_fetch_sanctions()
+        # A DataFrame has no truth value, so `sanc or DEMO_SANCTIONS` would raise as soon as MCP
+        # actually returned a list; test for None explicitly.
+        if sanc is None:
+            sanc = DEMO_SANCTIONS
+        mcc = mcp_fetch_high_risk_mcc() or ["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"]
+        return ThreatIntel(sanctions_df=sanc, high_risk_mcc=mcc)

tools.py CHANGED Viewed

	@@ -1 +1,59 @@
1	- """TODO: implement tools.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import io, pandas as pd
+from pydantic import BaseModel, Field
+from langchain.tools import tool
+from modules.transactions import prepare_transactions, detect_transactions
+from modules.kyc import prepare_kyc, detect_kyc
+from modules.sanctions import prepare_sanctions, detect_sanctions
+from modules.credit import prepare_credit, detect_credit
+def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
+    """Parse CSV text held in memory into a DataFrame using pandas' default reader settings."""
+    buffer = io.StringIO(csv_text)
+    return pd.read_csv(buffer)
+class TransactionCSVInput(BaseModel):
+    # Argument schema for transactions_fraud_tool: a single field carrying raw CSV text.
+    csv_text: str = Field(..., description="Transactions CSV text")
+@tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
+def transactions_fraud_tool(csv_text: str) -> str:
+    """Analyze transactions CSV: large/negative amounts, high-risk MCCs, per-customer daily sum >50k. Returns counts + sample."""
+    # Parse -> validate -> detect; no MCC override is passed, so the detector's built-in set applies.
+    frame = _csv_text_to_df(csv_text)
+    cleaned, issue_rows, _quality, columns = prepare_transactions(frame)
+    hits, stats = detect_transactions(cleaned, columns)
+    sample_csv = hits.head(5).to_csv(index=False)
+    report = f"{stats}\nData quality issues: {len(issue_rows)}\nFirst flagged:\n{sample_csv}"
+    # Cap the text handed back to the agent at 2800 characters.
+    return report[:2800]
+class KYCCSVInput(BaseModel):
+    # Argument schema for kyc_fraud_tool: a single field carrying raw CSV text.
+    csv_text: str = Field(..., description="KYC CSV text")
+@tool("kyc_fraud_tool", args_schema=KYCCSVInput)
+def kyc_fraud_tool(csv_text: str) -> str:
+    """Analyze KYC CSV: duplicate email/phone, invalid DOBs, suspicious names. Returns counts + sample."""
+    # Parse -> validate -> detect, then report the stats line, issue count and first five hits.
+    frame = _csv_text_to_df(csv_text)
+    cleaned, issue_rows, _quality, columns = prepare_kyc(frame)
+    hits, stats = detect_kyc(cleaned, columns)
+    sample_csv = hits.head(5).to_csv(index=False)
+    report = f"{stats}\nData quality issues: {len(issue_rows)}\nFirst flagged:\n{sample_csv}"
+    # Cap the text handed back to the agent at 2800 characters.
+    return report[:2800]
+class SanctionsCSVInput(BaseModel):
+    # Argument schema for sanctions_pep_tool: a single field carrying raw CSV text.
+    csv_text: str = Field(..., description="Customers CSV text with 'name' column")
+@tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
+def sanctions_pep_tool(csv_text: str) -> str:
+    """Check customers against sanctions/PEP list (exact + simple fuzzy). Returns counts + sample."""
+    # Parse -> validate -> screen; no list is passed, so the detector falls back to its demo list.
+    frame = _csv_text_to_df(csv_text)
+    cleaned, issue_rows, _quality, columns = prepare_sanctions(frame)
+    hits, stats = detect_sanctions(cleaned, columns)
+    sample_csv = hits.head(5).to_csv(index=False)
+    report = f"{stats}\nData quality issues: {len(issue_rows)}\nFirst matches:\n{sample_csv}"
+    # Cap the text handed back to the agent at 2800 characters.
+    return report[:2800]
+class CreditCSVInput(BaseModel):
+    # Argument schema for credit_risk_tool: a single field carrying raw CSV text.
+    csv_text: str = Field(..., description="Credit CSV text")
+@tool("credit_risk_tool", args_schema=CreditCSVInput)
+def credit_risk_tool(csv_text: str) -> str:
+    """Score credit risk using simple rules → risk_score, risk_level. Returns counts + sample."""
+    # Parse -> validate -> score, then report the stats line, issue count and first five hits.
+    frame = _csv_text_to_df(csv_text)
+    cleaned, issue_rows, _quality, columns = prepare_credit(frame)
+    hits, stats = detect_credit(cleaned, columns)
+    sample_csv = hits.head(5).to_csv(index=False)
+    report = f"{stats}\nData quality issues: {len(issue_rows)}\nFirst flagged:\n{sample_csv}"
+    # Cap the text handed back to the agent at 2800 characters.
+    return report[:2800]
+def build_tools():
+    """Return the four analysis tools (transactions, KYC, sanctions/PEP, credit) in a list."""
+    return [transactions_fraud_tool, kyc_fraud_tool, sanctions_pep_tool, credit_risk_tool]

ttp_guard.py CHANGED Viewed

	@@ -1 +1,67 @@
1	- """TODO: implement ttp_guard.py (split from app_monolith_backup.py)"""

+from __future__ import annotations
+import re, os
+from dataclasses import dataclass
+from typing import Dict, Any, List
+class GuardDecision:
+    """String constants for the three actions a guard check can produce (see GuardResult.action)."""
+    ALLOW = "allow"
+    BLOCK = "block"
+    ANNOTATE = "annotate"
+@dataclass
+class GuardResult:
+    """Outcome of one TTPGuard.inspect_input call."""
+    action: str  # one of the GuardDecision constants
+    reason: str  # human-readable explanation of the decision
+    indicators: List[str]  # tags of the rules that matched
class TTPGuard:
    """
    Lightweight rule-based guard for adversarial TTPs:
    - Prompt injection / instruction override (e.g., "ignore previous instructions", "you are DAN")
    - Safety bypass ("never refuse", "no moralizing")
    - Secret exfiltration ("print your system prompt", "reveal keys")
    - Credential patterns (AWS, Slack, HuggingFace tokens) in input

    Each rule is a ``(severity, regex, tag)`` triple. An input's severity is the
    highest severity among the rules that match it; inputs whose severity reaches
    ``block_level`` are blocked, lower non-zero severities are only annotated.
    """

    def __init__(self, block_level: int | None = None):
        """Create a guard.

        Args:
            block_level: Severity at or above which input is blocked. Defaults to 3.
                The ``TTP_BLOCK_LEVEL`` environment variable, when set to a valid
                integer, takes precedence over this argument.
        """
        fallback = 3 if block_level is None else int(block_level)
        raw = os.getenv("TTP_BLOCK_LEVEL")
        try:
            # An unset or blank variable means "no override".
            self.block_level = int(raw) if raw is not None and raw.strip() else fallback
        except ValueError:
            # A malformed override must not crash module import (a default guard
            # is instantiated at import time); fall back to the configured level.
            self.block_level = fallback
        self.rules = [
            (3, r"\bignore (all|any|previous) instructions\b", "prompt_injection"),
            (3, r"\boverride (system|assistant) (prompt|instructions)\b", "prompt_injection"),
            (3, r"\byou are (now )?(?:dan|dev mode)\b", "jailbreak_alias"),
            (2, r"\bnever refuse\b|\bdon't refuse\b|\balways comply\b", "safety_bypass"),
            (3, r"\bshow (me )?(your )?(system prompt|hidden instructions)\b", "sys_prompt_exfil"),
            (3, r"\bexfiltrate\b|\bleak\b|\bdump secrets?\b", "exfil_intent"),
            (2, r"BEGIN RSA PRIVATE KEY|BEGIN OPENSSH PRIVATE KEY", "secret_marker"),
            (2, r"AKIA[0-9A-Z]{16}", "aws_access_key"),
            (2, r"sk-[A-Za-z0-9]{20,}", "generic_api_key"),
            (2, r"hf_[A-Za-z0-9]{30,}", "huggingface_token"),
            (2, r"xox[baprs]-[A-Za-z0-9-]{10,}", "slack_token"),
        ]

    def score(self, text: str) -> tuple[int, List[str]]:
        """Return ``(severity, tags)`` for *text*.

        ``severity`` is the highest level among matching rules (0 when nothing
        matches) and ``tags`` is the sorted, de-duplicated list of matching rule
        tags. ``None`` or empty input scores as clean. Matching is case-insensitive.
        """
        if not text:
            return 0, []
        haystack = text if isinstance(text, str) else str(text)
        hits = set()
        sev = 0
        for level, rx, tag in self.rules:
            # IGNORECASE already covers case folding, so the text is not lowercased.
            if re.search(rx, haystack, flags=re.IGNORECASE):
                hits.add(tag)
                sev = max(sev, level)
        return sev, sorted(hits)

    def inspect_input(self, text: str) -> GuardResult:
        """Classify *text* as block / annotate / allow based on its severity score."""
        sev, indicators = self.score(text)
        if sev >= self.block_level:
            return GuardResult(action=GuardDecision.BLOCK, reason=f"TTP severity {sev} >= block_level", indicators=indicators)
        if sev > 0:
            return GuardResult(action=GuardDecision.ANNOTATE, reason=f"TTP indicators: {', '.join(indicators)}", indicators=indicators)
        return GuardResult(action=GuardDecision.ALLOW, reason="clean", indicators=[])

    def describe_policy(self) -> Dict[str, Any]:
        """Return the active block level and the rule table as plain dicts."""
        return {
            "block_level": self.block_level,
            "rules": [{"severity": lvl, "regex": rx, "tag": tag} for (lvl, rx, tag) in self.rules],
        }
# Sensible default: a shared, module-level guard built with no explicit block level.
# It is constructed at import time, so TTP_BLOCK_LEVEL is read from the environment then.
default_guard = TTPGuard()

validation.py CHANGED Viewed

	@@ -1 +1,125 @@
1	- ~~"""TODO:~~ ~~implement~~ ~~validation.py~~ ~~(split from app_monolith_backup.py)"""~~

+from __future__ import annotations
+import re, math, unicodedata
+import pandas as pd
+import numpy as np
+try:
+    import phonenumbers
+    HAVE_PHONENUM = True
+except Exception:
+    HAVE_PHONENUM = False
+def _norm_colname(c: str) -> str:
+    c = c.strip().lower()
+    c = re.sub(r"\s+", "_", c)
+    c = re.sub(r"[^\w]+", "_", c)
+    return c.strip("_")
+def _nfkc(s: str) -> str:
+    return unicodedata.normalize("NFKC", s)
+def _collapse_ws(s: str) -> str:
+    return re.sub(r"\s+", " ", s).strip()
def _clean_str(x):
    """Return *x* as NFKC-normalized, whitespace-collapsed text; missing values pass through."""
    if pd.isna(x):
        # Leave NaN/None untouched so later missing-value checks still see them.
        return x
    text = _nfkc(str(x))
    return _collapse_ws(text)
+def _is_email(s: str) -> bool:
+    return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
+def _clean_phone(s: str, default_region: str = "IN"):
+    if s is None or str(s).strip() == "":
+        return None, "missing_phone"
+    raw = re.sub(r"[^\d+]", "", str(s))
+    if HAVE_PHONENUM:
+        try:
+            pn = phonenumbers.parse(raw, default_region)
+            if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
+                return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
+            return raw, "invalid_phone"
+        except Exception:
+            return raw, "invalid_phone"
+    digits = re.sub(r"\D", "", raw)
+    return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
+def _parse_datetime(s):
+    try:
+        return pd.to_datetime(s, errors="coerce", utc=True)
+    except Exception:
+        return pd.NaT
+def _to_numeric(series: pd.Series):
+    coerced = pd.to_numeric(series, errors="coerce")
+    return coerced, (coerced.isna() & series.notna())
+def _read_csv_any(file_obj) -> pd.DataFrame:
+    if file_obj is None:
+        raise ValueError("No file uploaded.")
+    if hasattr(file_obj, "name"):
+        p = file_obj.name
+        try: return pd.read_csv(p)
+        except Exception: return pd.read_csv(p, encoding="latin-1")
+    try: return pd.read_csv(file_obj)
+    except Exception:
+        file_obj.seek(0)
+        return pd.read_csv(file_obj, encoding="latin-1")
def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalized column names and cleaned text cells.

    Only object-dtype columns are cleaned; the input frame is not modified.
    """
    out = df.copy()
    out.columns = list(map(_norm_colname, out.columns))
    text_columns = out.select_dtypes(include=["object"]).columns
    for name in text_columns:
        out[name] = out[name].apply(_clean_str)
    return out
def _prepare_generic(df: pd.DataFrame, expected: dict[str, list[str]]):
    """Standardize *df*, validate common field types and map expected fields to columns.

    Args:
        df: Raw input frame.
        expected: Mapping of canonical field name -> list of accepted synonyms.

    Returns:
        ``(clean_df, issues_df, quality_summary, colmap)`` where ``issues_df`` has
        the columns ``row, field, issue, value`` and ``colmap`` maps each
        canonical field that was found to the actual (normalized) column name.
    """
    # NOTE: pandas must only be referenced through the module-level ``pd``. A
    # function-local ``import pandas as pd`` makes ``pd`` a local name for the
    # whole function, so the ``pd.isna`` / ``pd.notna`` calls in the email branch
    # failed with UnboundLocalError / NameError before the import line was reached.
    issues = []
    df0 = _standardize_df(df)

    # Resolve each canonical field to the first of (name, *synonyms) present.
    colmap = {}
    cols = set(df0.columns)
    for canon, syns in expected.items():
        found = None
        for s in [canon] + syns:
            s = _norm_colname(s)
            if s in cols:
                found = s
                break
        if found:
            colmap[canon] = found

    # Email and phone columns are detected by substring of the column name.
    for c in list(df0.columns):
        if "email" in c:
            df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
            for idx, v in df0[c].items():
                if pd.isna(v) or str(v).strip() == "":
                    issues.append({"row": idx, "field": c, "issue": "missing_email", "value": ""})
                elif not _is_email(v):
                    issues.append({"row": idx, "field": c, "issue": "invalid_email", "value": str(v)})
        if "phone" in c or "mobile" in c:
            vals = []
            for idx, v in df0[c].items():
                e164, prob = _clean_phone(v)
                vals.append(e164)
                if prob:
                    issues.append({"row": idx, "field": c, "issue": prob, "value": str(v)})
            df0[c] = vals

    # Timestamp-like columns: parse, and report values that were present but unparseable.
    for c in df0.columns:
        if any(k in c for k in ["date", "time", "timestamp", "created_at", "updated_at"]):
            parsed = _parse_datetime(df0[c])
            bad = parsed.isna() & df0[c].notna()
            for idx in df0.index[bad]:
                issues.append({"row": int(idx), "field": c, "issue": "unparseable_timestamp", "value": str(df0.loc[idx, c])})
            df0[c] = parsed

    # Numeric columns: any column whose name contains one of these keywords is coerced.
    for nc in ["amount", "credit_score", "utilization", "dti", "recent_defaults", "income"]:
        for c in df0.columns:
            if c == nc or c.endswith("_" + nc) or nc in c:
                coerced, badmask = _to_numeric(df0[c])
                for idx in df0.index[badmask]:
                    issues.append({"row": int(idx), "field": c, "issue": "non_numeric", "value": str(df0.loc[idx, c])})
                df0[c] = coerced

    issues_df = pd.DataFrame(issues, columns=["row", "field", "issue", "value"])
    missing = [k for k in expected.keys() if k not in colmap]
    quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
    return df0, issues_df, quality_summary, colmap