DataBoySu commited on
Commit ·
4204e21
1
Parent(s): c0db7bb
second
Browse files- tools/haystack.py +432 -0
- tools/tasks.json +239 -0
tools/haystack.py
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Financial Knowledge Graph - Haystack Generator
|
| 3 |
+
================================================
|
| 4 |
+
Generates entities.json, accounts.json, and transactions.json
|
| 5 |
+
for an AI Reinforcement Learning environment.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
    python haystack.py                              # Generate haystack only
    python haystack.py --inject manual_tasks.json   # Inject manual fraud tasks
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import random
|
| 14 |
+
import argparse
|
| 15 |
+
from datetime import datetime, timedelta, timezone
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from faker import Faker
|
| 18 |
+
|
| 19 |
+
# ── Reproducibility ──────────────────────────────────────────────────────────
# Both the stdlib RNG and Faker are seeded with the same value.
SEED = 42
random.seed(SEED)
fake = Faker()
Faker.seed(SEED)

# ── Volume constants ──────────────────────────────────────────────────────────
NUM_ENTITIES = 300
NUM_ACCOUNTS = 400
NUM_TRANSACTIONS = 5_000
PCT_INDIVIDUAL = 0.80 # 80 % Individual, 20 % Corporate
PCT_ACTIVE = 0.95 # 95 % Active accounts

# ── Time window ───────────────────────────────────────────────────────────────
# NOTE(review): NOW is taken from the wall clock at import time, so generated
# timestamps differ between runs even though the RNGs are seeded — confirm
# whether a fixed anchor date is wanted for full reproducibility.
NOW = datetime.now(timezone.utc)
SIX_MONTHS_AGO = NOW - timedelta(days=182)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 38 |
+
# STEP 1 – PROCEDURAL GENERATION
|
| 39 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 40 |
+
|
| 41 |
+
def make_entity_id(n: int) -> str:
    """Return the entity ID for sequence number *n*, e.g. ``ENT-0007``.

    The number is zero-padded to a minimum width of four digits.
    """
    return "ENT-" + format(n, "04d")
|
| 43 |
+
|
| 44 |
+
def make_account_id(n: int) -> str:
    """Return the account ID for sequence number *n*, e.g. ``ACC-0042``.

    The number is zero-padded to a minimum width of four digits.
    """
    return "ACC-" + format(n, "04d")
|
| 46 |
+
|
| 47 |
+
def make_txn_id(n: int) -> str:
    """Return the transaction ID for sequence number *n*, e.g. ``TXN-000123``.

    The number is zero-padded to a minimum width of six digits.
    """
    return "TXN-" + format(n, "06d")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def generate_entities(count: int = NUM_ENTITIES) -> list[dict]:
    """Generate Individual and Corporate entities.

    The first ``round(count * PCT_INDIVIDUAL)`` entities are Individuals; the
    remaining ones are Corporates. Individuals are created first because each
    Corporate picks its directors from the Individual IDs.

    Args:
        count: Total number of entities to create.

    Returns:
        A list of entity dicts with the keys ``entity_id``, ``name``,
        ``type``, ``registration_address`` and ``directors``. Individuals
        have an empty ``directors`` list; Corporates list one to three
        distinct Individual entity IDs (fewer when not enough Individuals
        exist).
    """
    n_individual = round(count * PCT_INDIVIDUAL)

    # ── Individuals first (needed as directors for Corporates) ───────────────
    entities: list[dict] = [
        {
            "entity_id": make_entity_id(i),
            "name": fake.name(),
            "type": "Individual",
            "registration_address": fake.address().replace("\n", ", "),
            "directors": [],
        }
        for i in range(1, n_individual + 1)
    ]

    individual_ids = [e["entity_id"] for e in entities]

    # ── Corporates ────────────────────────────────────────────────────────────
    for i in range(n_individual + 1, count + 1):
        num_directors = random.randint(1, 3)
        # random.sample raises ValueError when k exceeds the population size,
        # which happens for very small ``count`` values; clamp k instead.
        directors = random.sample(
            individual_ids, k=min(num_directors, len(individual_ids))
        )
        entities.append({
            "entity_id": make_entity_id(i),
            "name": fake.company(),
            "type": "Corporate",
            "registration_address": fake.address().replace("\n", ", "),
            "directors": directors,
        })

    return entities
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def generate_accounts(entities: list[dict], count: int = NUM_ACCOUNTS) -> list[dict]:
    """Create ``count`` accounts, each owned by a randomly chosen entity.

    Args:
        entities: Entity dicts; only their ``entity_id`` values are used.
        count: Number of accounts to create.

    Returns:
        A list of account dicts with the keys ``account_id``,
        ``owner_entity_id`` and ``status``. Status is ``"Active"`` with
        probability ``PCT_ACTIVE``, otherwise ``"Closed"``.
    """
    owner_pool = [entity["entity_id"] for entity in entities]

    def _new_account(seq: int) -> dict:
        # Owner is drawn before status so the RNG is consumed in a fixed order.
        account_id = make_account_id(seq)
        owner = random.choice(owner_pool)
        status = "Active" if random.random() < PCT_ACTIVE else "Closed"
        return {
            "account_id": account_id,
            "owner_entity_id": owner,
            "status": status,
        }

    return [_new_account(seq) for seq in range(1, count + 1)]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ── Memo / amount helpers ─────────────────────────────────────────────────────
|
| 99 |
+
|
| 100 |
+
def _corp_to_individual_tx() -> tuple[str, float]:
    """Pick a memo and amount for a Corporate → Individual payment.

    Returns:
        ``(memo, amount)`` where memo is one of the payroll-style labels
        below and amount is drawn uniformly from 2,000–10,000, rounded to
        two decimals.
    """
    memo_options = (
        "Payroll",
        "Salary Q3",
        "Salary Q4",
        "Expense Reimbursement",
        "Bonus Payment",
        "Contractor Fee",
        "Freelance Invoice",
    )
    # Memo is drawn before the amount; keep this order for seeded runs.
    chosen_memo = random.choice(memo_options)
    return chosen_memo, round(random.uniform(2_000, 10_000), 2)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _corp_to_corp_tx() -> tuple[str, float]:
    """Pick a memo and amount for a Corporate → Corporate payment.

    Returns:
        ``(memo, amount)`` where memo is one of the business labels below
        (the invoice label carries a random four-digit number) and amount is
        drawn uniformly from 500–50,000, rounded to two decimals.
    """
    # The invoice number is drawn on every call, even if another memo wins,
    # so the RNG is consumed in the order: invoice number, memo, amount.
    invoice_memo = f"Invoice #{random.randint(1000, 9999)}"
    memo_options = (
        "Server Hosting",
        "Consulting Retainer",
        "Office Supplies",
        invoice_memo,
        "Software License",
        "Marketing Services",
        "Logistics Fee",
        "Partnership Distribution",
    )
    chosen_memo = random.choice(memo_options)
    return chosen_memo, round(random.uniform(500, 50_000), 2)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _individual_to_corp_tx() -> tuple[str, float]:
    """Pick a memo and amount for an Individual → Corporate payment.

    Returns:
        ``(memo, amount)`` where memo is one of the consumer-spending labels
        below and amount is drawn uniformly from 5–200, rounded to two
        decimals.
    """
    memo_options = (
        "Utility Bill",
        "Coffee",
        "Gym Membership",
        "Online Shopping",
        "Streaming Subscription",
        "Insurance Premium",
        "Rent Payment",
    )
    # Memo is drawn before the amount; keep this order for seeded runs.
    chosen_memo = random.choice(memo_options)
    return chosen_memo, round(random.uniform(5, 200), 2)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _individual_to_individual_tx() -> tuple[str, float]:
    """Pick a memo and amount for an Individual → Individual payment.

    Returns:
        ``(memo, amount)`` where memo is one of the peer-to-peer labels below
        (the empty string is a valid choice) and amount is drawn uniformly
        from 10–500, rounded to two decimals.
    """
    memo_options = (
        "Dinner split",
        "Birthday gift",
        "Loan repayment",
        "Shared expenses",
        "Concert tickets",
        "Rent share",
        "",
    )
    # Memo is drawn before the amount; keep this order for seeded runs.
    chosen_memo = random.choice(memo_options)
    return chosen_memo, round(random.uniform(10, 500), 2)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _random_timestamp() -> str:
    """Return a random ISO-8601 timestamp between SIX_MONTHS_AGO and NOW.

    A uniformly distributed offset (in seconds) is added to the start of the
    window and the result is formatted with ``datetime.isoformat()``.
    """
    window = NOW - SIX_MONTHS_AGO
    offset = timedelta(seconds=random.uniform(0, window.total_seconds()))
    return (SIX_MONTHS_AGO + offset).isoformat()
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def generate_transactions(
    accounts: list[dict],
    entities: list[dict],
    count: int = NUM_TRANSACTIONS,
    id_offset: int = 0,
) -> list[dict]:
    """Generate semantically-typed transactions between active accounts.

    Each transaction picks two different Active accounts at random. The memo
    and amount depend on whether the sender's and receiver's owners are
    Individuals or Corporates; an account whose owner or owner type cannot be
    resolved is treated as an Individual.

    Args:
        accounts: Account dicts (``account_id``, ``owner_entity_id``,
            ``status``).
        entities: Entity dicts (``entity_id``, ``type``).
        count: Number of transactions to generate.
        id_offset: Added to the 1-based sequence number when building
            ``txn_id``.

    Returns:
        A list of transaction dicts with the keys ``txn_id``,
        ``sender_account``, ``receiver_account``, ``amount``, ``timestamp``
        and ``memo_text``.

    Raises:
        ValueError: If fewer than two accounts are Active.
    """
    # Lookups: entity_id → entity type, and account_id → owning entity_id.
    type_of_entity: dict[str, str] = {ent["entity_id"]: ent["type"] for ent in entities}
    owner_of_account: dict[str, str] = {
        acct["account_id"]: acct["owner_entity_id"] for acct in accounts
    }

    pool = [acct["account_id"] for acct in accounts if acct["status"] == "Active"]
    if len(pool) < 2:
        raise ValueError("Not enough active accounts to generate transactions.")

    def _party_type(account_id: str) -> str:
        # Unknown owners / types fall back to "Individual".
        return type_of_entity.get(owner_of_account.get(account_id, ""), "Individual")

    def _memo_and_amount(sender_kind: str, receiver_kind: str) -> tuple[str, float]:
        # Any combination not matched explicitly uses the peer-to-peer profile.
        if sender_kind == "Corporate":
            if receiver_kind == "Individual":
                return _corp_to_individual_tx()
            if receiver_kind == "Corporate":
                return _corp_to_corp_tx()
        elif sender_kind == "Individual" and receiver_kind == "Corporate":
            return _individual_to_corp_tx()
        return _individual_to_individual_tx()

    transactions: list[dict] = []
    for seq in range(1, count + 1):
        sender = random.choice(pool)
        # Redraw until the receiver differs from the sender.
        while True:
            receiver = random.choice(pool)
            if receiver != sender:
                break

        memo, amount = _memo_and_amount(_party_type(sender), _party_type(receiver))

        transactions.append({
            "txn_id": make_txn_id(seq + id_offset),
            "sender_account": sender,
            "receiver_account": receiver,
            "amount": amount,
            "timestamp": _random_timestamp(),
            "memo_text": memo,
        })

    return transactions
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 195 |
+
# STEP 3 – INTEGRATION HOOKS
|
| 196 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 197 |
+
|
| 198 |
+
def _camouflage_transactions(
    manual_account_ids: list[str],
    haystack_account_ids: list[str],
    entities: list[dict],
    accounts: list[dict],
    txn_id_start: int,
) -> list[dict]:
    """Generate 'normal' transactions linking manual accounts to the haystack.

    For every manual account, 5-10 transactions are created between it and
    randomly chosen haystack accounts (direction chosen at random), so manual
    accounts don't appear as isolated islands.

    Args:
        manual_account_ids: Account IDs that were injected by hand.
        haystack_account_ids: Account IDs to pick counterparties from.
        entities: Entity dicts used to resolve owner types.
        accounts: Account dicts used to resolve account owners.
        txn_id_start: Sequence number of the first generated ``txn_id``;
            subsequent transactions count up from it.

    Returns:
        A list of transaction dicts shaped like the haystack transactions,
        each carrying an extra ``"_camouflage": True`` marker.

    Raises:
        ValueError: If there are manual accounts to bridge but no haystack
            accounts to bridge them to.
    """
    # Without this guard random.choice would fail with a bare IndexError on
    # the empty sequence; raise a descriptive error instead.
    if manual_account_ids and not haystack_account_ids:
        raise ValueError(
            "No haystack accounts available to bridge manual accounts to."
        )

    entity_type: dict[str, str] = {e["entity_id"]: e["type"] for e in entities}
    acct_to_entity: dict[str, str] = {
        a["account_id"]: a["owner_entity_id"] for a in accounts
    }

    def _party_type(account_id: str) -> str:
        # Unknown owners / types fall back to "Individual".
        return entity_type.get(acct_to_entity.get(account_id, ""), "Individual")

    def _memo_and_amount(sender_kind: str, receiver_kind: str) -> tuple[str, float]:
        # Any combination not matched explicitly uses the peer-to-peer profile.
        if sender_kind == "Corporate":
            if receiver_kind == "Individual":
                return _corp_to_individual_tx()
            if receiver_kind == "Corporate":
                return _corp_to_corp_tx()
        elif sender_kind == "Individual" and receiver_kind == "Corporate":
            return _individual_to_corp_tx()
        return _individual_to_individual_tx()

    camouflage: list[dict] = []
    counter = txn_id_start

    for manual_acct in manual_account_ids:
        n_bridge_txns = random.randint(5, 10)
        for _ in range(n_bridge_txns):
            haystack_acct = random.choice(haystack_account_ids)

            # Randomly decide direction (manual sends or receives)
            if random.random() < 0.5:
                sender_acct, receiver_acct = manual_acct, haystack_acct
            else:
                sender_acct, receiver_acct = haystack_acct, manual_acct

            memo, amount = _memo_and_amount(
                _party_type(sender_acct), _party_type(receiver_acct)
            )

            camouflage.append({
                "txn_id": make_txn_id(counter),
                "sender_account": sender_acct,
                "receiver_account": receiver_acct,
                "amount": amount,
                "timestamp": _random_timestamp(),
                "memo_text": memo,
                "_camouflage": True,  # debug tag – remove if unwanted
            })
            counter += 1

    return camouflage
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def inject_manual_tasks(
|
| 255 |
+
haystack_data: dict,
|
| 256 |
+
manual_json_path: str | Path,
|
| 257 |
+
) -> dict:
|
| 258 |
+
"""
|
| 259 |
+
Load hand-written manual_tasks.json and merge it into the haystack.
|
| 260 |
+
|
| 261 |
+
Expected manual_tasks.json schema:
|
| 262 |
+
{
|
| 263 |
+
"entities": [ { ...entity fields... }, ... ],
|
| 264 |
+
"accounts": [ { ...account fields... }, ... ],
|
| 265 |
+
"transactions": [ { ...transaction fields... }, ... ]
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
Returns the merged dataset dict.
|
| 269 |
+
"""
|
| 270 |
+
manual_path = Path(manual_json_path)
|
| 271 |
+
if not manual_path.exists():
|
| 272 |
+
raise FileNotFoundError(f"Manual tasks file not found: {manual_path}")
|
| 273 |
+
|
| 274 |
+
with manual_path.open() as fh:
|
| 275 |
+
manual: dict = json.load(fh)
|
| 276 |
+
|
| 277 |
+
# ── Validate top-level keys ───────────────────────────────────────────────
|
| 278 |
+
for key in ("entities", "accounts", "transactions"):
|
| 279 |
+
if key not in manual:
|
| 280 |
+
raise ValueError(
|
| 281 |
+
f"manual_tasks.json is missing the '{key}' key. "
|
| 282 |
+
"Please check the expected schema in the docstring."
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
print(f" Injecting {len(manual['entities'])} manual entities …")
|
| 286 |
+
print(f" Injecting {len(manual['accounts'])} manual accounts …")
|
| 287 |
+
print(f" Injecting {len(manual['transactions'])} manual transactions …")
|
| 288 |
+
|
| 289 |
+
# ── Collect IDs already in the haystack to detect collisions ─────────────
|
| 290 |
+
existing_entity_ids = {e["entity_id"] for e in haystack_data["entities"]}
|
| 291 |
+
existing_acct_ids = {a["account_id"] for a in haystack_data["accounts"]}
|
| 292 |
+
existing_txn_ids = {t["txn_id"] for t in haystack_data["transactions"]}
|
| 293 |
+
|
| 294 |
+
for e in manual["entities"]:
|
| 295 |
+
if e["entity_id"] in existing_entity_ids:
|
| 296 |
+
raise ValueError(
|
| 297 |
+
f"Collision: entity_id '{e['entity_id']}' already exists in the haystack. "
|
| 298 |
+
"Use IDs outside the ENT-0001 … ENT-0300 range (e.g. ENT-9001)."
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
for a in manual["accounts"]:
|
| 302 |
+
if a["account_id"] in existing_acct_ids:
|
| 303 |
+
raise ValueError(
|
| 304 |
+
f"Collision: account_id '{a['account_id']}' already exists in the haystack. "
|
| 305 |
+
"Use IDs outside the ACC-0001 … ACC-0400 range (e.g. ACC-9001)."
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
for t in manual["transactions"]:
|
| 309 |
+
if t["txn_id"] in existing_txn_ids:
|
| 310 |
+
raise ValueError(
|
| 311 |
+
f"Collision: txn_id '{t['txn_id']}' already exists in the haystack. "
|
| 312 |
+
"Use IDs outside the TXN-000001 … TXN-005000 range (e.g. TXN-900001)."
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
# ── Append manual data ────────────────────────────────────────────────────
|
| 316 |
+
# Build a combined entity + account list so camouflage txns can look up types
|
| 317 |
+
combined_entities = haystack_data["entities"] + manual["entities"]
|
| 318 |
+
combined_accounts = haystack_data["accounts"] + manual["accounts"]
|
| 319 |
+
|
| 320 |
+
haystack_data["entities"] += manual["entities"]
|
| 321 |
+
haystack_data["accounts"] += manual["accounts"]
|
| 322 |
+
haystack_data["transactions"] += manual["transactions"]
|
| 323 |
+
|
| 324 |
+
# ── Camouflage: bridge manual accounts to haystack accounts ──────────────
|
| 325 |
+
manual_acct_ids = [a["account_id"] for a in manual["accounts"]]
|
| 326 |
+
haystack_acct_ids = [
|
| 327 |
+
a["account_id"]
|
| 328 |
+
for a in haystack_data["accounts"]
|
| 329 |
+
if a["account_id"] not in set(manual_acct_ids) and a["status"] == "Active"
|
| 330 |
+
]
|
| 331 |
+
|
| 332 |
+
txn_id_start = (
|
| 333 |
+
max(
|
| 334 |
+
int(t["txn_id"].split("-")[1])
|
| 335 |
+
for t in haystack_data["transactions"]
|
| 336 |
+
) + 1
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
camouflage = _camouflage_transactions(
|
| 340 |
+
manual_account_ids = manual_acct_ids,
|
| 341 |
+
haystack_account_ids = haystack_acct_ids,
|
| 342 |
+
entities = combined_entities,
|
| 343 |
+
accounts = combined_accounts,
|
| 344 |
+
txn_id_start = txn_id_start,
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
print(f" Generated {len(camouflage)} camouflage transactions …")
|
| 348 |
+
haystack_data["transactions"] += camouflage
|
| 349 |
+
|
| 350 |
+
return haystack_data
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 354 |
+
# I/O HELPERS
|
| 355 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 356 |
+
|
| 357 |
+
def _write_json(obj: list | dict, path: Path) -> None:
|
| 358 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 359 |
+
with path.open("w", encoding="utf-8") as fh:
|
| 360 |
+
json.dump(obj, fh, indent=2, ensure_ascii=False)
|
| 361 |
+
print(f" ✓ Wrote {len(obj):,} records → {path}")
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def save_dataset(data: dict, output_dir: str | Path = Path(".")) -> None:
    """Write the three JSON files to output_dir.

    Args:
        data: Dataset dict with the keys ``entities``, ``accounts`` and
            ``transactions``.
        output_dir: Destination directory, given as a ``Path`` or a plain
            string. ``entities.json``, ``accounts.json`` and
            ``transactions.json`` are written inside it.
    """
    # Accept plain strings too; the ``/`` operator below needs a Path.
    output_dir = Path(output_dir)
    for section in ("entities", "accounts", "transactions"):
        _write_json(data[section], output_dir / f"{section}.json")
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 372 |
+
# MAIN
|
| 373 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 374 |
+
|
| 375 |
+
def main() -> None:
    """Command-line entry point: build the haystack, optionally inject, save.

    Options:
        --inject MANUAL_JSON   merge a hand-written tasks file into the data.
        --output-dir DIR       where the three JSON files are written.
    """
    cli = argparse.ArgumentParser(
        description="Generate a financial Knowledge Graph haystack."
    )
    cli.add_argument(
        "--inject",
        metavar="MANUAL_JSON",
        help="Path to hand-written manual_tasks.json to inject into the haystack.",
    )
    cli.add_argument(
        "--output-dir",
        default=".",
        metavar="DIR",
        help="Directory where the three JSON files will be written (default: cwd).",
    )
    args = cli.parse_args()
    output_dir = Path(args.output_dir)

    # ── 1. Build the haystack ─────────────────────────────────────────────────
    print("\n── Step 1: Generating entities …")
    entities = generate_entities()
    individual_count = sum(1 for ent in entities if ent["type"] == "Individual")
    corporate_count = sum(1 for ent in entities if ent["type"] == "Corporate")
    print(
        f" ✓ {len(entities)} entities "
        f"({individual_count} individuals, "
        f"{corporate_count} corporates)"
    )

    print("── Step 2: Generating accounts …")
    accounts = generate_accounts(entities)
    active_count = sum(1 for acct in accounts if acct["status"] == "Active")
    print(f" ✓ {len(accounts)} accounts ({active_count} active)")

    print("── Step 3: Generating transactions …")
    transactions = generate_transactions(accounts, entities)
    print(f" ✓ {len(transactions):,} transactions")

    dataset: dict = {
        "entities": entities,
        "accounts": accounts,
        "transactions": transactions,
    }

    # ── 2. Optionally inject manual fraud tasks ───────────────────────────────
    if args.inject:
        print(f"\n── Injecting manual tasks from: {args.inject}")
        dataset = inject_manual_tasks(dataset, args.inject)

    # ── 3. Write outputs ──────────────────────────────────────────────────────
    print(f"\n── Writing JSON files to: {output_dir.resolve()}")
    save_dataset(dataset, output_dir)

    print("\n✅ Done.\n")
    print(" Dataset summary:")
    for label, section in (
        ("Entities", "entities"),
        ("Accounts", "accounts"),
        ("Transactions", "transactions"),
    ):
        print(f" {label}: {len(dataset[section]):>6,}")


if __name__ == "__main__":
    main()
|
tools/tasks.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_comment": "Hand-crafted SOTA-Killer Fraud Tasks. Injected into the procedural haystack.",
|
| 3 |
+
"_id_conventions": {
|
| 4 |
+
"entities": "Use ENT-9001 and above",
|
| 5 |
+
"accounts": "Use ACC-9001 and above",
|
| 6 |
+
"transactions": "Use TXN-900001 and above"
|
| 7 |
+
},
|
| 8 |
+
"entities": [
|
| 9 |
+
{
|
| 10 |
+
"_task": "EASY - The False Positive",
|
| 11 |
+
"entity_id": "ENT-9001",
|
| 12 |
+
"name": "Springfield Construction LLC",
|
| 13 |
+
"type": "Corporate",
|
| 14 |
+
"registration_address": "404 Industrial Way, IL",
|
| 15 |
+
"directors": []
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"_task": "EASY - The False Positive",
|
| 19 |
+
"entity_id": "ENT-9002",
|
| 20 |
+
"name": "Global Tractor Sales Ltd.",
|
| 21 |
+
"type": "Corporate",
|
| 22 |
+
"registration_address": "High-Risk Jurisdiction, Cyprus",
|
| 23 |
+
"directors": []
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"_task": "MEDIUM - The Smurfs (Structuring)",
|
| 27 |
+
"entity_id": "ENT-9010",
|
| 28 |
+
"name": "Downtown Used Auto",
|
| 29 |
+
"type": "Corporate",
|
| 30 |
+
"registration_address": "101 Main St",
|
| 31 |
+
"directors": []
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"_task": "MEDIUM - The Smurfs (Structuring)",
|
| 35 |
+
"entity_id": "ENT-9011",
|
| 36 |
+
"name": "Sam Student",
|
| 37 |
+
"type": "Individual",
|
| 38 |
+
"registration_address": "Dorm 4B, State Uni",
|
| 39 |
+
"directors": []
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"_task": "MEDIUM - The Smurfs (Structuring)",
|
| 43 |
+
"entity_id": "ENT-9012",
|
| 44 |
+
"name": "Alex Undergrad",
|
| 45 |
+
"type": "Individual",
|
| 46 |
+
"registration_address": "Dorm 4B, State Uni",
|
| 47 |
+
"directors": []
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"_task": "MEDIUM - The Smurfs (Structuring)",
|
| 51 |
+
"entity_id": "ENT-9013",
|
| 52 |
+
"name": "Taylor Freshman",
|
| 53 |
+
"type": "Individual",
|
| 54 |
+
"registration_address": "Dorm 4B, State Uni",
|
| 55 |
+
"directors": []
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"_task": "HARD - The Corporate Mirage (The Mastermind)",
|
| 59 |
+
"entity_id": "ENT-9025",
|
| 60 |
+
"name": "Robert House",
|
| 61 |
+
"type": "Individual",
|
| 62 |
+
"registration_address": "1 Billionaire Row, NY",
|
| 63 |
+
"directors": []
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"_task": "HARD - The Corporate Mirage (Parent Company)",
|
| 67 |
+
"entity_id": "ENT-9024",
|
| 68 |
+
"name": "Apex Management Corp",
|
| 69 |
+
"type": "Corporate",
|
| 70 |
+
"registration_address": "100 Wall St, NY",
|
| 71 |
+
"directors": [
|
| 72 |
+
"ENT-9025"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"_task": "HARD - The Corporate Mirage (Sender)",
|
| 77 |
+
"entity_id": "ENT-9021",
|
| 78 |
+
"name": "Major Logistics Firm LLC",
|
| 79 |
+
"type": "Corporate",
|
| 80 |
+
"registration_address": "Port Authority, NJ",
|
| 81 |
+
"directors": [
|
| 82 |
+
"ENT-9024"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"_task": "HARD - The Corporate Mirage (The Middleman)",
|
| 87 |
+
"entity_id": "ENT-9022",
|
| 88 |
+
"name": "Generic Consulting Agency",
|
| 89 |
+
"type": "Corporate",
|
| 90 |
+
"registration_address": "500 Madison Ave, NY",
|
| 91 |
+
"directors": []
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"_task": "HARD - The Corporate Mirage (The Offshore Loop)",
|
| 95 |
+
"entity_id": "ENT-9023",
|
| 96 |
+
"name": "Island Holding Company",
|
| 97 |
+
"type": "Corporate",
|
| 98 |
+
"registration_address": "Grand Cayman",
|
| 99 |
+
"directors": [
|
| 100 |
+
"ENT-9025"
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"_task": "HARD - The Corporate Mirage (The False Flag Trap)",
|
| 105 |
+
"entity_id": "ENT-9026",
|
| 106 |
+
"name": "Al-Qaeda Watchlist Target",
|
| 107 |
+
"type": "Individual",
|
| 108 |
+
"registration_address": "Unknown",
|
| 109 |
+
"directors": []
|
| 110 |
+
}
|
| 111 |
+
],
|
| 112 |
+
"accounts": [
|
| 113 |
+
{
|
| 114 |
+
"_task": "EASY",
|
| 115 |
+
"account_id": "ACC-9001",
|
| 116 |
+
"owner_entity_id": "ENT-9001",
|
| 117 |
+
"status": "Active"
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"_task": "EASY",
|
| 121 |
+
"account_id": "ACC-9002",
|
| 122 |
+
"owner_entity_id": "ENT-9002",
|
| 123 |
+
"status": "Active"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"_task": "MEDIUM",
|
| 127 |
+
"account_id": "ACC-9010",
|
| 128 |
+
"owner_entity_id": "ENT-9010",
|
| 129 |
+
"status": "Active"
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"_task": "MEDIUM",
|
| 133 |
+
"account_id": "ACC-9011",
|
| 134 |
+
"owner_entity_id": "ENT-9011",
|
| 135 |
+
"status": "Active"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"_task": "MEDIUM",
|
| 139 |
+
"account_id": "ACC-9012",
|
| 140 |
+
"owner_entity_id": "ENT-9012",
|
| 141 |
+
"status": "Active"
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"_task": "MEDIUM",
|
| 145 |
+
"account_id": "ACC-9013",
|
| 146 |
+
"owner_entity_id": "ENT-9013",
|
| 147 |
+
"status": "Active"
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"_task": "HARD",
|
| 151 |
+
"account_id": "ACC-9021",
|
| 152 |
+
"owner_entity_id": "ENT-9021",
|
| 153 |
+
"status": "Active"
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"_task": "HARD",
|
| 157 |
+
"account_id": "ACC-9022",
|
| 158 |
+
"owner_entity_id": "ENT-9022",
|
| 159 |
+
"status": "Active"
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"_task": "HARD",
|
| 163 |
+
"account_id": "ACC-9023",
|
| 164 |
+
"owner_entity_id": "ENT-9023",
|
| 165 |
+
"status": "Active"
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"_task": "HARD",
|
| 169 |
+
"account_id": "ACC-9026",
|
| 170 |
+
"owner_entity_id": "ENT-9026",
|
| 171 |
+
"status": "Active"
|
| 172 |
+
}
|
| 173 |
+
],
|
| 174 |
+
"transactions": [
|
| 175 |
+
{
|
| 176 |
+
"_task": "EASY - Looks bad (High Risk Area) but is just buying tractors.",
|
| 177 |
+
"txn_id": "TXN-900001",
|
| 178 |
+
"sender_account": "ACC-9001",
|
| 179 |
+
"receiver_account": "ACC-9002",
|
| 180 |
+
"amount": 50000.00,
|
| 181 |
+
"timestamp": "2025-04-01T09:00:00+00:00",
|
| 182 |
+
"memo_text": "Heavy Machinery Purchase - Unit 4"
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"_task": "MEDIUM - Smurf 1",
|
| 186 |
+
"txn_id": "TXN-900010",
|
| 187 |
+
"sender_account": "ACC-9011",
|
| 188 |
+
"receiver_account": "ACC-9010",
|
| 189 |
+
"amount": 9900.00,
|
| 190 |
+
"timestamp": "2025-04-05T10:00:00+00:00",
|
| 191 |
+
"memo_text": "Vehicle deposit"
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"_task": "MEDIUM - Smurf 2",
|
| 195 |
+
"txn_id": "TXN-900011",
|
| 196 |
+
"sender_account": "ACC-9012",
|
| 197 |
+
"receiver_account": "ACC-9010",
|
| 198 |
+
"amount": 9500.00,
|
| 199 |
+
"timestamp": "2025-04-05T11:30:00+00:00",
|
| 200 |
+
"memo_text": "Used car payment"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"_task": "MEDIUM - Smurf 3",
|
| 204 |
+
"txn_id": "TXN-900012",
|
| 205 |
+
"sender_account": "ACC-9013",
|
| 206 |
+
"receiver_account": "ACC-9010",
|
| 207 |
+
"amount": 9850.00,
|
| 208 |
+
"timestamp": "2025-04-05T14:15:00+00:00",
|
| 209 |
+
"memo_text": "Down payment"
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"_task": "HARD - The False Flag (The Bait). Distracts from the $2.5M crime.",
|
| 213 |
+
"txn_id": "TXN-900020",
|
| 214 |
+
"sender_account": "ACC-9021",
|
| 215 |
+
"receiver_account": "ACC-9026",
|
| 216 |
+
"amount": 100.00,
|
| 217 |
+
"timestamp": "2025-04-10T08:00:00+00:00",
|
| 218 |
+
"memo_text": "Charitable Donation"
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"_task": "HARD - The Main Transfer out.",
|
| 222 |
+
"txn_id": "TXN-900021",
|
| 223 |
+
"sender_account": "ACC-9021",
|
| 224 |
+
"receiver_account": "ACC-9022",
|
| 225 |
+
"amount": 2500000.00,
|
| 226 |
+
"timestamp": "2025-04-10T09:00:00+00:00",
|
| 227 |
+
"memo_text": "Q2 Consulting and Advisory Retainer"
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"_task": "HARD - The Hidden Loop integration. Same Mastermind Director (ENT-9025).",
|
| 231 |
+
"txn_id": "TXN-900022",
|
| 232 |
+
"sender_account": "ACC-9022",
|
| 233 |
+
"receiver_account": "ACC-9023",
|
| 234 |
+
"amount": 2400000.00,
|
| 235 |
+
"timestamp": "2025-04-12T10:00:00+00:00",
|
| 236 |
+
"memo_text": "Offshore IP Licensing Fee"
|
| 237 |
+
}
|
| 238 |
+
]
|
| 239 |
+
}
|