Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- constants.py +190 -0
- create_transactions.py +158 -0
- data_utils.py +155 -0
- db_utils.py +75 -0
constants.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime, timedelta

import numpy as np

# SQLite database file and table name shared with db_utils.
DB_PATH = "sqlite.db"
DB_NAME = "transactions"

# Parameters
NUM_USERS = 10
NUM_TRANSACTIONS = 100000
# NOTE: evaluated once at import time, so the date window is fixed per process.
START_DATE = datetime.now() - timedelta(days=360)
END_DATE = datetime.now()

# Generate user IDs
USER_IDS = [f"user_{i}" for i in range(1, NUM_USERS + 1)]

# IP addresses and ISPs (for simplicity)
IP_ADDRESSES = [f"192.168.1.{i}" for i in range(1, 101)]
ISPs = ["ISP_A", "ISP_B", "ISP_C", "ISP_D", "ISP_E"]

# Device and browser types
DEVICE_TYPES = ["Tablet", "PC", "Mobile"]
BROWSER_TYPES = ["Chrome", "Safari", "Firefox", "Edge", "Opera"]

# Payee types
PAYEE_TYPES = ["Individual", "Business or corporations", "Financial agency", "Charity", "Educational Institute"]

# Authentication levels
AUTH_LEVELS = ["Low", "Medium", "High"]

# Transaction statuses
STATUSES = ["pending", "completed", "failed"]

# Payment methods
PAYMENT_METHODS = ["Credit Card", "Debit Card", "PayPal", "Bank Transfer", "Cryptocurrency"]

# Transaction categories
CATEGORIES = ["groceries", "utilities", "entertainment", "travel", "healthcare", "education", "shopping", "other"]

# Transaction types
TRANSACTION_TYPES = ["credit", "debit"]

# Merchants mapped to payee types
MERCHANT_PAYEE_MAPPING = {
    "groceries": [f"Supermarket_{i}" for i in range(1, 21)],
    "utilities": [f"UtilityCompany_{i}" for i in range(1, 11)],
    "entertainment": [f"EntertainmentVenue_{i}" for i in range(1, 21)],
    "travel": ([f"TravelAgency_{i}" for i in range(1, 11)] + [f"Airline_{i}" for i in range(1, 11)]),
    "healthcare": (
        [f"Hospital_{i}" for i in range(1, 11)]
        + [f"Clinic_{i}" for i in range(1, 11)]
        + [f"Pharmacy_{i}" for i in range(1, 11)]
    ),
    "education": (
        [f"University_{i}" for i in range(1, 11)]
        + [f"School_{i}" for i in range(1, 11)]
        + [f"Bookstore_{i}" for i in range(1, 11)]
    ),
    "shopping": (
        [f"Mall_{i}" for i in range(1, 11)]
        + [f"ElectronicsStore_{i}" for i in range(1, 11)]
        + [f"ClothingStore_{i}" for i in range(1, 11)]
    ),
    "other": (
        [f"ServiceProvider_{i}" for i in range(1, 11)]
        + [f"Consultant_{i}" for i in range(1, 11)]
        + [f"Freelancer_{i}" for i in range(1, 11)]
    ),
}

# Generate user profile mappings
# Each user gets one "habitual" value per attribute, drawn at import time with
# probability 0.9 on the FIRST option of each list and the remaining 0.1 spread
# evenly over the rest.
# NOTE(review): because the first option is always the favored one, most users
# end up sharing the same habitual IP/device/browser/ISP — confirm intended.
# No RNG seed is set, so profiles differ between process runs.
USER_PROFILES = {
    user_id: {
        "IPAddress": np.random.choice(
            IP_ADDRESSES, p=[0.9] + ([0.1 / (len(IP_ADDRESSES) - 1)] * (len(IP_ADDRESSES) - 1))
        ),
        "DeviceType": np.random.choice(
            DEVICE_TYPES, p=[0.9] + [0.1 / (len(DEVICE_TYPES) - 1)] * (len(DEVICE_TYPES) - 1)
        ),
        "BrowserType": np.random.choice(
            BROWSER_TYPES, p=[0.9] + [0.1 / (len(BROWSER_TYPES) - 1)] * (len(BROWSER_TYPES) - 1)
        ),
        "ISP": np.random.choice(ISPs, p=[0.9] + [0.1 / (len(ISPs) - 1)] * (len(ISPs) - 1)),
    }
    for user_id in USER_IDS
}

# Columns
# Name of the target column for the fraud model.
LABEL_COLUMN = "isFraud"

# Column order of rows produced by create_transactions.generate_transaction.
RAW_DATA_COLUMNS = [
    "TransactionId",
    "UserId",
    "TransactionTimestamp",
    "TransactionAmount",
    "LagAmount",
    "IPAddress",
    "DeviceType",
    "BrowserType",
    "PayeeType",
    "ISP",
    "OTP",
    "AuthenticationLevel",
    "Status",
    "PaymentMethod",
    "Category",
    "Merchant",
    "TransactionType",
    "isFraud",
]
# SQLite column types used by db_utils.create_transaction_table.
RAW_DATA_COLUMN_TYPES = {
    "TransactionId": "TEXT",
    "UserId": "TEXT",
    "TransactionTimestamp": "TEXT",
    "TransactionAmount": "REAL",
    "LagAmount": "REAL",
    "IPAddress": "TEXT",
    "DeviceType": "TEXT",
    "BrowserType": "TEXT",
    "PayeeType": "TEXT",
    "ISP": "TEXT",
    "OTP": "INTEGER",
    "AuthenticationLevel": "TEXT",
    "Status": "TEXT",
    "PaymentMethod": "TEXT",
    "Category": "TEXT",
    "Merchant": "TEXT",
    "TransactionType": "TEXT",
    "isFraud": "INTEGER",
}
# Columns that take a closed set of string values.
CATEGORICAL_COLS = [
    "DeviceType",
    "BrowserType",
    "PayeeType",
    "ISP",
    "Status",
    "PaymentMethod",
    "Category",
    "TransactionType",
]
# Model input features. The "Col_value" names follow the one-hot column naming
# pattern of CATEGORICAL_COLS — presumably produced during feature
# engineering (TODO confirm against prepare_train_data).
FEATURES = [
    "TransactionAmount",
    "LagAmount",
    "OTP",
    "DayOfMonth",
    "DayOfWeek",
    "HourOfDay",
    "TimeSinceLastTx",
    "KnownDeviceType",
    "KnownBrowserType",
    "KnownIP",
    "KnownISP",
    "DeviceType_Mobile",
    "DeviceType_PC",
    "DeviceType_Tablet",
    "BrowserType_Chrome",
    "BrowserType_Edge",
    "BrowserType_Firefox",
    "BrowserType_Opera",
    "BrowserType_Safari",
    "PayeeType_Business or corporations",
    "PayeeType_Charity",
    "PayeeType_Educational Institute",
    "PayeeType_Financial agency",
    "PayeeType_Individual",
    "ISP_ISP_A",
    "ISP_ISP_B",
    "ISP_ISP_C",
    "ISP_ISP_D",
    "ISP_ISP_E",
    "Status_completed",
    "Status_failed",
    "Status_pending",
    "PaymentMethod_Bank Transfer",
    "PaymentMethod_Credit Card",
    "PaymentMethod_Cryptocurrency",
    "PaymentMethod_Debit Card",
    "PaymentMethod_PayPal",
    "Category_education",
    "Category_entertainment",
    "Category_groceries",
    "Category_healthcare",
    "Category_other",
    "Category_shopping",
    "Category_travel",
    "Category_utilities",
    "TransactionType_credit",
    "TransactionType_debit",
    "AuthenticationLevel",
]
# Numeric features that get rescaled before training.
SCALED_COLUMNS = ["DayOfMonth", "DayOfWeek", "HourOfDay", "TimeSinceLastTx", "TransactionAmount", "LagAmount"]
|
create_transactions.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import uuid
|
| 3 |
+
from datetime import timedelta
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from constants import (
|
| 9 |
+
RAW_DATA_COLUMNS,
|
| 10 |
+
START_DATE,
|
| 11 |
+
END_DATE,
|
| 12 |
+
MERCHANT_PAYEE_MAPPING,
|
| 13 |
+
PAYEE_TYPES,
|
| 14 |
+
NUM_TRANSACTIONS,
|
| 15 |
+
USER_IDS,
|
| 16 |
+
IP_ADDRESSES,
|
| 17 |
+
ISPs,
|
| 18 |
+
AUTH_LEVELS,
|
| 19 |
+
STATUSES,
|
| 20 |
+
CATEGORIES,
|
| 21 |
+
PAYMENT_METHODS,
|
| 22 |
+
TRANSACTION_TYPES,
|
| 23 |
+
DEVICE_TYPES,
|
| 24 |
+
BROWSER_TYPES,
|
| 25 |
+
USER_PROFILES,
|
| 26 |
+
)
|
| 27 |
+
from db_utils import insert_multiple_transactions, delete_all_transactions
|
| 28 |
+
from fraud.constants import LABEL_COLUMN
|
| 29 |
+
from train import prepare_train_data
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Function to generate random timestamps
|
| 33 |
+
def random_date(start, end):
    """Return a uniformly random datetime between *start* and *end*, inclusive."""
    span_seconds = int((end - start).total_seconds())
    offset = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Function to get merchant based on category
|
| 38 |
+
# Merchant-name keywords mapped to payee classifications; evaluated in order,
# first match wins (same order as the original if/elif chain).
_PAYEE_RULES = [
    (("Supermarket", "Mall", "Store"), "Business or corporations"),
    (("UtilityCompany",), "Financial agency"),
    (("Hospital", "Clinic", "Pharmacy"), "Individual"),
    (("University", "School", "Bookstore"), "Educational Institute"),
    (("ServiceProvider", "Consultant", "Freelancer"), "Individual"),
]


def get_merchant_and_payee(category):
    """Pick a random merchant for *category* and classify its payee type.

    Merchants whose name matches no rule (e.g. travel agencies and airlines)
    get a uniformly random payee type.
    """
    merchant = random.choice(MERCHANT_PAYEE_MAPPING[category])
    for keywords, payee in _PAYEE_RULES:
        if any(keyword in merchant for keyword in keywords):
            return merchant, payee
    return merchant, random.choice(PAYEE_TYPES)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Function to simulate a transaction
|
| 57 |
+
def _profile_or_random(user_id, field, pool):
    """Return the user's habitual *field* value 90% of the time, else a random pick."""
    return USER_PROFILES[user_id][field] if np.random.rand() < 0.9 else np.random.choice(pool)


# Function to simulate a transaction
def generate_transaction(user_id):
    """Simulate one raw transaction for *user_id*.

    Returns a list of values ordered to match RAW_DATA_COLUMNS.
    """
    device_type = _profile_or_random(user_id, "DeviceType", DEVICE_TYPES)
    browser_type = _profile_or_random(user_id, "BrowserType", BROWSER_TYPES)
    isp = _profile_or_random(user_id, "ISP", ISPs)

    transaction_id = str(uuid.uuid4())
    transaction_timestamp = random_date(START_DATE, END_DATE)
    amount = round(random.uniform(1, 10000), 2)  # Transaction amount between $1 and $10000
    lag_amount = round(amount * random.uniform(0.5, 1.5), 2)  # Lag amount as a factor of transaction amount
    ip_address = random.choice(IP_ADDRESSES)
    otp = np.random.binomial(1, 0.85)  # 85% of transactions use OTP
    auth_level = random.choice(AUTH_LEVELS)
    is_fraud = np.random.binomial(1, 0.02)  # Assume 2% transactions are fraudulent
    status = random.choice(STATUSES)
    payment_method = random.choice(PAYMENT_METHODS)
    category = random.choice(CATEGORIES)
    merchant, payee_type = get_merchant_and_payee(category)
    transaction_type = random.choice(TRANSACTION_TYPES)

    return [
        transaction_id,
        user_id,
        transaction_timestamp,
        amount,
        lag_amount,
        ip_address,
        device_type,
        browser_type,
        payee_type,
        isp,
        otp,
        auth_level,
        status,
        payment_method,
        category,
        merchant,
        transaction_type,
        is_fraud,
    ]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def update_fraud_status(df):
    """Re-draw each row's fraud label from a probability shaped by risk signals.

    Starts from the row's existing label value, bumps the probability up for
    risky signals (large amount, rapid-fire transactions, unfamiliar
    IP/device/browser/ISP) and down for protective ones (OTP, higher
    authentication level), clamps it to [0, 1], then samples 0/1 from it.
    Returns the list of sampled labels, one per row.
    """
    labels = []
    for _, row in df.iterrows():
        prob = row[LABEL_COLUMN]

        # Risk factors increase the fraud probability.
        if row["TransactionAmount"] > 6000:
            prob += 0.3
        if row["TimeSinceLastTx"] < 3600:
            prob += 0.1
        for flag in ("KnownIP", "KnownDeviceType", "KnownBrowserType", "KnownISP"):
            if row[flag] == 0:
                prob += 0.1

        # Protective factors decrease it.
        if row["OTP"]:
            prob -= 0.3
        if row["AuthenticationLevel"] > 2:
            prob -= 0.2
        elif row["AuthenticationLevel"] > 1:
            prob -= 0.1
        else:
            prob -= 0.05

        # Clamp into a valid probability range.
        prob = min(max(prob, 0), 1)

        labels.append(np.random.choice([0, 1], p=[1 - prob, prob]))
    return labels
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def main():
    """Generate NUM_TRANSACTIONS synthetic rows, relabel fraud, and persist them."""
    # delete_all_transactions() # Commented out to avoid deleting transactions
    # Generate raw rows for randomly chosen users.
    rows = [generate_transaction(user_id=random.choice(USER_IDS)) for _ in range(NUM_TRANSACTIONS)]
    df = pd.DataFrame(rows, columns=RAW_DATA_COLUMNS)

    # Re-derive the fraud label from the engineered features of the same data.
    features_df, _ = prepare_train_data(df.copy())
    df[LABEL_COLUMN] = update_fraud_status(features_df)

    # Persist to the database (CSV export left disabled).
    # df.to_csv("transactions_data.csv", index=False)
    insert_multiple_transactions(df)


if __name__ == "__main__":
    main()
|
data_utils.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
|
| 4 |
+
import joblib
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import requests
|
| 7 |
+
|
| 8 |
+
# Load the trained model and feature names
# NOTE(review): this runs as a module-level side effect — importing this module
# fails outright if "model.pkl" is missing; consider lazy loading.
model, feature_names = joblib.load("model.pkl")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Function to fetch data from final_data.csv based on user input
|
| 13 |
+
def fetch_data(user_id, txn_amount):
    """Return the rows of final_data.csv matching *user_id* and *txn_amount*."""
    table = pd.read_csv("final_data.csv")
    mask = (table["user_id"] == user_id) & (table["TxnAmount($)"] == txn_amount)
    return table[mask]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Define the prediction function
|
| 20 |
+
def predict_fraud(input_data):
    """Score a single-row feature frame with the pre-loaded model.

    Columns are aligned (reordered, missing ones zero-filled) to the feature
    names saved with the model before predicting; returns the scalar label.
    """
    aligned = input_data.reindex(columns=feature_names, fill_value=0)
    return model.predict(aligned)[0]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Function to handle real-time data and make predictions
|
| 29 |
+
def handle_real_time_data(real_time_data):
    """Score every row of *real_time_data* and attach a "Prediction" column.

    Mutates and returns the same DataFrame.
    """
    scores = [predict_fraud(pd.DataFrame([row])) for _, row in real_time_data.iterrows()]
    real_time_data["Prediction"] = scores
    return real_time_data
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def fetch_test_data():
    """Simulate a real-time transaction feed by loading rows from test.csv."""
    return pd.read_csv("test.csv")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Function to determine time of day
|
| 45 |
+
# Function to determine time of day
def time_of_day(hour, minute, second):
    """Map a clock time to a normalized time-of-day slot in [0, 1].

    Slots by hour: 5-10 -> 0, 11-12 -> 1/6, 13-14 -> 1/3, 15-16 -> 1/2,
    17-18 -> 2/3, 19-23 -> 5/6, 0-4 -> 1. Returns the string "Invalid time"
    for an out-of-range hour (kept for backward compatibility).

    *minute* and *second* never affected the result in the original
    implementation (every ``hour == X and minute < 1`` branch returned the
    same value as the following hour-range branch), so they are accepted but
    intentionally unused.
    """
    if 5 <= hour < 11:
        return 0
    if 11 <= hour < 13:
        return 0.166666667
    if 13 <= hour < 15:
        return 0.333333333
    if 15 <= hour < 17:
        return 0.5
    if 17 <= hour < 19:
        return 0.666666667
    if 19 <= hour < 24:
        return 0.833333333
    if 0 <= hour < 5:
        return 1
    return "Invalid time"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def payee_type(payee):
    """Encode a payee-type label as its fixed numeric value.

    Returns the string "Invalid Payee Type" for an unrecognized label.
    """
    encoding = {
        "Individual": 0,
        "Business or corporations": 0.369070247,
        "Financial agency": 0.630929753,
        "Charity": 0.834043767,
        "Educational Institute": 1,
    }
    return encoding.get(payee, "Invalid Payee Type")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Get current location details including latitude and longitude
|
| 86 |
+
def get_location():
    """Fetch (city, region, country, ip, latitude, longitude) from ipinfo.io.

    Best-effort: on any failure (network error, missing response keys) the
    problem is printed and six ``None`` values are returned instead of raising.
    """
    try:
        # Explicit timeout so a stalled network call cannot hang the caller
        # forever (requests has no default timeout).
        res = requests.get("https://ipinfo.io/", timeout=10)
        data = res.json()
        # "loc" is a "lat,lon" string.
        loc = data["loc"].split(",")
        latitude = loc[0]
        longitude = loc[1]
        return data["city"], data["region"], data["country"], data["ip"], latitude, longitude
    except Exception as e:
        # Broad catch is deliberate: location is optional and must never be fatal.
        print("Unable to retrieve location:", e)
        return None, None, None, None, None, None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Function to generate synthetic data
|
| 100 |
+
# Function to generate synthetic data
def generate_synthetic_data(user_id, txn_amount):
    """Build a one-row DataFrame of randomized model features for a transaction.

    Only *user_id* and *txn_amount* come from the caller; every other feature
    is drawn at random, so rows are not internally consistent (e.g. averages
    vs. amounts) — acceptable for a demo feed.
    """
    now = datetime.now()
    record = {
        "user_id": user_id,
        "TxnAmount($)": txn_amount,
        "LagAmount1": random.randint(50, 200),
        "AverageAmount(Last3txns)": random.randint(50, 200),
        "TimeSinceLastTxn": random.randint(1, 60),  # minutes
        "VelocityTxns": random.randint(1, 10),
        "WeekDay": random.choice([0, 1, 2, 3, 4, 5, 6]),
        "IPUsedKnown": random.choice([0, 1]),
        "TimeSlots": time_of_day(now.hour, now.minute, now.second),
        "KnownDeviceBrowser": random.choice([0, 1]),
        "PayeeType": payee_type(
            random.choice(
                ["Individual", "Business or corporations", "Financial agency", "Charity", "Educational Institute"]
            )
        ),
        "Timesincepayeeregn": random.randint(1, 365),  # days
        "KnownLocation": random.choice([0, 1]),
        "OTP": random.choice([0, 1]),
        "AuthenticationLevel": random.choice([0, 1, 2]),
        "Timelimit": random.choice([0, 1]),
        "BrowserKnown": random.choice([0, 1]),
        "location_change": random.choice([0, 1]),
    }
    # Wrap each scalar in a one-element list to form a single-row DataFrame.
    return pd.DataFrame({key: [value] for key, value in record.items()})
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Function to simulate fetching real-time data
|
| 153 |
+
def fetch_real_time_data(user_id, txn_amount):
    """Stand-in for a streaming source: return freshly synthesized feature data."""
    return generate_synthetic_data(user_id, txn_amount)
|
db_utils.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from constants import DB_PATH, DB_NAME, RAW_DATA_COLUMNS, RAW_DATA_COLUMN_TYPES
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_connection():
    """Open the SQLite database (created on first use); return (connection, cursor)."""
    connection = sqlite3.connect(DB_PATH)
    return connection, connection.cursor()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def execute_query(query):
    """Execute a single non-returning SQL statement and commit it.

    The connection is closed even when execution raises (the original leaked
    the connection on any SQL error).
    """
    print(f"Executing query: {repr(query)}")
    db, cursor = get_connection()
    try:
        cursor.execute(query)
        cursor.close()
        # Commit the changes
        db.commit()
    finally:
        db.close()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def create_transaction_table():
    """Create the transactions table (if absent) from the raw-data schema."""
    column_defs = ", ".join(f"{col} {RAW_DATA_COLUMN_TYPES[col]}" for col in RAW_DATA_COLUMNS)
    execute_query(f"CREATE TABLE IF NOT EXISTS {DB_NAME} ({column_defs})")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def insert_multiple_transactions(df):
    """Bulk-append *df* into the transactions table, creating the table first.

    The connection is closed even if the insert raises (the original leaked
    it on failure).
    """
    create_transaction_table()
    db, _ = get_connection()
    try:
        df.to_sql(DB_NAME, db, if_exists="append", index=False)
        db.commit()
    finally:
        db.close()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Function to insert a new user into the users table
|
| 44 |
+
# Function to insert a single transaction row into the transactions table
def insert_single_transaction(transaction):
    """Insert one transaction (a dict of column name -> value).

    Uses a parameterized statement: the original interpolated Python tuple
    reprs into the SQL, which breaks on quotes in values and on one-element
    dicts (trailing comma), and is open to SQL injection.
    """
    # SQLite cannot store datetime objects directly; keep the original
    # behavior of stringifying the timestamp.
    transaction["TransactionTimestamp"] = str(transaction["TransactionTimestamp"])
    columns = ", ".join(transaction.keys())
    placeholders = ", ".join("?" for _ in transaction)
    query = f"INSERT INTO {DB_NAME} ({columns}) VALUES ({placeholders})"
    print(f"Executing query: {repr(query)}")
    db, cursor = get_connection()
    try:
        cursor.execute(query, tuple(transaction.values()))
        cursor.close()
        db.commit()
    finally:
        db.close()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def execute_return_query(query):
    """Execute *query*, commit, and return every fetched row.

    The connection is closed even when execution raises (the original leaked
    the connection on any SQL error).
    """
    db, cursor = get_connection()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        cursor.close()
        # Commit the changes
        db.commit()
    finally:
        db.close()
    return rows
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Function to fetch all users from the users table
|
| 62 |
+
# Function to fetch every row of the transactions table as a DataFrame
def fetch_all_transactions():
    """Return the full transactions table as a pandas DataFrame.

    Fixes the original's unused cursor and closes the connection even when
    the read raises.
    """
    db, _ = get_connection()
    try:
        return pd.read_sql_query(f"SELECT * FROM {DB_NAME}", db)
    finally:
        db.close()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def delete_all_transactions():
    """Remove every row from the transactions table.

    Uses execute_query: DELETE returns no rows, so the row-returning helper
    used originally did a needless fetchall.
    """
    execute_query(f"DELETE FROM {DB_NAME}")


if __name__ == "__main__":
    delete_all_transactions()