Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- constants.py +190 -0
- create_transactions.py +158 -0
- data_utils.py +155 -0
- db_utils.py +75 -0
constants.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime, timedelta

import numpy as np

# SQLite database file and table name shared with db_utils.
DB_PATH = "sqlite.db"
DB_NAME = "transactions"

# Parameters
NUM_USERS = 10
NUM_TRANSACTIONS = 100000
# NOTE: evaluated once at import time, so the date window is fixed per process.
START_DATE = datetime.now() - timedelta(days=360)
END_DATE = datetime.now()

# Generate user IDs
USER_IDS = [f"user_{i}" for i in range(1, NUM_USERS + 1)]

# IP addresses and ISPs (for simplicity)
IP_ADDRESSES = [f"192.168.1.{i}" for i in range(1, 101)]
ISPs = ["ISP_A", "ISP_B", "ISP_C", "ISP_D", "ISP_E"]

# Device and browser types
DEVICE_TYPES = ["Tablet", "PC", "Mobile"]
BROWSER_TYPES = ["Chrome", "Safari", "Firefox", "Edge", "Opera"]

# Payee types
PAYEE_TYPES = ["Individual", "Business or corporations", "Financial agency", "Charity", "Educational Institute"]

# Authentication levels
AUTH_LEVELS = ["Low", "Medium", "High"]

# Transaction statuses
STATUSES = ["pending", "completed", "failed"]

# Payment methods
PAYMENT_METHODS = ["Credit Card", "Debit Card", "PayPal", "Bank Transfer", "Cryptocurrency"]

# Transaction categories
CATEGORIES = ["groceries", "utilities", "entertainment", "travel", "healthcare", "education", "shopping", "other"]

# Transaction types
TRANSACTION_TYPES = ["credit", "debit"]

# Merchants mapped to payee types
MERCHANT_PAYEE_MAPPING = {
    "groceries": [f"Supermarket_{i}" for i in range(1, 21)],
    "utilities": [f"UtilityCompany_{i}" for i in range(1, 11)],
    "entertainment": [f"EntertainmentVenue_{i}" for i in range(1, 21)],
    "travel": ([f"TravelAgency_{i}" for i in range(1, 11)] + [f"Airline_{i}" for i in range(1, 11)]),
    "healthcare": (
        [f"Hospital_{i}" for i in range(1, 11)]
        + [f"Clinic_{i}" for i in range(1, 11)]
        + [f"Pharmacy_{i}" for i in range(1, 11)]
    ),
    "education": (
        [f"University_{i}" for i in range(1, 11)]
        + [f"School_{i}" for i in range(1, 11)]
        + [f"Bookstore_{i}" for i in range(1, 11)]
    ),
    "shopping": (
        [f"Mall_{i}" for i in range(1, 11)]
        + [f"ElectronicsStore_{i}" for i in range(1, 11)]
        + [f"ClothingStore_{i}" for i in range(1, 11)]
    ),
    "other": (
        [f"ServiceProvider_{i}" for i in range(1, 11)]
        + [f"Consultant_{i}" for i in range(1, 11)]
        + [f"Freelancer_{i}" for i in range(1, 11)]
    ),
}

# Generate user profile mappings
# Each user gets one "habitual" value per attribute, drawn at import time with
# probability 0.9 on the FIRST option of each list and the remaining 0.1 spread
# evenly over the rest.
# NOTE(review): because the first option is always the favored one, most users
# end up sharing the same habitual IP/device/browser/ISP — confirm intended.
# No RNG seed is set, so profiles differ between process runs.
USER_PROFILES = {
    user_id: {
        "IPAddress": np.random.choice(
            IP_ADDRESSES, p=[0.9] + ([0.1 / (len(IP_ADDRESSES) - 1)] * (len(IP_ADDRESSES) - 1))
        ),
        "DeviceType": np.random.choice(
            DEVICE_TYPES, p=[0.9] + [0.1 / (len(DEVICE_TYPES) - 1)] * (len(DEVICE_TYPES) - 1)
        ),
        "BrowserType": np.random.choice(
            BROWSER_TYPES, p=[0.9] + [0.1 / (len(BROWSER_TYPES) - 1)] * (len(BROWSER_TYPES) - 1)
        ),
        "ISP": np.random.choice(ISPs, p=[0.9] + [0.1 / (len(ISPs) - 1)] * (len(ISPs) - 1)),
    }
    for user_id in USER_IDS
}

# Columns
# Name of the target column for the fraud model.
LABEL_COLUMN = "isFraud"

# Column order of rows produced by create_transactions.generate_transaction.
RAW_DATA_COLUMNS = [
    "TransactionId",
    "UserId",
    "TransactionTimestamp",
    "TransactionAmount",
    "LagAmount",
    "IPAddress",
    "DeviceType",
    "BrowserType",
    "PayeeType",
    "ISP",
    "OTP",
    "AuthenticationLevel",
    "Status",
    "PaymentMethod",
    "Category",
    "Merchant",
    "TransactionType",
    "isFraud",
]
# SQLite column types used by db_utils.create_transaction_table.
RAW_DATA_COLUMN_TYPES = {
    "TransactionId": "TEXT",
    "UserId": "TEXT",
    "TransactionTimestamp": "TEXT",
    "TransactionAmount": "REAL",
    "LagAmount": "REAL",
    "IPAddress": "TEXT",
    "DeviceType": "TEXT",
    "BrowserType": "TEXT",
    "PayeeType": "TEXT",
    "ISP": "TEXT",
    "OTP": "INTEGER",
    "AuthenticationLevel": "TEXT",
    "Status": "TEXT",
    "PaymentMethod": "TEXT",
    "Category": "TEXT",
    "Merchant": "TEXT",
    "TransactionType": "TEXT",
    "isFraud": "INTEGER",
}
# Columns that take a closed set of string values.
CATEGORICAL_COLS = [
    "DeviceType",
    "BrowserType",
    "PayeeType",
    "ISP",
    "Status",
    "PaymentMethod",
    "Category",
    "TransactionType",
]
# Model input features. The "Col_value" names follow the one-hot column naming
# pattern of CATEGORICAL_COLS — presumably produced during feature
# engineering (TODO confirm against prepare_train_data).
FEATURES = [
    "TransactionAmount",
    "LagAmount",
    "OTP",
    "DayOfMonth",
    "DayOfWeek",
    "HourOfDay",
    "TimeSinceLastTx",
    "KnownDeviceType",
    "KnownBrowserType",
    "KnownIP",
    "KnownISP",
    "DeviceType_Mobile",
    "DeviceType_PC",
    "DeviceType_Tablet",
    "BrowserType_Chrome",
    "BrowserType_Edge",
    "BrowserType_Firefox",
    "BrowserType_Opera",
    "BrowserType_Safari",
    "PayeeType_Business or corporations",
    "PayeeType_Charity",
    "PayeeType_Educational Institute",
    "PayeeType_Financial agency",
    "PayeeType_Individual",
    "ISP_ISP_A",
    "ISP_ISP_B",
    "ISP_ISP_C",
    "ISP_ISP_D",
    "ISP_ISP_E",
    "Status_completed",
    "Status_failed",
    "Status_pending",
    "PaymentMethod_Bank Transfer",
    "PaymentMethod_Credit Card",
    "PaymentMethod_Cryptocurrency",
    "PaymentMethod_Debit Card",
    "PaymentMethod_PayPal",
    "Category_education",
    "Category_entertainment",
    "Category_groceries",
    "Category_healthcare",
    "Category_other",
    "Category_shopping",
    "Category_travel",
    "Category_utilities",
    "TransactionType_credit",
    "TransactionType_debit",
    "AuthenticationLevel",
]
# Numeric features that get rescaled before training.
SCALED_COLUMNS = ["DayOfMonth", "DayOfWeek", "HourOfDay", "TimeSinceLastTx", "TransactionAmount", "LagAmount"]
|
create_transactions.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import uuid
|
| 3 |
+
from datetime import timedelta
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from constants import (
|
| 9 |
+
RAW_DATA_COLUMNS,
|
| 10 |
+
START_DATE,
|
| 11 |
+
END_DATE,
|
| 12 |
+
MERCHANT_PAYEE_MAPPING,
|
| 13 |
+
PAYEE_TYPES,
|
| 14 |
+
NUM_TRANSACTIONS,
|
| 15 |
+
USER_IDS,
|
| 16 |
+
IP_ADDRESSES,
|
| 17 |
+
ISPs,
|
| 18 |
+
AUTH_LEVELS,
|
| 19 |
+
STATUSES,
|
| 20 |
+
CATEGORIES,
|
| 21 |
+
PAYMENT_METHODS,
|
| 22 |
+
TRANSACTION_TYPES,
|
| 23 |
+
DEVICE_TYPES,
|
| 24 |
+
BROWSER_TYPES,
|
| 25 |
+
USER_PROFILES,
|
| 26 |
+
)
|
| 27 |
+
from db_utils import insert_multiple_transactions, delete_all_transactions
|
| 28 |
+
from fraud.constants import LABEL_COLUMN
|
| 29 |
+
from train import prepare_train_data
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Function to generate random timestamps
|
| 33 |
+
def random_date(start, end):
    """Return a uniformly random datetime between *start* and *end*, inclusive."""
    span_seconds = int((end - start).total_seconds())
    offset = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Function to get merchant based on category
|
| 38 |
+
# Merchant-name keywords mapped to payee classifications; evaluated in order,
# first match wins (same order as the original if/elif chain).
_PAYEE_RULES = [
    (("Supermarket", "Mall", "Store"), "Business or corporations"),
    (("UtilityCompany",), "Financial agency"),
    (("Hospital", "Clinic", "Pharmacy"), "Individual"),
    (("University", "School", "Bookstore"), "Educational Institute"),
    (("ServiceProvider", "Consultant", "Freelancer"), "Individual"),
]


def get_merchant_and_payee(category):
    """Pick a random merchant for *category* and classify its payee type.

    Merchants whose name matches no rule (e.g. travel agencies and airlines)
    get a uniformly random payee type.
    """
    merchant = random.choice(MERCHANT_PAYEE_MAPPING[category])
    for keywords, payee in _PAYEE_RULES:
        if any(keyword in merchant for keyword in keywords):
            return merchant, payee
    return merchant, random.choice(PAYEE_TYPES)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Function to simulate a transaction
|
| 57 |
+
def _profile_or_random(user_id, field, pool):
    """Return the user's habitual *field* value 90% of the time, else a random pick."""
    return USER_PROFILES[user_id][field] if np.random.rand() < 0.9 else np.random.choice(pool)


# Function to simulate a transaction
def generate_transaction(user_id):
    """Simulate one raw transaction for *user_id*.

    Returns a list of values ordered to match RAW_DATA_COLUMNS.
    """
    device_type = _profile_or_random(user_id, "DeviceType", DEVICE_TYPES)
    browser_type = _profile_or_random(user_id, "BrowserType", BROWSER_TYPES)
    isp = _profile_or_random(user_id, "ISP", ISPs)

    transaction_id = str(uuid.uuid4())
    transaction_timestamp = random_date(START_DATE, END_DATE)
    amount = round(random.uniform(1, 10000), 2)  # Transaction amount between $1 and $10000
    lag_amount = round(amount * random.uniform(0.5, 1.5), 2)  # Lag amount as a factor of transaction amount
    ip_address = random.choice(IP_ADDRESSES)
    otp = np.random.binomial(1, 0.85)  # 85% of transactions use OTP
    auth_level = random.choice(AUTH_LEVELS)
    is_fraud = np.random.binomial(1, 0.02)  # Assume 2% transactions are fraudulent
    status = random.choice(STATUSES)
    payment_method = random.choice(PAYMENT_METHODS)
    category = random.choice(CATEGORIES)
    merchant, payee_type = get_merchant_and_payee(category)
    transaction_type = random.choice(TRANSACTION_TYPES)

    return [
        transaction_id,
        user_id,
        transaction_timestamp,
        amount,
        lag_amount,
        ip_address,
        device_type,
        browser_type,
        payee_type,
        isp,
        otp,
        auth_level,
        status,
        payment_method,
        category,
        merchant,
        transaction_type,
        is_fraud,
    ]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def update_fraud_status(df):
    """Re-draw each row's fraud label from a probability shaped by risk signals.

    Starts from the row's existing label value, bumps the probability up for
    risky signals (large amount, rapid-fire transactions, unfamiliar
    IP/device/browser/ISP) and down for protective ones (OTP, higher
    authentication level), clamps it to [0, 1], then samples 0/1 from it.
    Returns the list of sampled labels, one per row.
    """
    labels = []
    for _, row in df.iterrows():
        prob = row[LABEL_COLUMN]

        # Risk factors increase the fraud probability.
        if row["TransactionAmount"] > 6000:
            prob += 0.3
        if row["TimeSinceLastTx"] < 3600:
            prob += 0.1
        for flag in ("KnownIP", "KnownDeviceType", "KnownBrowserType", "KnownISP"):
            if row[flag] == 0:
                prob += 0.1

        # Protective factors decrease it.
        if row["OTP"]:
            prob -= 0.3
        if row["AuthenticationLevel"] > 2:
            prob -= 0.2
        elif row["AuthenticationLevel"] > 1:
            prob -= 0.1
        else:
            prob -= 0.05

        # Clamp into a valid probability range.
        prob = min(max(prob, 0), 1)

        labels.append(np.random.choice([0, 1], p=[1 - prob, prob]))
    return labels
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def main():
    """Generate NUM_TRANSACTIONS synthetic rows, relabel fraud, and persist them."""
    # delete_all_transactions() # Commented out to avoid deleting transactions
    # Generate raw rows for randomly chosen users.
    rows = [generate_transaction(user_id=random.choice(USER_IDS)) for _ in range(NUM_TRANSACTIONS)]
    df = pd.DataFrame(rows, columns=RAW_DATA_COLUMNS)

    # Re-derive the fraud label from the engineered features of the same data.
    features_df, _ = prepare_train_data(df.copy())
    df[LABEL_COLUMN] = update_fraud_status(features_df)

    # Persist to the database (CSV export left disabled).
    # df.to_csv("transactions_data.csv", index=False)
    insert_multiple_transactions(df)


if __name__ == "__main__":
    main()
|
data_utils.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
|
| 4 |
+
import joblib
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import requests
|
| 7 |
+
|
| 8 |
+
# Load the trained model and feature names
# NOTE(review): this runs as a module-level side effect — importing this module
# fails outright if "model.pkl" is missing; consider lazy loading.
model, feature_names = joblib.load("model.pkl")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Function to fetch data from final_data.csv based on user input
|
| 13 |
+
def fetch_data(user_id, txn_amount):
    """Return the rows of final_data.csv matching *user_id* and *txn_amount*."""
    table = pd.read_csv("final_data.csv")
    mask = (table["user_id"] == user_id) & (table["TxnAmount($)"] == txn_amount)
    return table[mask]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Define the prediction function
|
| 20 |
+
def predict_fraud(input_data):
    """Score a single-row feature frame with the pre-loaded model.

    Columns are aligned (reordered, missing ones zero-filled) to the feature
    names saved with the model before predicting; returns the scalar label.
    """
    aligned = input_data.reindex(columns=feature_names, fill_value=0)
    return model.predict(aligned)[0]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Function to handle real-time data and make predictions
|
| 29 |
+
def handle_real_time_data(real_time_data):
    """Score every row of *real_time_data* and attach a "Prediction" column.

    Mutates and returns the same DataFrame.
    """
    scores = [predict_fraud(pd.DataFrame([row])) for _, row in real_time_data.iterrows()]
    real_time_data["Prediction"] = scores
    return real_time_data
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def fetch_test_data():
    """Simulate a real-time transaction feed by loading rows from test.csv."""
    return pd.read_csv("test.csv")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Function to determine time of day
|
| 45 |
+
# Function to determine time of day
def time_of_day(hour, minute, second):
    """Map a clock time to a normalized time-of-day slot in [0, 1].

    Slots by hour: 5-10 -> 0, 11-12 -> 1/6, 13-14 -> 1/3, 15-16 -> 1/2,
    17-18 -> 2/3, 19-23 -> 5/6, 0-4 -> 1. Returns the string "Invalid time"
    for an out-of-range hour (kept for backward compatibility).

    *minute* and *second* never affected the result in the original
    implementation (every ``hour == X and minute < 1`` branch returned the
    same value as the following hour-range branch), so they are accepted but
    intentionally unused.
    """
    if 5 <= hour < 11:
        return 0
    if 11 <= hour < 13:
        return 0.166666667
    if 13 <= hour < 15:
        return 0.333333333
    if 15 <= hour < 17:
        return 0.5
    if 17 <= hour < 19:
        return 0.666666667
    if 19 <= hour < 24:
        return 0.833333333
    if 0 <= hour < 5:
        return 1
    return "Invalid time"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def payee_type(payee):
    """Encode a payee-type label as its fixed numeric value.

    Returns the string "Invalid Payee Type" for an unrecognized label.
    """
    encoding = {
        "Individual": 0,
        "Business or corporations": 0.369070247,
        "Financial agency": 0.630929753,
        "Charity": 0.834043767,
        "Educational Institute": 1,
    }
    return encoding.get(payee, "Invalid Payee Type")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Get current location details including latitude and longitude
|
| 86 |
+
def get_location():
    """Fetch (city, region, country, ip, latitude, longitude) from ipinfo.io.

    Best-effort: on any failure (network error, missing response keys) the
    problem is printed and six ``None`` values are returned instead of raising.
    """
    try:
        # Explicit timeout so a stalled network call cannot hang the caller
        # forever (requests has no default timeout).
        res = requests.get("https://ipinfo.io/", timeout=10)
        data = res.json()
        # "loc" is a "lat,lon" string.
        loc = data["loc"].split(",")
        latitude = loc[0]
        longitude = loc[1]
        return data["city"], data["region"], data["country"], data["ip"], latitude, longitude
    except Exception as e:
        # Broad catch is deliberate: location is optional and must never be fatal.
        print("Unable to retrieve location:", e)
        return None, None, None, None, None, None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Function to generate synthetic data
|
| 100 |
+
# Function to generate synthetic data
def generate_synthetic_data(user_id, txn_amount):
    """Build a one-row DataFrame of randomized model features for a transaction.

    Only *user_id* and *txn_amount* come from the caller; every other feature
    is drawn at random, so rows are not internally consistent (e.g. averages
    vs. amounts) — acceptable for a demo feed.
    """
    now = datetime.now()
    record = {
        "user_id": user_id,
        "TxnAmount($)": txn_amount,
        "LagAmount1": random.randint(50, 200),
        "AverageAmount(Last3txns)": random.randint(50, 200),
        "TimeSinceLastTxn": random.randint(1, 60),  # minutes
        "VelocityTxns": random.randint(1, 10),
        "WeekDay": random.choice([0, 1, 2, 3, 4, 5, 6]),
        "IPUsedKnown": random.choice([0, 1]),
        "TimeSlots": time_of_day(now.hour, now.minute, now.second),
        "KnownDeviceBrowser": random.choice([0, 1]),
        "PayeeType": payee_type(
            random.choice(
                ["Individual", "Business or corporations", "Financial agency", "Charity", "Educational Institute"]
            )
        ),
        "Timesincepayeeregn": random.randint(1, 365),  # days
        "KnownLocation": random.choice([0, 1]),
        "OTP": random.choice([0, 1]),
        "AuthenticationLevel": random.choice([0, 1, 2]),
        "Timelimit": random.choice([0, 1]),
        "BrowserKnown": random.choice([0, 1]),
        "location_change": random.choice([0, 1]),
    }
    # Wrap each scalar in a one-element list to form a single-row DataFrame.
    return pd.DataFrame({key: [value] for key, value in record.items()})
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Function to simulate fetching real-time data
|
| 153 |
+
def fetch_real_time_data(user_id, txn_amount):
    """Stand-in for a streaming source: return freshly synthesized feature data."""
    return generate_synthetic_data(user_id, txn_amount)
|
db_utils.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from constants import DB_PATH, DB_NAME, RAW_DATA_COLUMNS, RAW_DATA_COLUMN_TYPES
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_connection():
    """Open the SQLite database (created on first use); return (connection, cursor)."""
    connection = sqlite3.connect(DB_PATH)
    return connection, connection.cursor()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def execute_query(query):
    """Execute a single non-returning SQL statement and commit it.

    The connection is closed even when execution raises (the original leaked
    the connection on any SQL error).
    """
    print(f"Executing query: {repr(query)}")
    db, cursor = get_connection()
    try:
        cursor.execute(query)
        cursor.close()
        # Commit the changes
        db.commit()
    finally:
        db.close()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def create_transaction_table():
    """Create the transactions table (if absent) from the raw-data schema."""
    column_defs = ", ".join(f"{col} {RAW_DATA_COLUMN_TYPES[col]}" for col in RAW_DATA_COLUMNS)
    execute_query(f"CREATE TABLE IF NOT EXISTS {DB_NAME} ({column_defs})")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def insert_multiple_transactions(df):
    """Bulk-append *df* into the transactions table, creating the table first.

    The connection is closed even if the insert raises (the original leaked
    it on failure).
    """
    create_transaction_table()
    db, _ = get_connection()
    try:
        df.to_sql(DB_NAME, db, if_exists="append", index=False)
        db.commit()
    finally:
        db.close()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Function to insert a new user into the users table
|
| 44 |
+
# Function to insert a single transaction row into the transactions table
def insert_single_transaction(transaction):
    """Insert one transaction (a dict of column name -> value).

    Uses a parameterized statement: the original interpolated Python tuple
    reprs into the SQL, which breaks on quotes in values and on one-element
    dicts (trailing comma), and is open to SQL injection.
    """
    # SQLite cannot store datetime objects directly; keep the original
    # behavior of stringifying the timestamp.
    transaction["TransactionTimestamp"] = str(transaction["TransactionTimestamp"])
    columns = ", ".join(transaction.keys())
    placeholders = ", ".join("?" for _ in transaction)
    query = f"INSERT INTO {DB_NAME} ({columns}) VALUES ({placeholders})"
    print(f"Executing query: {repr(query)}")
    db, cursor = get_connection()
    try:
        cursor.execute(query, tuple(transaction.values()))
        cursor.close()
        db.commit()
    finally:
        db.close()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def execute_return_query(query):
    """Execute *query*, commit, and return every fetched row.

    The connection is closed even when execution raises (the original leaked
    the connection on any SQL error).
    """
    db, cursor = get_connection()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        cursor.close()
        # Commit the changes
        db.commit()
    finally:
        db.close()
    return rows
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Function to fetch all users from the users table
|
| 62 |
+
# Function to fetch every row of the transactions table as a DataFrame
def fetch_all_transactions():
    """Return the full transactions table as a pandas DataFrame.

    Fixes the original's unused cursor and closes the connection even when
    the read raises.
    """
    db, _ = get_connection()
    try:
        return pd.read_sql_query(f"SELECT * FROM {DB_NAME}", db)
    finally:
        db.close()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def delete_all_transactions():
    """Remove every row from the transactions table.

    Uses execute_query: DELETE returns no rows, so the row-returning helper
    used originally did a needless fetchall.
    """
    execute_query(f"DELETE FROM {DB_NAME}")


if __name__ == "__main__":
    delete_all_transactions()