{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Fraud Detection Project"
],
"metadata": {
"id": "Omy5vXug5k2c"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "eSvjbJp55fYt"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv('/content/synthetic_financial_data.csv')"
],
"metadata": {
"id": "iquO69CD5jZs"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 313
},
"id": "VRiRM6khFoFt",
"outputId": "85c265ec-7cf2-401e-a1da-89b2ee941fba"
},
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" transaction_id customer_id merchant_id amount transaction_time \\\n",
"0 1 1082 2027 5758.59 2023-01-01 00:00:00 \n",
"1 2 1015 2053 1901.56 2023-01-01 00:00:01 \n",
"2 3 1004 2035 1248.86 2023-01-01 00:00:02 \n",
"3 4 1095 2037 7619.05 2023-01-01 00:00:03 \n",
"4 5 1036 2083 1890.10 2023-01-01 00:00:04 \n",
"\n",
" is_fraudulent card_type location purchase_category customer_age \\\n",
"0 0 MasterCard City-30 Gas Station 43 \n",
"1 1 Visa City-47 Online Shopping 61 \n",
"2 1 MasterCard City-6 Gas Station 57 \n",
"3 1 Discover City-6 Travel 59 \n",
"4 1 MasterCard City-34 Retail 36 \n",
"\n",
" transaction_description country \n",
"0 Purchase at Merchant-2027 Slovakia \n",
"1 Purchase at Merchant-2053 Montenegro \n",
"2 Purchase at Merchant-2035 Germany \n",
"3 Purchase at Merchant-2037 Switzerland \n",
"4 Purchase at Merchant-2083 Croatia "
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" transaction_id | \n",
" customer_id | \n",
" merchant_id | \n",
" amount | \n",
" transaction_time | \n",
" is_fraudulent | \n",
" card_type | \n",
" location | \n",
" purchase_category | \n",
" customer_age | \n",
" transaction_description | \n",
" country | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1082 | \n",
" 2027 | \n",
" 5758.59 | \n",
" 2023-01-01 00:00:00 | \n",
" 0 | \n",
" MasterCard | \n",
" City-30 | \n",
" Gas Station | \n",
" 43 | \n",
" Purchase at Merchant-2027 | \n",
" Slovakia | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1015 | \n",
" 2053 | \n",
" 1901.56 | \n",
" 2023-01-01 00:00:01 | \n",
" 1 | \n",
" Visa | \n",
" City-47 | \n",
" Online Shopping | \n",
" 61 | \n",
" Purchase at Merchant-2053 | \n",
" Montenegro | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1004 | \n",
" 2035 | \n",
" 1248.86 | \n",
" 2023-01-01 00:00:02 | \n",
" 1 | \n",
" MasterCard | \n",
" City-6 | \n",
" Gas Station | \n",
" 57 | \n",
" Purchase at Merchant-2035 | \n",
" Germany | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1095 | \n",
" 2037 | \n",
" 7619.05 | \n",
" 2023-01-01 00:00:03 | \n",
" 1 | \n",
" Discover | \n",
" City-6 | \n",
" Travel | \n",
" 59 | \n",
" Purchase at Merchant-2037 | \n",
" Switzerland | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1036 | \n",
" 2083 | \n",
" 1890.10 | \n",
" 2023-01-01 00:00:04 | \n",
" 1 | \n",
" MasterCard | \n",
" City-34 | \n",
" Retail | \n",
" 36 | \n",
" Purchase at Merchant-2083 | \n",
" Croatia | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"transaction_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2886,\n \"min\": 1,\n \"max\": 10000,\n \"num_unique_values\": 10000,\n \"samples\": [\n 6253,\n 4685,\n 1732\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"customer_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28,\n \"min\": 1001,\n \"max\": 1100,\n \"num_unique_values\": 100,\n \"samples\": [\n 1056,\n 1048,\n 1051\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"merchant_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28,\n \"min\": 2001,\n \"max\": 2100,\n \"num_unique_values\": 100,\n \"samples\": [\n 2017,\n 2043,\n 2002\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"amount\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2899.69967496466,\n \"min\": 10.61,\n \"max\": 9999.75,\n \"num_unique_values\": 9945,\n \"samples\": [\n 8305.76,\n 364.11,\n 4334.28\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"transaction_time\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 10000,\n \"samples\": [\n \"2023-01-01 01:44:12\",\n \"2023-01-01 01:18:04\",\n \"2023-01-01 00:28:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_fraudulent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"card_type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Visa\",\n \"American Express\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"City-17\",\n \"City-9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"purchase_category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Gas Station\",\n \"Online Shopping\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"customer_age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 18,\n \"max\": 70,\n \"num_unique_values\": 53,\n \"samples\": [\n 42,\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"transaction_description\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Purchase at Merchant-2017\",\n \"Purchase at Merchant-2043\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"country\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 46,\n \"samples\": [\n \"United Kingdom\",\n \"San Marino\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "markdown",
"source": [
"# Descriptive Analyze"
],
"metadata": {
"id": "JWGP5IIC-S1c"
}
},
{
"cell_type": "code",
"source": [
"# Let's look at columns\n",
"df.columns"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "047bKRicztNc",
"outputId": "3da1c706-a46d-49a5-a994-6a4fa4a27edd"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['transaction_id', 'customer_id', 'merchant_id', 'amount',\n",
" 'transaction_time', 'is_fraudulent', 'card_type', 'location',\n",
" 'purchase_category', 'customer_age', 'transaction_description',\n",
" 'country'],\n",
" dtype='object')"
]
},
"metadata": {},
"execution_count": 38
}
]
},
{
"cell_type": "code",
"source": [
"# Let's look at info of dataset\n",
"df.info()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OZXgdx2s50n2",
"outputId": "c434ffee-19f5-40b6-e3b1-24ea95a4ca22"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 transaction_id 10000 non-null int64 \n",
" 1 customer_id 10000 non-null int64 \n",
" 2 merchant_id 10000 non-null int64 \n",
" 3 amount 10000 non-null float64\n",
" 4 transaction_time 10000 non-null object \n",
" 5 is_fraudulent 10000 non-null int64 \n",
" 6 card_type 10000 non-null object \n",
" 7 location 10000 non-null object \n",
" 8 purchase_category 10000 non-null object \n",
" 9 customer_age 10000 non-null int64 \n",
" 10 transaction_description 10000 non-null object \n",
" 11 country 10000 non-null object \n",
"dtypes: float64(1), int64(5), object(6)\n",
"memory usage: 937.6+ KB\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Let's describe dataset\n",
"df.describe().T"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"id": "k_EhebiD7yuU",
"outputId": "c6c836cb-2ee1-4835-bc61-e1d2cd0fe717"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" count mean std min 25% \\\n",
"transaction_id 10000.0 5000.500000 2886.895680 1.00 2500.750 \n",
"customer_id 10000.0 1051.272300 28.864062 1001.00 1026.000 \n",
"merchant_id 10000.0 2050.486600 28.877801 2001.00 2025.000 \n",
"amount 10000.0 4958.381617 2899.699675 10.61 2438.175 \n",
"is_fraudulent 10000.0 0.506800 0.499979 0.00 0.000 \n",
"customer_age 10000.0 44.047500 15.321707 18.00 31.000 \n",
"\n",
" 50% 75% max \n",
"transaction_id 5000.500 7500.2500 10000.00 \n",
"customer_id 1052.000 1076.0000 1100.00 \n",
"merchant_id 2050.000 2076.0000 2100.00 \n",
"amount 4943.945 7499.3125 9999.75 \n",
"is_fraudulent 1.000 1.0000 1.00 \n",
"customer_age 44.000 57.0000 70.00 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" transaction_id | \n",
" 10000.0 | \n",
" 5000.500000 | \n",
" 2886.895680 | \n",
" 1.00 | \n",
" 2500.750 | \n",
" 5000.500 | \n",
" 7500.2500 | \n",
" 10000.00 | \n",
"
\n",
" \n",
" customer_id | \n",
" 10000.0 | \n",
" 1051.272300 | \n",
" 28.864062 | \n",
" 1001.00 | \n",
" 1026.000 | \n",
" 1052.000 | \n",
" 1076.0000 | \n",
" 1100.00 | \n",
"
\n",
" \n",
" merchant_id | \n",
" 10000.0 | \n",
" 2050.486600 | \n",
" 28.877801 | \n",
" 2001.00 | \n",
" 2025.000 | \n",
" 2050.000 | \n",
" 2076.0000 | \n",
" 2100.00 | \n",
"
\n",
" \n",
" amount | \n",
" 10000.0 | \n",
" 4958.381617 | \n",
" 2899.699675 | \n",
" 10.61 | \n",
" 2438.175 | \n",
" 4943.945 | \n",
" 7499.3125 | \n",
" 9999.75 | \n",
"
\n",
" \n",
" is_fraudulent | \n",
" 10000.0 | \n",
" 0.506800 | \n",
" 0.499979 | \n",
" 0.00 | \n",
" 0.000 | \n",
" 1.000 | \n",
" 1.0000 | \n",
" 1.00 | \n",
"
\n",
" \n",
" customer_age | \n",
" 10000.0 | \n",
" 44.047500 | \n",
" 15.321707 | \n",
" 18.00 | \n",
" 31.000 | \n",
" 44.000 | \n",
" 57.0000 | \n",
" 70.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 10000.0,\n \"max\": 10000.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 10000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2292.500926754255,\n \"min\": 0.5068,\n \"max\": 5000.5,\n \"num_unique_values\": 6,\n \"samples\": [\n 5000.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1484.6379945200913,\n \"min\": 0.4999787574245405,\n \"max\": 2899.69967496466,\n \"num_unique_values\": 6,\n \"samples\": [\n 2886.8956799071675\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"min\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 833.6261800211571,\n \"min\": 0.0,\n \"max\": 2001.0,\n \"num_unique_values\": 6,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"25%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1151.5355183858494,\n \"min\": 0.0,\n \"max\": 2500.75,\n \"num_unique_values\": 6,\n \"samples\": [\n 2500.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"50%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2288.8604755068623,\n \"min\": 1.0,\n \"max\": 5000.5,\n \"num_unique_values\": 6,\n \"samples\": [\n 5000.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"75%\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3541.173410841531,\n \"min\": 1.0,\n \"max\": 7500.25,\n \"num_unique_values\": 6,\n \"samples\": [\n 7500.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"max\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4803.441824402234,\n \"min\": 1.0,\n \"max\": 10000.0,\n \"num_unique_values\": 6,\n \"samples\": [\n 10000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 40
}
]
},
{
"cell_type": "code",
"source": [
"# Let's check null values in dataset\n",
"df.isnull().any()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YFCejyJE81La",
"outputId": "a1a3030c-71bb-4031-be98-70a1a9940950"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"transaction_id False\n",
"customer_id False\n",
"merchant_id False\n",
"amount False\n",
"transaction_time False\n",
"is_fraudulent False\n",
"card_type False\n",
"location False\n",
"purchase_category False\n",
"customer_age False\n",
"transaction_description False\n",
"country False\n",
"dtype: bool"
]
},
"metadata": {},
"execution_count": 41
}
]
},
{
"cell_type": "code",
"source": [
"# Let's look at count of missin values\n",
"df.isnull().sum()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ar5ZmYHD83W0",
"outputId": "e81b104c-42e4-4fb0-df0a-1aa564f9e718"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"transaction_id 0\n",
"customer_id 0\n",
"merchant_id 0\n",
"amount 0\n",
"transaction_time 0\n",
"is_fraudulent 0\n",
"card_type 0\n",
"location 0\n",
"purchase_category 0\n",
"customer_age 0\n",
"transaction_description 0\n",
"country 0\n",
"dtype: int64"
]
},
"metadata": {},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"source": [
"# Let's look at count of columns unique value\n",
"df.nunique()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vZobXWkqW4s2",
"outputId": "6f3d9b9b-af5e-40d5-cdc2-936e7ba93eb5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"transaction_id 10000\n",
"customer_id 100\n",
"merchant_id 100\n",
"amount 9945\n",
"transaction_time 10000\n",
"is_fraudulent 2\n",
"card_type 4\n",
"location 50\n",
"purchase_category 6\n",
"customer_age 53\n",
"transaction_description 100\n",
"country 46\n",
"dtype: int64"
]
},
"metadata": {},
"execution_count": 43
}
]
},
{
"cell_type": "code",
"source": [
"# Let's look at data types of columns\n",
"df.dtypes"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Lf1ec8-286Tb",
"outputId": "e3801f9e-2412-4232-c6fa-3991309bc4ea"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"transaction_id int64\n",
"customer_id int64\n",
"merchant_id int64\n",
"amount float64\n",
"transaction_time object\n",
"is_fraudulent int64\n",
"card_type object\n",
"location object\n",
"purchase_category object\n",
"customer_age int64\n",
"transaction_description object\n",
"country object\n",
"dtype: object"
]
},
"metadata": {},
"execution_count": 44
}
]
},
{
"cell_type": "code",
"source": [
"# Let's select object columns and look at unique value\n",
"for i in df.select_dtypes(include= 'object').columns:\n",
" print(f'Columns: {i} | Values: {df[i].unique()}\\n ------------------------------------')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Taijx5lY8-lT",
"outputId": "8a60db16-ab85-4f59-b10a-092d94d36e58"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Columns: transaction_time | Values: ['2023-01-01 00:00:00' '2023-01-01 00:00:01' '2023-01-01 00:00:02' ...\n",
" '2023-01-01 02:46:37' '2023-01-01 02:46:38' '2023-01-01 02:46:39']\n",
" ------------------------------------\n",
"Columns: card_type | Values: ['MasterCard' 'Visa' 'Discover' 'American Express']\n",
" ------------------------------------\n",
"Columns: location | Values: ['City-30' 'City-47' 'City-6' 'City-34' 'City-1' 'City-7' 'City-46'\n",
" 'City-45' 'City-25' 'City-29' 'City-19' 'City-13' 'City-4' 'City-17'\n",
" 'City-35' 'City-40' 'City-38' 'City-22' 'City-39' 'City-10' 'City-15'\n",
" 'City-44' 'City-27' 'City-37' 'City-50' 'City-18' 'City-31' 'City-14'\n",
" 'City-41' 'City-20' 'City-33' 'City-26' 'City-48' 'City-43' 'City-21'\n",
" 'City-11' 'City-5' 'City-3' 'City-2' 'City-9' 'City-24' 'City-32'\n",
" 'City-49' 'City-28' 'City-16' 'City-23' 'City-12' 'City-42' 'City-36'\n",
" 'City-8']\n",
" ------------------------------------\n",
"Columns: purchase_category | Values: ['Gas Station' 'Online Shopping' 'Travel' 'Retail' 'Groceries'\n",
" 'Restaurant']\n",
" ------------------------------------\n",
"Columns: transaction_description | Values: ['Purchase at Merchant-2027' 'Purchase at Merchant-2053'\n",
" 'Purchase at Merchant-2035' 'Purchase at Merchant-2037'\n",
" 'Purchase at Merchant-2083' 'Purchase at Merchant-2021'\n",
" 'Purchase at Merchant-2033' 'Purchase at Merchant-2088'\n",
" 'Purchase at Merchant-2077' 'Purchase at Merchant-2031'\n",
" 'Purchase at Merchant-2052' 'Purchase at Merchant-2015'\n",
" 'Purchase at Merchant-2020' 'Purchase at Merchant-2025'\n",
" 'Purchase at Merchant-2004' 'Purchase at Merchant-2032'\n",
" 'Purchase at Merchant-2040' 'Purchase at Merchant-2060'\n",
" 'Purchase at Merchant-2063' 'Purchase at Merchant-2072'\n",
" 'Purchase at Merchant-2056' 'Purchase at Merchant-2055'\n",
" 'Purchase at Merchant-2005' 'Purchase at Merchant-2026'\n",
" 'Purchase at Merchant-2018' 'Purchase at Merchant-2034'\n",
" 'Purchase at Merchant-2036' 'Purchase at Merchant-2085'\n",
" 'Purchase at Merchant-2071' 'Purchase at Merchant-2023'\n",
" 'Purchase at Merchant-2006' 'Purchase at Merchant-2093'\n",
" 'Purchase at Merchant-2003' 'Purchase at Merchant-2008'\n",
" 'Purchase at Merchant-2011' 'Purchase at Merchant-2041'\n",
" 'Purchase at Merchant-2078' 'Purchase at Merchant-2039'\n",
" 'Purchase at Merchant-2009' 'Purchase at Merchant-2047'\n",
" 'Purchase at Merchant-2086' 'Purchase at Merchant-2012'\n",
" 'Purchase at Merchant-2051' 'Purchase at Merchant-2074'\n",
" 'Purchase at Merchant-2095' 'Purchase at Merchant-2065'\n",
" 'Purchase at Merchant-2016' 'Purchase at Merchant-2067'\n",
" 'Purchase at Merchant-2075' 'Purchase at Merchant-2100'\n",
" 'Purchase at Merchant-2068' 'Purchase at Merchant-2064'\n",
" 'Purchase at Merchant-2099' 'Purchase at Merchant-2043'\n",
" 'Purchase at Merchant-2050' 'Purchase at Merchant-2096'\n",
" 'Purchase at Merchant-2028' 'Purchase at Merchant-2019'\n",
" 'Purchase at Merchant-2014' 'Purchase at Merchant-2057'\n",
" 'Purchase at Merchant-2076' 'Purchase at Merchant-2069'\n",
" 'Purchase at Merchant-2048' 'Purchase at Merchant-2090'\n",
" 'Purchase at Merchant-2070' 'Purchase at Merchant-2061'\n",
" 'Purchase at Merchant-2092' 'Purchase at Merchant-2007'\n",
" 'Purchase at Merchant-2089' 'Purchase at Merchant-2082'\n",
" 'Purchase at Merchant-2002' 'Purchase at Merchant-2097'\n",
" 'Purchase at Merchant-2081' 'Purchase at Merchant-2013'\n",
" 'Purchase at Merchant-2046' 'Purchase at Merchant-2054'\n",
" 'Purchase at Merchant-2094' 'Purchase at Merchant-2042'\n",
" 'Purchase at Merchant-2059' 'Purchase at Merchant-2098'\n",
" 'Purchase at Merchant-2024' 'Purchase at Merchant-2001'\n",
" 'Purchase at Merchant-2084' 'Purchase at Merchant-2017'\n",
" 'Purchase at Merchant-2058' 'Purchase at Merchant-2062'\n",
" 'Purchase at Merchant-2049' 'Purchase at Merchant-2029'\n",
" 'Purchase at Merchant-2079' 'Purchase at Merchant-2030'\n",
" 'Purchase at Merchant-2022' 'Purchase at Merchant-2080'\n",
" 'Purchase at Merchant-2073' 'Purchase at Merchant-2087'\n",
" 'Purchase at Merchant-2038' 'Purchase at Merchant-2091'\n",
" 'Purchase at Merchant-2010' 'Purchase at Merchant-2044'\n",
" 'Purchase at Merchant-2045' 'Purchase at Merchant-2066']\n",
" ------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Let's exclude object and look at unique values\n",
"for i in df.select_dtypes(exclude= 'object').columns:\n",
" print(f'Columns: {i} | Values: {df[i].unique()} | Max: {df[i].max()} | Min: {df[i].min()}\\n ------------------------------------')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3oupfTUq9mNE",
"outputId": "bda749b0-856e-482c-a4a5-b38846b565da"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Columns: transaction_id | Values: [ 1 2 3 ... 9998 9999 10000] | Max: 10000 | Min: 1\n",
" ------------------------------------\n",
"Columns: customer_id | Values: [1082 1015 1004 1095 1036 1032 1029 1018 1014 1087 1070 1012 1076 1055\n",
" 1005 1028 1030 1065 1078 1072 1026 1092 1084 1090 1054 1058 1001 1098\n",
" 1021 1044 1020 1049 1013 1046 1045 1034 1006 1094 1059 1069 1016 1011\n",
" 1071 1038 1081 1080 1047 1074 1025 1091 1009 1085 1099 1048 1027 1086\n",
" 1035 1088 1083 1010 1022 1060 1089 1042 1100 1008 1041 1052 1073 1064\n",
" 1051 1019 1096 1075 1066 1097 1007 1077 1050 1068 1033 1002 1093 1056\n",
" 1023 1039 1063 1003 1040 1031 1017 1061 1057 1067 1043 1053 1024 1062\n",
" 1037 1079] | Max: 1100 | Min: 1001\n",
" ------------------------------------\n",
"Columns: merchant_id | Values: [2027 2053 2035 2037 2083 2021 2033 2088 2077 2031 2052 2015 2020 2025\n",
" 2004 2032 2040 2060 2063 2072 2056 2055 2005 2026 2018 2034 2036 2085\n",
" 2071 2023 2006 2093 2003 2008 2011 2041 2078 2039 2009 2047 2086 2012\n",
" 2051 2074 2095 2065 2016 2067 2075 2100 2068 2064 2099 2043 2050 2096\n",
" 2028 2019 2014 2057 2076 2069 2048 2090 2070 2061 2092 2007 2089 2082\n",
" 2002 2097 2081 2013 2046 2054 2094 2042 2059 2098 2024 2001 2084 2017\n",
" 2058 2062 2049 2029 2079 2030 2022 2080 2073 2087 2038 2091 2010 2044\n",
" 2045 2066] | Max: 2100 | Min: 2001\n",
" ------------------------------------\n",
"Columns: amount | Values: [5758.59 1901.56 1248.86 ... 6333.64 2837.13 7209.43] | Max: 9999.75 | Min: 10.61\n",
" ------------------------------------\n",
"Columns: is_fraudulent | Values: [0 1] | Max: 1 | Min: 0\n",
" ------------------------------------\n",
"Columns: customer_age | Values: [43 61 57 59 36 19 40 55 70 27 46 34 41 64 62 44 60 51 28 42 23 45 65 24\n",
" 30 35 32 58 66 22 39 53 21 29 38 56 48 37 33 47 18 67 63 31 68 49 50 52\n",
" 26 54 20 25 69] | Max: 70 | Min: 18\n",
" ------------------------------------\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Data Preprocessing For Visualization"
],
"metadata": {
"id": "XSv32Xms-tZL"
}
},
{
"cell_type": "code",
"source": [
"# Let's divide transaction_time to date and time\n",
"df[['transaction_date', 'transaction_time']] = df['transaction_time'].str.split(' ', expand=True)"
],
"metadata": {
"id": "cQvrHdTh1Unr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's create new column like location_id\n",
"df['location_id'] = df['location'].str.split('-', expand=True)[1]"
],
"metadata": {
"id": "2sjGv05f1WAM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's create new age group column\n",
"import numpy as np\n",
"\n",
"# Let's calculate quartiles\n",
"q1 = int(np.percentile(df['customer_age'], 25))\n",
"q2 = int(np.percentile(df['customer_age'], 50))\n",
"q3 = int(np.percentile(df['customer_age'], 75))\n",
"\n",
"def group(x):\n",
" if x < q1:\n",
" return f'-{q1}'\n",
" elif q1 <= x < q2:\n",
" return f'{q1}-{q2}'\n",
" elif q2 <= x < q3:\n",
" return f'{q2}-{q3}'\n",
" else:\n",
" return f'{q3}+'\n",
"\n",
"df['Age_group'] = df['customer_age'].apply(lambda x: group(x))"
],
"metadata": {
"id": "43kI9CHg1V-L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Exploratory Data Analysis"
],
"metadata": {
"id": "9H5oMvCA6iTi"
}
},
{
"cell_type": "code",
"source": [
"# Let's look at top n customer which do more transaction\n",
"import matplotlib.pyplot as plt\n",
"\n",
"transaction_counts = df['customer_id'].value_counts().reset_index()\n",
"transaction_counts.columns = ['customer_id', 'transaction_count']\n",
"\n",
"top_n = int(input('Please enter (top) n number you want to see top n customer:'))\n",
"top_customers = transaction_counts.head(top_n)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"bars = plt.barh(top_customers['customer_id'].astype(str), top_customers['transaction_count'], color='skyblue')\n",
"plt.xlabel('Transaction Count')\n",
"plt.ylabel('Customer ID')\n",
"plt.title(f'Top {top_n} Customers by Number of Transactions')\n",
"plt.gca().invert_yaxis()\n",
"plt.grid(axis='x', linestyle='--', alpha=0.7)\n",
"\n",
"for bar in bars:\n",
" plt.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2, f'{bar.get_width():.0f}', va='center')\n",
"\n",
"plt.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "XxDsFDrz1V5k",
"outputId": "9b89eeb9-310a-4581-818b-717c91464d68"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Please enter (top) n number you want to see top n customer:12\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"