File size: 9,274 Bytes
afca08a 1fef517 afca08a 1fef517 afca08a 1fef517 afca08a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Read the raw transaction table from the CSV shipped next to the app.
fraud = pd.read_csv('fraud_test.csv')

# Share of rows, in percent, that the EDA page works on; tune as required.
sample_percentage = 50

# Draw a random subset of that size; the fixed seed keeps the sample
# identical from one run to the next.
data = fraud.sample(random_state=22, frac=sample_percentage / 100)
# def annotate_bar(ax, custom_y_func, font_size=14):
# for p in ax.patches:
# # Calculate annotation
# value = str(round(p.get_height(), 1))
# x = (p.get_x() + p.get_width() / 2) * 0.99
# y = ((p.get_y() + p.get_height() / 2) * 0.99)
# y = custom_y_func(y)
# ax.annotate(
# value,
# (x, y),
# color="black",
# size=font_size, ha='center', va='center'
# )
# (is_fraud flag, display label) pairs, in the left-to-right panel order.
_FRAUD_CLASSES = ((0, 'Not Fraud'), (1, 'Fraud'))

# Display names for gender codes.
# NOTE(review): assumes the dataset encodes gender as 'F'/'M' - confirm.
# Codes missing from this mapping are shown unchanged.
_GENDER_LABELS = {'F': 'Female', 'M': 'Male'}

# Year that ages are measured against (age = _REFERENCE_YEAR - birth year).
_REFERENCE_YEAR = 2020

_TOP10_AMOUNT_NOTES = """
* From the top 10 transaction amount by job we can see `Science Writer` have the most fraud with over 10.000 transaction amount meanwhile `Film/Video editor` are the most non fraud with almost 160.000 transaction
* From the top 10 transaction amount by state we can see `NY` have the most fraud with almost 60.000 transaction amount meanwhile `TX`are the most non fraud with above 1.400.000 transaction
* From the top 10 transaction amount by city we can see `Camden` have the most fraud with over 10.000 transaction amount meanwhile `Meridian` are the most non fraud with almost 100.000 transaction
* From the top 10 transaction amount by merchant we can see `Heathcote, Yost and Kertzmann` have most fraud with almost 10.000 transaction amount meanwhile `Killback-LLC` are the most non fraud with over 80.000 transaction
"""

_TOP10_COUNT_NOTES = """
* From the top 10 transaction by job we can see `Color Technologist` have the most fraud with over 20 transaction meanwhile `Film/Video editor` are the most not fraud with over 2.000 transaction
* From the top 10 transaction by state we can see `NY` have the most fraud with over 80 transaction meanwhile `TX`are the most not fraud with 20.000 transaction
* From the top 10 transaction by city we can see `Camden` have the most fraud over 20 transaction meanwhile `Birmingham` are the most not fraud with almost 1.200 transaction
* From the top 10 transaction by merchant we can see `Healthcore LLC.` have most fraud with 10 transaction meanwhile `Killback LLC.` are the most not fraud with almost 1.000 transaction
"""

_TOTALS_NOTES = """
Based on visualisation above:
* There is 276743 total number of transaction `not fraud` and 1117 `fraud` transaction
* There is 18745296.5 total transaction amount of `not fraud` and 1117 `fraud` transaction
"""

_DISTRIBUTION_NOTES = """
Based on visualisation above we can see:
- There is 54,8% transaction of `female` and 45,2% transaction of `male` in `not fraud` and `fraud`
- Most distribution of `not fraud` transaction by category is from `gas_transport` meanwhile in fraud is from shopping_net
- In distribution transaction by age mostly between 30-40 in `fraud` and between 45-50 for `not fraud`
- By age group mostly `not fraud` transaction is from Adult with 73,9% and `fraud` also from Adult with 74,6%
"""


def _render_figure(fig):
    """Display *fig* on the page, then close it so figures do not pile up."""
    st.pyplot(fig)
    plt.close(fig)


def _annotate_bar(ax, custom_y_func, font_size=14):
    """Write each bar's height (rounded to one decimal) onto the bar.

    ``custom_y_func`` receives the default label height (roughly the bar's
    midpoint) and returns the height actually used, which lets the caller
    move labels of very short bars to a readable position.
    """
    for patch in ax.patches:
        value = str(round(patch.get_height(), 1))
        x = (patch.get_x() + patch.get_width() / 2) * 0.99
        y = (patch.get_y() + patch.get_height() / 2) * 0.99
        y = custom_y_func(y)
        ax.annotate(value, (x, y), color="black", size=font_size,
                    ha='center', va='center')


def _top10_figure(frame, group_col, summarise):
    """Build a two-panel bar chart of the ten largest groups per fraud class.

    ``summarise`` turns the ``groupby(group_col)`` object into a Series
    (for example a sum of amounts or a row count) that is then ranked.
    """
    fig, axes = plt.subplots(1, 2, figsize=(30, 5))
    for panel, (flag, label) in zip(axes, _FRAUD_CLASSES):
        subset = frame[frame['is_fraud'] == flag]
        top = summarise(subset.groupby(group_col)).nlargest(10)
        panel.bar(top.index, top.values, color='#a1c9f4')
        # Tell the two otherwise identical-looking panels apart.
        panel.set_title(label)
        # State labels stay horizontal; every other grouping is rotated.
        panel.tick_params(axis='x', rotation=0 if group_col == 'state' else 90)
    return fig


def _distribution_figure(frame, col):
    """Build a two-panel distribution chart of *col* per fraud class.

    ``gender`` and ``age_group`` are drawn as pies, ``category`` as a
    horizontal count plot ordered by frequency, anything else as a histogram.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    for panel, (flag, label) in zip(axes, _FRAUD_CLASSES):
        subset = frame[frame['is_fraud'] == flag]
        if col in ('gender', 'age_group'):
            counts = subset[col].value_counts()
            if col == 'gender':
                # value_counts() orders by frequency, so the labels must be
                # derived from its index rather than hard-coded.
                labels = [_GENDER_LABELS.get(code, code) for code in counts.index]
            else:
                labels = counts.index
            panel.pie(counts, labels=labels, autopct='%1.1f%%')
        elif col == 'category':
            sns.countplot(data=subset, y=col,
                          order=subset[col].value_counts().index, ax=panel)
            panel.tick_params(axis='x', rotation=90)
        else:
            sns.histplot(data=subset, x=col, ax=panel)
        panel.set_title(label)
    return fig


def _age_group(age):
    """Map an age in years to its age-group label."""
    if age <= 18:
        return 'Teenager'
    if age <= 25:
        return "Young Adult"
    if age <= 64:
        return "Adult"
    return "Elder"


def eda_page():
    """Render the exploratory-data-analysis page.

    Reads the module-level ``data`` frame and draws, in order: the top-10
    groups by transaction amount and by transaction count, the total count
    and amount per fraud class, and the distribution of transactions by
    gender, category, age and age group. Each section is followed by its
    written notes.

    Side effect: converts ``data['dob']`` to datetime and adds the ``age``
    and ``age_group`` columns to ``data``.
    """
    st.title("Eksploratory Data Analysis")
    st.write('Analyze the DataFrame for Better Understanding')

    # (column, capitalised display name) for the top-10 sections.
    group_columns = (('job', 'Job'), ('state', 'State'),
                     ('city', 'City'), ('merchant', 'Merchant'))

    # Top 10 by summed transaction amount.
    st.markdown("<h2><b>Top 10 Transaction Amount</b></h2>", unsafe_allow_html=True)
    for col, _ in group_columns:
        st.subheader(f"Top 10 transaction amount by {col}")
        _render_figure(_top10_figure(data, col, lambda grouped: grouped['amt'].sum()))
    st.markdown(_TOP10_AMOUNT_NOTES)

    # Top 10 by number of transactions.
    st.markdown("<h2><b>Top 10 Transaction Count</b></h2>", unsafe_allow_html=True)
    for col, display_name in group_columns:
        st.subheader(f"Top 10 transaction by {display_name}")
        _render_figure(_top10_figure(data, col, lambda grouped: grouped.size()))
        st.write("")  # Add a blank line
    st.markdown(_TOP10_COUNT_NOTES)

    st.markdown("<h2><b>Total Number and Amount for Fraud and Non Fraud Transaction</b></h2>", unsafe_allow_html=True)

    # Number of transactions per fraud class.
    st.header("Fraud and Not Fraud Transactions Count")
    class_labels = pd.Series(
        np.where(data['is_fraud'] == 1, 'Fraud', 'Not Fraud'), name='is_fraud'
    )
    # Name the columns explicitly: the default names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    data_fraud_count = (
        class_labels.value_counts().rename_axis('is_fraud').reset_index(name='count')
    )
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_count, x='is_fraud', y='count', color='#c6def8', ax=ax)
    # Bars whose midpoint lies below 10000 get their label at y=15000 instead.
    _annotate_bar(ax, lambda y: 15000 if y < 10000 else y, font_size=14)
    ax.set_title("Total number of transaction for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction count")
    ax.tick_params(axis='x', rotation=0)
    _render_figure(fig)

    # Summed transaction amount per fraud class.
    st.header("Fraud and Not Fraud Transactions Amount")
    data_fraud_amount = data.groupby('is_fraud')['amt'].sum().reset_index()
    # Label the classes in the data itself so that the tick text can never
    # drift away from the bar it describes.
    data_fraud_amount['is_fraud'] = np.where(
        data_fraud_amount['is_fraud'] == 1, 'Fraud', 'Not Fraud'
    )
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_amount, x='is_fraud', y='amt', color='#c6def8', ax=ax)
    # Bars whose midpoint lies below 1200000 get their label at y=1900000.
    _annotate_bar(ax, lambda y: 1900000 if y < 1200000 else y, font_size=12)
    ax.set_title("Total transaction amount for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction amount")
    ax.tick_params(axis='x', rotation=0)
    _render_figure(fig)
    st.markdown(_TOTALS_NOTES)

    # Derive age and age group from the date of birth.
    data['dob'] = pd.to_datetime(data['dob'])
    data['age'] = _REFERENCE_YEAR - data['dob'].dt.year
    data['age_group'] = data['age'].apply(_age_group)

    # Overview of the dataset by age, gender, and category.
    st.header("Overview of dataset by Age, gender, and category")
    distribution_columns = (('gender', 'gender'), ('category', 'category'),
                            ('age', 'age'), ('age_group', 'age group'))
    for col, display_name in distribution_columns:
        st.subheader("Distribution of transaction by " + display_name)
        _render_figure(_distribution_figure(data, col))
    st.markdown(_DISTRIBUTION_NOTES)