Reaumur committed on
Commit
afca08a
·
verified ·
1 Parent(s): 9aae410

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. XGB_best_model.pkl +3 -0
  3. app.py +35 -0
  4. eda.py +183 -0
  5. fraud_test.csv +3 -0
  6. prediction.py +153 -0
  7. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fraud_test.csv filter=lfs diff=lfs merge=lfs -text
XGB_best_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d0f429f5bcb9ed9f8da34547feee4b3f0e491048606dce13a3caa7c1243613e
3
+ size 89072
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pickle
5
+
6
+ from eda import eda_page
7
+ from prediction import model_page
8
+
9
# Load the dataset that is previewed on the landing page.
data = pd.read_csv("fraud_test.csv")

st.header('Milestone 2')
st.write("""
Created by Reski Hidayat - HCK015 """)

st.write("This program is made to predict Credit Card Fraud using Model Classification.")
st.write("Dataset `fraud_test`")
# Render the frame explicitly instead of relying on Streamlit "magic" (a bare
# `data` expression statement): magic output disappears when the
# `runner.magicEnabled` option is switched off, and a bare name reads like a
# leftover no-op to anyone unfamiliar with that feature.
st.write(data)
19
+
20
def main():
    """Show the sidebar menu and render whichever page the user selected."""
    # Sidebar label -> function that draws the corresponding page.
    pages = {
        "Data Analysis": eda_page,
        "Model Prediction": model_page,
    }

    # The radio widget lists the labels in the order they were declared above.
    choice = st.sidebar.radio("Menu", list(pages))

    render_page = pages.get(choice)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()
eda.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import pandas as pd
5
+ import seaborn as sns
6
+
7
+ # Load data from a CSV file
8
+ data = pd.read_csv('fraud_test.csv')
9
+
10
+ # def annotate_bar(ax, custom_y_func, font_size=14):
11
+
12
+ # for p in ax.patches:
13
+ # # Calculate annotation
14
+ # value = str(round(p.get_height(), 1))
15
+ # x = (p.get_x() + p.get_width() / 2) * 0.99
16
+ # y = ((p.get_y() + p.get_height() / 2) * 0.99)
17
+
18
+ # y = custom_y_func(y)
19
+ # ax.annotate(
20
+ # value,
21
+ # (x, y),
22
+ # color="black",
23
+ # size=font_size, ha='center', va='center'
24
+ # )
25
+
26
def _show_figure(fig):
    """Render *fig* on the Streamlit page and then release it."""
    st.pyplot(fig)
    # pyplot keeps every figure it created alive until it is closed; without
    # this call each rerun of the page would leave more figures in memory.
    plt.close(fig)


def _annotate_bar(ax, custom_y_func, font_size=14):
    """Write each bar's height (rounded to 1 decimal) as a label on the bar.

    custom_y_func receives the default label height (roughly the middle of
    the bar) and returns the height actually used, so callers can lift
    labels off bars that are too short to hold them.
    """
    for p in ax.patches:
        value = str(round(p.get_height(), 1))
        x = (p.get_x() + p.get_width() / 2) * 0.99
        y = (p.get_y() + p.get_height() / 2) * 0.99
        y = custom_y_func(y)
        ax.annotate(value, (x, y), color="black", size=font_size, ha='center', va='center')


def _age_group(age):
    """Map an age in years to one of four coarse age-group labels."""
    if age <= 18:
        return 'Teenager'
    if age <= 25:
        return "Young Adult"
    if age <= 64:
        return "Adult"
    return "Elder"


def _plot_top10_by_fraud(col, axis_label, y_label, aggregate):
    """Draw two bar charts (legit | fraud) of the 10 largest groups of *col*.

    aggregate takes the ``groupby(col)`` object of one subset and returns a
    Series indexed by group; its 10 largest entries are plotted.
    """
    fig, axes = plt.subplots(1, 2, figsize=(30, 5))
    # The position of each title doubles as the is_fraud flag (0 = legit).
    for flag, title in enumerate(('Not Fraud', 'Fraud')):
        subset = data[data['is_fraud'] == flag]
        top = aggregate(subset.groupby(col)).nlargest(10)
        axes[flag].bar(top.index, top.values, color='#a1c9f4')
        axes[flag].set_title(title)
        axes[flag].set_xlabel(axis_label)
        axes[flag].set_ylabel(y_label)
        # State labels stay horizontal; every other column gets vertical labels.
        axes[flag].tick_params(axis='x', rotation=0 if col == 'state' else 90)
    _show_figure(fig)


def eda_page():
    """Render the exploratory-data-analysis page.

    Sections, in order: top-10 transaction amount and top-10 transaction count
    per job/state/city/merchant, total count and amount for fraud vs. not
    fraud, and the distribution of transactions by gender, category, age and
    age group. Each section ends with a fixed explanatory text.

    Side effect: converts ``data['dob']`` to datetime and adds the ``age`` and
    ``age_group`` columns to the module-level ``data`` frame.
    """
    st.title("Eksploratory Data Analysis")
    st.write('Analyze the DataFrame for Better Understanding')
    st.markdown("<h2><b>Top 10 Transaction Amount</b></h2>", unsafe_allow_html=True)

    columns = ['job', 'state', 'city', 'merchant']

    # Top 10 groups by summed transaction amount.
    def total_amount(grouped):
        return grouped['amt'].sum()

    for col in columns:
        st.subheader(f"Top 10 transaction amount by {col}")
        _plot_top10_by_fraud(col, col, 'Amount', total_amount)
    st.write("**Explanation**:")
    markdown_text = """
    * From the top 10 transaction amount by job we can see `Therapist` have the most fraud with almost 4000 transaction amount meanwhile `Film/Video editor` are the most non fraud with 30.000 transaction
    * From the top 10 transaction amount by state we can see `NY` have the most fraud with 10.000 transaction amount meanwhile `TX`are the most non fraud with above 250.000 transaction
    * From the top 10 transaction amount by city we can see `Camden` have the most fraud with 3500 transaction amount meanwhile `Naples` are the most non fraud with 250.000 transaction
    * From the top 10 transaction amount by merchant we can see `Commier` have most fraud with 3000 transaction amount meanwhile `Corwin-Romaguera` are the most non fraud with almost 250.000 transaction
    """
    st.markdown(markdown_text)

    st.markdown("<h2><b>Top 10 Transaction Count</b></h2>", unsafe_allow_html=True)
    columns_name = ['Job', 'State', 'City', 'Merchant']

    # Top 10 groups by number of transactions.
    def transaction_count(grouped):
        return grouped.size()

    # Global seaborn palette; the bars below use an explicit colour, so this
    # only influences the seaborn plots drawn further down the page.
    sns.set_palette("pastel")
    for col, name in zip(columns, columns_name):
        st.subheader(f"Top 10 transaction by {name}")
        _plot_top10_by_fraud(col, name, 'Count', transaction_count)

        st.write("")  # Add a blank line

    st.write("**Explanation**:")
    markdown_text = """
    * From the top 10 transaction by job we can see `Color Technologist` have the most fraud with above 20 transaction meanwhile `Film/Video editor` are the most non fraud with 2.000 transaction
    * From the top 10 transaction by state we can see `NY` have the most fraud with above 80 transaction meanwhile `TX`are the most non fraud with 20.000 transaction
    * From the top 10 transaction by city we can see `Camden` have the most fraud above 20 transaction meanwhile `Birmingham` are the most non fraud with almost 1.200 transaction
    * From the top 10 transaction by merchant we can see `Healthcore LLC.` have most fraud with 10 transaction meanwhile `Killback LLC.` are the most non fraud with almost 1.000 transaction
    """
    st.markdown(markdown_text)

    st.markdown("<h2><b>Total Number and Amount for Fraud and Non Fraud Transaction</b></h2>", unsafe_allow_html=True)

    # Fraud and Not Fraud Transactions
    st.header("Fraud and Not Fraud Transactions Count")
    fraud_labels = data['is_fraud'].apply(lambda x: "Fraud" if x == 1 else 'Not Fraud')
    # Name both columns explicitly: a bare value_counts().reset_index() yields
    # different column names depending on the pandas version, and the plot
    # below needs exactly 'is_fraud' and 'count'.
    data_fraud_count = (
        fraud_labels.value_counts()
        .rename_axis('is_fraud')
        .reset_index(name='count')
    )
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_count, x='is_fraud', y='count', color='#c6def8', ax=ax)
    # Bars whose label would sit below 10000 get it drawn at 15000 instead.
    _annotate_bar(ax, lambda y: 15000 if y < 10000 else y, font_size=14)
    ax.set_title("Total number of transaction for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction count")
    ax.tick_params(axis='x', rotation=0)
    _show_figure(fig)

    # Fraud and Not Fraud Amount
    st.header("Fraud and Not Fraud Transactions Amount")
    data_fraud_amount = data.groupby('is_fraud')['amt'].sum().reset_index()
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(data=data_fraud_amount, x='is_fraud', y='amt', color='#c6def8', ax=ax)
    _annotate_bar(ax, lambda y: 1900000 if y < 1200000 else y, font_size=12)
    ax.set_title("Total transaction amount for fraud and not fraud transaction", fontsize=12, fontweight='bold')
    ax.set_ylabel("Transaction amount")
    # groupby sorts its keys, so the bars are in 0, 1 order.
    ax.set_xticklabels(['Not Fraud', 'Fraud'], rotation=0)
    _show_figure(fig)

    st.write("**Explanation**:")
    markdown_text = """
    Based on visualisation above:
    * There is 276743 total number of transaction `not fraud` and 1117 `fraud` transaction
    * There is 18745296.5 total transaction amount of `not fraud` and 1117 `fraud` transaction
    """
    st.markdown(markdown_text)

    # Calculate age as of the fixed reference year 2020.
    data['dob'] = pd.to_datetime(data['dob'])
    data['age'] = (2020 - data['dob'].dt.year)
    data['age_group'] = data['age'].apply(_age_group)

    # Overview of dataset by gender, category, age and age group
    st.header("Overview of dataset by Age, gender, and category")
    columns = ['gender', 'category', 'age', 'age_group']
    columns_name = ['gender', 'category', 'age', 'age group']
    # Presumably gender is coded 'F'/'M' (verify against the CSV); any other
    # code is shown unchanged.
    gender_names = {'F': 'Female', 'M': 'Male'}

    for col, label in zip(columns, columns_name):
        st.subheader("Distribution of transaction by " + label)
        fig, ax = plt.subplots(1, 2, figsize=(15, 5))  # Create a subplot with 2 columns
        for flag, title in enumerate(('Not Fraud', 'Fraud')):
            subset = data[data['is_fraud'] == flag]
            if col == 'gender':
                counts = subset[col].value_counts()
                # value_counts orders by frequency, so the slice labels must be
                # derived from its index rather than assumed to be Female, Male.
                slice_labels = [gender_names.get(code, code) for code in counts.index]
                ax[flag].pie(counts, labels=slice_labels, autopct='%1.1f%%')
            elif col == 'age_group':
                counts = subset[col].value_counts()
                ax[flag].pie(counts, labels=counts.index, autopct='%1.1f%%')
            elif col == 'category':
                sns.countplot(data=subset, y=col, order=subset[col].value_counts().index, ax=ax[flag])
            else:
                sns.histplot(data=subset, x=col, ax=ax[flag])
            ax[flag].set_title(title)
            ax[flag].set_xlabel(label)
            if col == 'category':
                ax[flag].tick_params(axis='x', rotation=90)
        _show_figure(fig)

    st.write("**Explanation**:")
    markdown_text = """
    Based on visualisation above we can see:
    - There is 54,8% transaction of `female` and 45,2% transaction of `male` in `not fraud` and `fraud`
    - Most distribution of `not fraud` transaction by category is from `gas_transport` meanwhile in fraud is from shopping_net
    - In distribution transaction by age mostly between 30-40 in `fraud` and between 45-50 for `not fraud`
    - By age group mostly `not fraud` transaction is from Adult with 73,9% and `fraud` also from Adult with 74,6%
    """
    st.markdown(markdown_text)
fraud_test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86a070405882d0414853dc0d2879451ded709d7327f6630ed4b39b5167ca815a
3
+ size 143639688
prediction.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pickle
5
+
6
+ # Library Random Data
7
+ from scipy.stats import randint
8
+
9
+ from datetime import datetime, timedelta
10
+ from sklearn.utils import shuffle
11
+
12
def model_page():
    """Render the prediction page.

    Collects one row of features from the sidebar via ``user_input()``, echoes
    it back, loads the pickled model from ``XGB_best_model.pkl`` and reports
    whether the model labels the transaction as fraud (label 1) or legit.
    """
    # Title corrected from "Fault": the page is about fraud detection.
    st.title("Model Prediction of Credit Card Fraud")
    st.write("The model predicts whether the customer's transaction is fraud or not")
    st.sidebar.header('User Input Features')

    input_data = user_input()

    st.subheader('User Input')
    st.write(input_data)

    # NOTE(security): unpickling executes arbitrary code, so this file must
    # only ever be a trusted artefact shipped with the app.
    with open("XGB_best_model.pkl", "rb") as f:
        load_model = pickle.load(f)

    prediction = load_model.predict(input_data)

    # predict() presumably returns an array with one label per input row.
    # Comparing the whole array with 1 inside `if` only works for exactly one
    # row, so pull out the first label explicitly.
    label = np.asarray(prediction).ravel()[0]
    if label == 1:
        prediction = 'The Transaction is Fraud'
    else:
        prediction = 'The Transaction is Legit'

    st.write('Based on user input, the model predicted: ')
    st.write(prediction)
35
+
36
def user_input(num_rows=1):
    """Return the sidebar inputs as a DataFrame of *num_rows* rows.

    Thin wrapper around ``generate_data`` whose default asks for a single row.
    """
    return generate_data(num_rows)
39
+
40
def generate_data(num_rows=555719):
    """Build a DataFrame of *num_rows* identical rows from sidebar widgets.

    One widget is drawn in the sidebar per transaction field. Every column
    holds the widget's value repeated *num_rows* times, except ``Trans_num``,
    which counts 1..num_rows.

    The five trailing lowercase columns (``age``, ``category``, ``amt``,
    ``state``, ``job``) repeat values that also appear under capitalised
    names. Presumably they are the feature names the trained model expects;
    confirm before removing either set.
    """
    # Read the clock once so that each default and its bounds come from the
    # same instant instead of from separate datetime.now() calls.
    now = datetime.now()
    now_ts = int(now.timestamp())
    sidebar = st.sidebar

    def repeat(value):
        """Return a list holding *value* once per requested row."""
        return [value] * num_rows

    # Widgets are created in the order they should appear in the sidebar.
    trans_date_trans_time = sidebar.date_input(
        "Transaction Date", value=now, min_value=now - timedelta(days=365), max_value=now
    )
    cc_num = sidebar.number_input("Credit Card Number", value=500000, min_value=100000, max_value=999999)
    merchant = sidebar.selectbox("Merchant", ['Merchant1', 'Merchant2', 'Merchant3'])
    category = sidebar.selectbox("Category", ['Personal', 'Childcare', 'Food', 'Transportation'])
    amt = sidebar.number_input("Amount", value=500, min_value=0, max_value=1000)
    first = sidebar.text_input("First Name")
    last = sidebar.text_input("Last Name")
    gender = sidebar.selectbox("Gender", ['Male', 'Female'])
    street = sidebar.text_input("Street")
    city = sidebar.text_input("City")
    state = sidebar.selectbox("State", ['NY', 'CA', 'IL', 'TX'])
    zip_code = sidebar.text_input("Zip Code")
    lat = sidebar.number_input("Latitude", value=40.7128, min_value=-90., max_value=90.)
    long_ = sidebar.number_input("Longitude", value=-74.0060, min_value=-180., max_value=180.)
    city_pop = sidebar.number_input("City Population", value=10000, min_value=10000, max_value=1000000)
    job = sidebar.selectbox("Job", ['Software Engineer', 'Doctor', 'Lawyer', 'Teacher'])
    dob = sidebar.date_input(
        "Date of Birth",
        value=now - timedelta(days=365*70),
        min_value=now - timedelta(days=365*100),
        max_value=now,
    )
    unix_time = sidebar.number_input("Unix Time", value=now_ts, min_value=0, max_value=now_ts)
    merch_lat = sidebar.number_input("Merchant Latitude", value=40.7128, min_value=-90., max_value=90.)
    merch_long = sidebar.number_input("Merchant Longitude", value=-74.0060, min_value=-180., max_value=180.)
    age = sidebar.number_input("Age", value=30, min_value=18, max_value=80)

    data = {
        'Trans_date_trans_time': repeat(trans_date_trans_time),
        'Cc_num': repeat(cc_num),
        'Merchant': repeat(merchant),
        'Category': repeat(category),
        'Amt': repeat(amt),
        'First': repeat(first),
        'Last': repeat(last),
        'Gender': repeat(gender),
        'Street': repeat(street),
        'City': repeat(city),
        'State': repeat(state),
        'Zip': repeat(zip_code),
        'Lat': repeat(lat),
        'Long': repeat(long_),
        'City_pop': repeat(city_pop),
        'Job': repeat(job),
        'Dob': repeat(dob),
        # Running transaction number, 1-based.
        'Trans_num': np.arange(1, num_rows + 1),
        'Unix_time': repeat(unix_time),
        'Merch_lat': repeat(merch_lat),
        'Merch_long': repeat(merch_long),
        'age': repeat(age),
        'category': repeat(category),
        'amt': repeat(amt),
        'state': repeat(state),
        'job': repeat(job),
    }

    # Create a Pandas DataFrame
    df = pd.DataFrame(data)

    return df
141
+
142
+ # def main():
143
+ # st.title("Credit Card Transaction Data")
144
+ # st.write("This app generates random credit card transaction data.")
145
+
146
+ # num_rows = st.slider("Number of rows", 100, 100000, 555719)
147
+
148
+ # df = generate_data(num_rows)
149
+
150
+ # st.write(df)
151
+
152
+ # if __name__ == "__main__":
153
+ # main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ scikit-learn
2
+ pandas
3
+ matplotlib
4
+ xgboost
5
+ transformers
6
+ seaborn