Spaces:

Reaumur
/

Credit-Card-Fraud-Detection

Sleeping

App Files Files Community

Reaumur committed on May 8, 2024

Commit

afca08a

verified ·

1 Parent(s): 9aae410

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
XGB_best_model.pkl +3 -0
app.py +35 -0
eda.py +183 -0
fraud_test.csv +3 -0
prediction.py +153 -0
requirements.txt +6 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+fraud_test.csv filter=lfs diff=lfs merge=lfs -text

XGB_best_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d0f429f5bcb9ed9f8da34547feee4b3f0e491048606dce13a3caa7c1243613e
+size 89072

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import pickle
+from eda import eda_page
+from prediction import model_page
+# Load the transaction dataset; the path is relative to the working directory.
+data = pd.read_csv("fraud_test.csv")
+# Page header and short description; these calls run before main() draws the selected page.
+st.header('Milestone 2')
+st.write("""
+Created by Reski Hidayat - HCK015 """)
+st.write("This program is made to predict Credit Card Fraud using Model Classification.")
+st.write("Dataset `fraud_test`")
+# Bare expression statement: presumably rendered by Streamlit's "magic" display of
+# standalone expressions - TODO confirm that feature is enabled for this app.
+data
+def main():
+    # Define menu options
+    menu_options = ["Data Analysis", "Model Prediction"]
+    # Create sidebar menu
+    selected_option = st.sidebar.radio("Menu", menu_options)
+    # Display selected page
+    if selected_option == "Data Analysis":
+        eda_page()
+    elif selected_option == "Model Prediction":
+        model_page()
+if __name__ == "__main__":
+    main()

eda.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import streamlit as st
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+# Load the transaction data from a CSV file in the working directory.
+# The resulting module-level DataFrame is what eda_page() reads when it draws its charts.
+data = pd.read_csv('fraud_test.csv')
+# def annotate_bar(ax, custom_y_func, font_size=14):
+#     for p in ax.patches:
+#         # Calculate annotation
+#         value = str(round(p.get_height(), 1))
+#         x = (p.get_x() + p.get_width() / 2) * 0.99
+#         y = ((p.get_y() + p.get_height() / 2) * 0.99)
+#         y = custom_y_func(y)
+#         ax.annotate(
+#             value,
+#             (x, y),
+#             color="black",
+#             size=font_size, ha='center', va='center'
+#         )
+def eda_page():
+    st.title("Eksploratory Data Analysis")
+    st.write('Analyze the DataFrame for Better Understanding')
+    st.markdown("<h2><b>Top 10 Transaction Amount</b></h2>", unsafe_allow_html=True)
+    # TOP Transaction Amount
+    columns = ['job', 'state', 'city', 'merchant']
+    fraud_labels = ['Not Fraud', 'Fraud']
+    for col in columns:
+        st.subheader(f"Top 10 transaction amount by {col}")
+        fig, ax = plt.subplots(1, 2, figsize=(30, 5))
+        for i, fraud_label in enumerate(fraud_labels):
+            temp_data = data[data['is_fraud'] == (0 if fraud_label == 'Not Fraud' else 1)]
+            top = temp_data.groupby(col)['amt'].sum().nlargest(10)
+            ax[i].bar(top.index, top.values, color='#a1c9f4')
+            ax[i].set_title(fraud_label)
+            ax[i].set_xlabel(col)
+            ax[i].set_ylabel('Amount')
+            if col == 'state':
+                ax[i].tick_params(axis='x', rotation=0)
+            else:
+                ax[i].tick_params(axis='x', rotation=90)
+        st.pyplot(fig)
+    st.write("**Explanation**:")
+    markdown_text = """
+    * From the top 10 transaction amount by job we can see `Therapist` have the most fraud with almost 4000 transaction amount meanwhile `Film/Video editor` are the most non fraud with 30.000 transaction
+    * From the top 10 transaction amount by state we can see `NY` have the most fraud with 10.000 transaction amount meanwhile `TX`are the most non fraud with above 250.000 transaction
+    * From the top 10 transaction amount by city we can see `Camden` have the most fraud with 3500 transaction amount meanwhile `Naples` are the most non fraud with 250.000 transaction
+    * From the top 10 transaction amount by merchant we can see `Commier` have most fraud with 3000 transaction amount meanwhile `Corwin-Romaguera` are the most non fraud with almost 250.000 transaction
+    """
+    st.markdown(markdown_text)
+    st.markdown("<h2><b>Top 10 Transaction Count</b></h2>", unsafe_allow_html=True)
+    # By Transaction count
+    columns = ['job', 'state', 'city', 'merchant']
+    columns_name = ['Job', 'State', 'City', 'Merchant']
+    fraud = ['Not Fraud', 'Fraud']
+    for col, name in zip(columns, columns_name):
+        st.subheader(f"Top 10 transaction by {name}")
+        fig, ax = plt.subplots(1, 2, figsize=(30, 5))
+        sns.set_palette("pastel")
+        for i, fraud_label in enumerate(fraud):
+            temp_data = data[data['is_fraud'] == (0 if fraud_label == 'Not Fraud' else 1)]
+            top = temp_data.groupby(col).size().nlargest(10)
+            ax[i].bar(top.index, top.values, color='#a1c9f4')
+            ax[i].set_title(fraud_label)
+            ax[i].set_xlabel(name)
+            ax[i].set_ylabel('Count')
+            if col == 'state':
+                ax[i].tick_params(axis='x', rotation=0)
+            else:
+                ax[i].tick_params(axis='x', rotation=90)
+        st.pyplot(fig)
+    st.write("")  # Add a blank line
+    st.write("**Explanation**:")
+    markdown_text = """
+    * From the top 10 transaction by job we can see `Color Technologist` have the most fraud with above 20 transaction meanwhile `Film/Video editor` are the most non fraud with 2.000 transaction
+    * From the top 10 transaction by state we can see `NY` have the most fraud with above 80 transaction meanwhile `TX`are the most non fraud with 20.000 transaction
+    * From the top 10 transaction by city we can see `Camden` have the most fraud above 20 transaction meanwhile `Birmingham` are the most non fraud with almost 1.200 transaction
+    * From the top 10 transaction by merchant we can see `Healthcore LLC.` have most fraud with 10 transaction meanwhile `Killback LLC.` are the most non fraud with almost 1.000 transaction
+    """
+    st.markdown(markdown_text)
+    st.markdown("<h2><b>Total Number and Amount for Fraud and Non Fraud Transaction</b></h2>", unsafe_allow_html=True)
+    def annotate_bar(ax, custom_y_func, font_size=14):
+        for p in ax.patches:
+            value = str(round(p.get_height(), 1))
+            x = (p.get_x() + p.get_width() / 2) * 0.99
+            y = ((p.get_y() + p.get_height() / 2) * 0.99)
+            y = custom_y_func(y)
+            ax.annotate(value, (x, y), color="black", size=font_size, ha='center', va='center')
+    # Fraud and Not Fraud Transactions
+    st.header("Fraud and Not Fraud Transactions Count")
+    data_fraud_count = data['is_fraud'].apply(lambda x: "Fraud" if x == 1 else 'Not Fraud').value_counts().reset_index()
+    fig, ax = plt.subplots(figsize=(15, 5))
+    sns.barplot(data=data_fraud_count, x='is_fraud', y='count', color='#c6def8', ax=ax)
+    annotate_bar(ax, lambda y: 15000 if y < 10000 else y, font_size=14)
+    ax.set_title("Total number of transaction for fraud and not fraud transaction", fontsize=12, fontweight='bold')
+    ax.set_ylabel("Transaction count")
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
+    st.pyplot(fig)
+    # Fraud and Not Fraud Amount
+    st.header("Fraud and Not Fraud Transactions Amount")
+    data_fraud_amount = data.groupby('is_fraud')['amt'].sum().reset_index()
+    fig, ax = plt.subplots(figsize=(15, 5))
+    sns.barplot(data=data_fraud_amount, x='is_fraud', y='amt', color='#c6def8', ax=ax)
+    annotate_bar(ax, lambda y: 1900000 if y < 1200000 else y, font_size=12)
+    ax.set_title("Total transaction amount for fraud and not fraud transaction", fontsize=12, fontweight='bold')
+    ax.set_ylabel("Transaction amount")
+    ax.set_xticklabels(['Not Fraud', 'Fraud'], rotation=0)
+    st.pyplot(fig)
+    st.write("**Explanation**:")
+    markdown_text = """
+    Based on visualisation above:
+    * There is 276743 total number of transaction `not fraud` and 1117 `fraud` transaction
+    * There is 18745296.5 total transaction amount of `not fraud` and 1117 `fraud` transaction
+    """
+    st.markdown(markdown_text)
+    # Calculate age
+    data['dob'] = pd.to_datetime(data['dob'])
+    data['age'] = (2020 - data['dob'].dt.year)
+    def apply_age_group(age):
+        if age <= 18:
+            return 'Teenager'
+        elif age <= 25:
+            return "Young Adult"
+        elif age <= 64:
+            return "Adult"
+        else:
+            return "Elder"
+    data['age_group'] = data['age'].apply(apply_age_group)
+    # Overview of dataset by month, gender, and category
+    st.header("Overview of dataset by Age, gender, and category")
+    columns = ['gender', 'category', 'age', 'age_group']
+    columns_name = ['gender', 'category', 'age', 'age group']
+    name = ['Not Fraud', 'Fraud']
+    for col in columns:
+        st.subheader("Distribution of transaction by " + columns_name[columns.index(col)])
+        fig, ax = plt.subplots(1, 2, figsize=(15, 5))  # Create a subplot with 2 columns
+        for i in range(0, 2):
+            data_1 = data[data['is_fraud'] == i]
+            if col == 'gender':
+                ax[i].pie(data_1[col].value_counts(), labels=['Female', 'Male'], autopct='%1.1f%%')
+            elif col == 'age_group':
+                ax[i].pie(data_1[col].value_counts(), labels=data_1[col].value_counts().index, autopct='%1.1f%%')
+            elif col == 'category':
+                sns.countplot(data=data_1, y=col, order=data_1[col].value_counts().index, ax=ax[i])
+            else:
+                sns.histplot(data=data_1, x=col, ax=ax[i])
+            ax[i].set_title(name[i])
+            ax[i].set_xlabel(columns_name[columns.index(col)])
+            if col == 'category':
+                ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=90)
+        st.pyplot(fig)
+    st.write("**Explanation**:")
+    markdown_text = """
+    Based on visualisation above we can see:
+    - There is 54,8% transaction of `female` and 45,2% transaction of `male` in `not fraud` and `fraud`
+    - Most distribution of `not fraud` transaction by category is from `gas_transport` meanwhile in fraud is from shopping_net
+    - In distribution transaction by age mostly between 30-40 in `fraud` and between 45-50 for `not fraud`
+    - By age group mostly `not fraud` transaction is from Adult with 73,9% and `fraud` also from Adult with 74,6%
+    """
+    st.markdown(markdown_text)

fraud_test.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86a070405882d0414853dc0d2879451ded709d7327f6630ed4b39b5167ca815a
+size 143639688

prediction.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import pickle
+# Library Random Data
+from scipy.stats import randint
+from datetime import datetime, timedelta
+from sklearn.utils import shuffle
+def model_page():
+    st.title("Model Prediction of Credit Card Fault")
+    st.write("The model predicts whether the customer's transaction is fraud or not")
+    st.sidebar.header('User Input Features')
+    input_data = user_input()
+    st.subheader('User Input')
+    st.write(input_data)
+    # Load the model using a context manager to ensure the file is closed
+    with open("XGB_best_model.pkl", "rb") as f:
+        load_model = pickle.load(f)
+    prediction = load_model.predict(input_data)
+    if prediction == 1:
+        prediction = 'The Transaction is Fraud'
+    else:
+        prediction = 'The Transaction is Legit'
+    st.write('Based on user input, the model predicted: ')
+    st.write(prediction)
+def user_input(num_rows=1):
+    data = generate_data(num_rows)
+    return data
+def generate_data(num_rows=555719):
+    trans_date_trans_time = st.sidebar.date_input("Transaction Date", value=datetime.now(), min_value=datetime.now() - timedelta(days=365), max_value=datetime.now())
+    trans_date_trans_time = [trans_date_trans_time for _ in range(num_rows)]
+    cc_num = st.sidebar.number_input("Credit Card Number", value=500000, min_value=100000, max_value=999999)
+    cc_num = [cc_num for _ in range(num_rows)]
+    merchant = st.sidebar.selectbox("Merchant", ['Merchant1', 'Merchant2', 'Merchant3'])
+    merchant = [merchant for _ in range(num_rows)]
+    category = st.sidebar.selectbox("Category", ['Personal', 'Childcare', 'Food', 'Transportation'])
+    category = [category for _ in range(num_rows)]
+    amt = st.sidebar.number_input("Amount", value=500, min_value=0, max_value=1000)
+    amt = [amt for _ in range(num_rows)]
+    first = st.sidebar.text_input("First Name")
+    first = [first for _ in range(num_rows)]
+    last = st.sidebar.text_input("Last Name")
+    last = [last for _ in range(num_rows)]
+    gender = st.sidebar.selectbox("Gender", ['Male', 'Female'])
+    gender = [gender for _ in range(num_rows)]
+    street = st.sidebar.text_input("Street")
+    street = [street for _ in range(num_rows)]
+    city = st.sidebar.text_input("City")
+    city = [city for _ in range(num_rows)]
+    state = st.sidebar.selectbox("State", ['NY', 'CA', 'IL', 'TX'])
+    state = [state for _ in range(num_rows)]
+    zip_code = st.sidebar.text_input("Zip Code")
+    zip_code = [zip_code for _ in range(num_rows)]
+    lat = st.sidebar.number_input("Latitude", value=40.7128, min_value=-90., max_value=90.)
+    lat = [lat for _ in range(num_rows)]
+    long_ = st.sidebar.number_input("Longitude", value=-74.0060, min_value=-180., max_value=180.)
+    long_ = [long_ for _ in range(num_rows)]
+    city_pop = st.sidebar.number_input("City Population", value=10000, min_value=10000, max_value=1000000)
+    city_pop = [city_pop for _ in range(num_rows)]
+    job = st.sidebar.selectbox("Job", ['Software Engineer', 'Doctor', 'Lawyer', 'Teacher'])
+    job = [job for _ in range(num_rows)]
+    dob = st.sidebar.date_input("Date of Birth", value=datetime.now() - timedelta(days=365*70), min_value=datetime.now() - timedelta(days=365*100), max_value=datetime.now())
+    dob = [dob for _ in range(num_rows)]
+    trans_num = np.arange(1, num_rows + 1)
+    unix_time = st.sidebar.number_input("Unix Time", value=int(datetime.now().timestamp()), min_value=0, max_value=int(datetime.now().timestamp()))
+    unix_time = [unix_time for _ in range(num_rows)]
+    merch_lat = st.sidebar.number_input("Merchant Latitude", value=40.7128, min_value=-90., max_value=90.)
+    merch_lat = [merch_lat for _ in range(num_rows)]
+    merch_long = st.sidebar.number_input("Merchant Longitude", value=-74.0060, min_value=-180., max_value=180.)
+    merch_long = [merch_long for _ in range(num_rows)]
+    age = st.sidebar.number_input("Age", value=30, min_value=18, max_value=80)
+    age = [age for _ in range(num_rows)]
+    data = {
+        'Trans_date_trans_time': trans_date_trans_time,
+        'Cc_num': cc_num,
+        'Merchant': merchant,
+        'Category': category,
+        'Amt': amt,
+        'First': first,
+        'Last': last,
+        'Gender': gender,
+        'Street': street,
+        'City': city,
+        'State': state,
+        'Zip': zip_code,
+        'Lat': lat,
+        'Long': long_,
+        'City_pop': city_pop,
+        'Job': job,
+        'Dob': dob,
+        'Trans_num': trans_num,
+        'Unix_time': unix_time,
+        'Merch_lat': merch_lat,
+        'Merch_long': merch_long,
+        'age': age,
+        'category': category,
+        'amt': amt,
+        'state': state,
+        'job': job
+    }
+    # Create a Pandas DataFrame
+    df = pd.DataFrame(data)
+    return df
+# def main():
+#     st.title("Credit Card Transaction Data")
+#     st.write("This app generates random credit card transaction data.")
+#     num_rows = st.slider("Number of rows", 100, 100000, 555719)
+#     df = generate_data(num_rows)
+#     st.write(df)
+# if __name__ == "__main__":
+#     main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+scikit-learn
+pandas
+matplotlib
+# pickle is part of the Python standard library and cannot be installed with pip, so it must not be listed here
+# NOTE(review): XGB_best_model.pkl presumably needs the xgboost package to be unpickled - confirm and add it
+transformers
+seaborn