# Source: Hugging Face Space celineclarissa/GC5_Credit_Card_Data
# Space status at capture time: Sleeping
# File size: 10,649 Bytes

# import libraries
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# configure the page title passed to Streamlit for this app
st.set_page_config(page_title='GC5')

# translate the selector text into the numeric default flag
def default_id(i):
    """Return 1 when *i* equals 'Default', otherwise 0."""
    return 1 if i == 'Default' else 0

# name of the target column (1 = default, 0 = non-default, see default_id)
_TARGET = 'default_payment_next_month'

# columns holding the monthly repayment status codes
_PAY_COLUMNS = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']

# options offered by every default / non-default selector
_DEFAULT_CHOICES = ['Non-Default', 'Default']

# NOTE(review): the code -> label tables below assume the encoding of the UCI
# "default of credit card clients" dataset (sex: 1 = male, 2 = female;
# marriage: 1 = married, 2 = single, 3 = others) -- confirm against df_ori.csv.
_SEX_LABELS = {1: 'Male', 2: 'Female'}
_MARITAL_LABELS = {1: 'Married', 2: 'Single', 3: 'Others'}


def _age_group(age):
    """Return 'Youth' for 15-24, 'Adult' for 25-64 and 'Senior' otherwise.

    Both upper bounds are inclusive, so ages 24 and 64 are no longer pushed
    into 'Senior'. Anything outside 15-64 (including NaN) is 'Senior'.
    """
    if 15 <= age < 25:
        return 'Youth'
    if 25 <= age < 65:
        return 'Adult'
    return 'Senior'


def _code_labels(codes, names):
    """Return a display label per code, falling back to str(code) when unmapped."""
    return [names.get(code, str(code)) for code in codes]


def _show(fig):
    """Render *fig* in the page, then close it so reruns do not pile up figures."""
    st.pyplot(fig)
    plt.close(fig)


def _pie(counts, labels, title=None):
    """Draw *counts* as a pie chart with one label per wedge and render it."""
    fig, ax = plt.subplots(figsize=[15, 5])
    ax.pie(counts, autopct='%1.1f%%', labels=labels, shadow=True)
    if title is not None:
        ax.set_title(title)
    ax.axis('equal')
    _show(fig)


# make function run()
def run():
    """Render the credit card EDA page.

    Reads 'df_ori.csv', applies the cleaning steps below, shows the cleaned
    table and then draws seven exploratory charts, several of them driven by
    select boxes. Returns nothing; all output goes to the Streamlit page.
    """
    # page header: title, description, image and a horizontal rule
    st.title('Credit Card Data EDA')
    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')
    st.image('https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg', caption='Credit Cards (https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg)')
    st.markdown('---')

    # load dataset
    st.write('### Dataset')
    df = pd.read_csv('df_ori.csv')

    # data cleaning
    # drop unused column
    df = df.drop(columns='Unnamed: 0')
    # replace "6" with "5" in "education_level"
    df['education_level'] = df['education_level'].replace(6, 5)
    # replace "0.0" with "-1.0" in every "pay_[i]" column
    df[_PAY_COLUMNS] = df[_PAY_COLUMNS].replace(0.0, -1.0)

    # show cleaned dataset
    st.dataframe(df)

    # make border
    st.write('')
    st.markdown('---')
    st.write('')

    # EDA 1: share of default vs non-default
    st.write('### Default Payment Pie Chart')
    default_counts = df[_TARGET].value_counts(dropna=False)
    # value_counts() orders by frequency, so label each wedge from its own index
    _pie(default_counts, labels=list(default_counts.index))
    st.write('From the pie chart, it was found that ***data is imbalanced***. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')

    # make border
    st.markdown('---')

    # EDA 2: repayment status of defaulting customers for a chosen month
    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')
    # pay_[i] meaning
    dict1_eda2 = {
        'pay_0': 'Payment Status (Sept 2005)',
        'pay_2': 'Payment Status (Aug 2005)',
        'pay_3': 'Payment Status (Jul 2005)',
        'pay_4': 'Payment Status (Jun 2005)',
        'pay_5': 'Payment Status (May 2005)',
        'pay_6': 'Payment Status (Apr 2005)',
    }
    # unique values meaning (keys are the stringified status codes)
    dict2_eda2 = {
        '-2.0': 'No transaction',
        '-1.0': 'Paid duly',
        '1.0': 'Payment delay (1 mo)',
        '2.0': 'Payment delay (2 mo)',
        '3.0': 'Payment delay (3 mo)',
        '4.0': 'Payment delay (4 mo)',
        '5.0': 'Payment delay (5 mo)',
        '6.0': 'Payment delay (6 mo)',
        '7.0': 'Payment delay (7 mo)',
        '8.0': 'Payment delay (8 mo)',
        '9.0': 'Payment delay (9 mo)',
    }
    # user input
    choice_eda2 = st.selectbox('Pilih Feature:', _PAY_COLUMNS)
    # copy only the defaulting rows and the two columns the plot needs
    eda2 = df.loc[df[_TARGET] == 1, [choice_eda2, _TARGET]].copy()
    # stringify the codes so they can be swapped for their descriptions
    eda2[choice_eda2] = eda2[choice_eda2].astype(str).replace(dict2_eda2)
    plot_data = (
        eda2.groupby(choice_eda2)
        .count()
        .sort_values(_TARGET, ascending=False)
        .reset_index()
    )
    fig2, ax2 = plt.subplots(figsize=[15, 5])
    plot_data.plot(kind='bar', x=choice_eda2, y=_TARGET, xlabel='', ylabel='Count', legend=False, title=f'{dict1_eda2[choice_eda2]} Distribution', ax=ax2)
    _show(fig2)
    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default ***mostly have paid duly***. ')

    # make border
    st.markdown('---')

    # NOTE(review): the age, sex and marital-status insight paragraphs below were
    # written while the pie labels were assigned by position rather than by
    # value; re-check their wording against the corrected charts.

    # EDA 3: age group split for the chosen payment status
    st.write('### Age Group Distribution for Default and Non-Default Payment Status')
    # create new column
    df['age_group'] = df['age'].map(_age_group)
    # user input
    choice_eda3 = st.selectbox('Default:', _DEFAULT_CHOICES)
    agegroup_counts = df.loc[df[_TARGET] == default_id(choice_eda3), 'age_group'].value_counts(dropna=False)
    _pie(agegroup_counts, labels=list(agegroup_counts.index), title='Age Distribution Pie Chart')
    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that ***there are no significant difference in age range between customers whose payment status is default and non-default***.')

    # make border
    st.markdown('---')

    # EDA 4: limit balance spread for the chosen payment status
    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')
    # user input (the trailing spaces keep this widget label unique)
    choice_eda4 = st.selectbox('Default:  ', _DEFAULT_CHOICES)
    status_eda4 = default_id(choice_eda4)
    limit = df.loc[df[_TARGET] == status_eda4, 'limit_balance']
    dict_eda4 = {
        0: 'Limit Balance Distribution for Non-Default Payment Status',
        1: 'Limit Balance Distribution for Default Payment Status',
    }
    fig4, ax4 = plt.subplots(figsize=[15, 5])
    ax4.boxplot(limit)
    ax4.set_title(dict_eda4[status_eda4])
    # show box plot
    _show(fig4)
    st.write('From the box plots, it can be understood that ***customers with payment status of non-default have wider range of limit balance*** (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')

    # make border
    st.markdown('---')

    # EDA 5: sex split for the chosen payment status
    st.write('### Sex Distribution for Default and Non-Default Payment Status')
    # user input (the trailing space keeps this widget label unique)
    choice_eda5 = st.selectbox('Default: ', _DEFAULT_CHOICES)
    sex_counts = df.loc[df[_TARGET] == default_id(choice_eda5), 'sex'].value_counts(dropna=False)
    _pie(sex_counts, labels=_code_labels(sex_counts.index, _SEX_LABELS), title='Sex Distribution Pie Chart')
    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that ***there are no significant difference in sex distribution between customers whose payment status is default and non-default***.')

    # make border
    st.markdown('---')

    # EDA 6: mean bill statement per month for the chosen payment status
    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')
    # user input (the trailing spaces keep this widget label unique)
    choice_eda6 = st.selectbox('Default:         ', _DEFAULT_CHOICES)
    eda6 = df.loc[df[_TARGET] == default_id(choice_eda6)]
    # bill_amt_6 is paired with the oldest month and bill_amt_1 with the newest
    months = ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005']
    bill_columns = ['bill_amt_6', 'bill_amt_5', 'bill_amt_4', 'bill_amt_3', 'bill_amt_2', 'bill_amt_1']
    bill_means = [eda6[column].mean() for column in bill_columns]
    fig6, ax6 = plt.subplots(figsize=[15, 5])
    ax6.plot(months, bill_means)
    _show(fig6)
    # this paragraph describes the bill statement plot (it used to sit under EDA 7)
    st.write('From the line plots, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that ***there are no significant difference in average bill statement between customers whose payment status is default and non-default***. Other than that, it was found that ***as time goes by, the average bill statement tends to increase***.')

    # make border
    st.markdown('---')

    # EDA 7: marital status split for the chosen payment status
    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')
    # user input (the trailing spaces keep this widget label unique)
    choice_eda7 = st.selectbox('Default:    ', _DEFAULT_CHOICES)
    # rows with marital_status 0 are left out of this chart
    eda7 = df[df['marital_status'] != 0]
    marital_counts = eda7.loc[eda7[_TARGET] == default_id(choice_eda7), 'marital_status'].value_counts(dropna=False)
    _pie(marital_counts, labels=_code_labels(marital_counts.index, _MARITAL_LABELS), title='Marital Status Distribution Pie Chart')
    # this paragraph describes the marital status chart (it used to sit under EDA 6)
    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that ***there are no significant difference in marital status distribution between customers whose payment status is default and non-default***.')

# build the page only when this file is run as the main script
if __name__ == "__main__":
    run()