File size: 10,649 Bytes
8adcb68
1c767f1
 
 
 
8adcb68
1c767f1
 
 
 
8adcb68
1c767f1
8adcb68
1c767f1
 
 
 
 
8adcb68
1c767f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00ea715
 
 
 
 
 
 
 
 
 
1c767f1
 
00ea715
1c767f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6275b71
1c767f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33f936f
1c767f1
8adcb68
1c767f1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# import libraries
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# set the Streamlit page title
st.set_page_config(page_title='GC5')

# make function to convert text to correlating digit
def default_id(i):
    """Map a payment-status label to its numeric flag.

    Returns 1 when ``i`` equals ``'Default'`` and 0 for any other value.
    """
    return 1 if i == 'Default' else 0

# helpers used by run(), followed by run() itself
def _age_group(age):
    """Return the age-group label for a single ``age`` value.

    Ages up to and including 24 are 'Youth', 25 through 64 are 'Adult' and
    everything else is 'Senior'. A NaN age fails both comparisons and
    therefore also ends up in 'Senior'.
    """
    if age <= 24:
        return 'Youth'
    if age <= 64:
        return 'Adult'
    return 'Senior'


def _pie_figure(counts, labels, title=None):
    """Build a pie-chart figure from ``counts`` and return it.

    ``labels`` must have one entry per value in ``counts``; ``title`` is
    optional and omitted from the chart when it is ``None``.
    """
    fig, ax = plt.subplots(figsize=[15, 5])
    ax.pie(counts, autopct='%1.1f%%', labels=labels, shadow=True)
    if title is not None:
        ax.set_title(title)
    ax.axis('equal')
    return fig


def _show_figure(fig):
    """Render ``fig`` on the Streamlit page and then close it.

    Figures created through pyplot stay registered until they are closed,
    so without the close every rerun of the page would leave its figures
    behind.
    """
    st.pyplot(fig)
    plt.close(fig)


def run():
    """Render the credit-card EDA page.

    Reads ``df_ori.csv``, applies the cleaning steps below, shows the
    cleaned table and then draws seven charts. Most charts are filtered by
    a select box that switches between 'Non-Default' and 'Default' rows.
    """
    target = 'default_payment_next_month'
    pay_cols = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
    status_options = ['Non-Default', 'Default']
    image_url = 'https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg'

    # page header: title, description, image
    st.title('Credit Card Data EDA')
    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')
    st.image(image_url, caption=f'Credit Cards ({image_url})')

    # horizontal rule
    st.markdown('---')

    # load the dataset
    st.write('### Dataset')
    df = pd.read_csv('df_ori.csv')

    # data cleaning
    # drop the index column written by a previous to_csv, if present
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    # fold education level 6 into 5
    df['education_level'] = df['education_level'].replace(6, 5)
    # fold payment status 0.0 into -1.0
    df[pay_cols] = df[pay_cols].replace(0.0, -1.0)

    # show cleaned dataset
    st.dataframe(df)

    # section border
    st.write('')
    st.markdown('---')
    st.write('')

    # EDA 1: share of default vs non-default rows
    st.write('### Default Payment Pie Chart')
    default_counts = df[target].value_counts(dropna=False)
    # value_counts orders by descending count, so the labels are taken from
    # its index to stay attached to the right wedge
    _show_figure(_pie_figure(default_counts, list(default_counts.index)))
    st.write('From the pie chart, it was found that ***data is imbalanced***. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')

    st.markdown('---')

    # EDA 2: payment-status distribution among default rows
    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')
    # chart title prefix for each pay_[i] column
    dict1_eda2 = {
        'pay_0': 'Payment Status (Sept 2005)',
        'pay_2': 'Payment Status (Aug 2005)',
        'pay_3': 'Payment Status (Jul 2005)',
        'pay_4': 'Payment Status (Jun 2005)',
        'pay_5': 'Payment Status (May 2005)',
        'pay_6': 'Payment Status (Apr 2005)',
    }
    # display text for each status code; keys are the string form of the
    # codes because the column is cast with astype(str) before replacing
    dict2_eda2 = {
        '-2.0': 'No transaction',
        '-1.0': 'Paid duly',
        '1.0': 'Payment delay (1 mo)',
        '2.0': 'Payment delay (2 mo)',
        '3.0': 'Payment delay (3 mo)',
        '4.0': 'Payment delay (4 mo)',
        '5.0': 'Payment delay (5 mo)',
        '6.0': 'Payment delay (6 mo)',
        '7.0': 'Payment delay (7 mo)',
        '8.0': 'Payment delay (8 mo)',
        '9.0': 'Payment delay (9 mo)',
    }
    # user input: which pay_[i] column to plot
    choice_eda2 = st.selectbox('Pilih Feature:', pay_cols)
    # work on a copy of the default rows so df itself keeps numeric codes
    defaulted = df.loc[df[target] == 1, [choice_eda2, target]].copy()
    defaulted[choice_eda2] = defaulted[choice_eda2].astype(str).replace(dict2_eda2)
    plot_data = (
        defaulted.groupby(choice_eda2)
        .count()
        .sort_values(target, ascending=False)
        .reset_index()
    )
    fig2, ax2 = plt.subplots(figsize=[15, 5])
    plot_data.plot(
        kind='bar',
        x=choice_eda2,
        y=target,
        xlabel='',
        ylabel='Count',
        legend=False,
        title=f'{dict1_eda2[choice_eda2]} Distribution',
        ax=ax2,
    )
    _show_figure(fig2)
    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default ***mostly have paid duly***. ')

    st.markdown('---')

    # EDA 3: age-group distribution
    st.write('### Age Group Distribution for Default and Non-Default Payment Status')
    df['age_group'] = [_age_group(age) for age in df['age']]
    choice_eda3 = st.selectbox('Default:', status_options)
    agegroup_counts = df.loc[df[target] == default_id(choice_eda3), 'age_group'].value_counts(dropna=False)
    # labels come from the counted values themselves, not a fixed order
    _show_figure(_pie_figure(agegroup_counts, list(agegroup_counts.index), 'Age Distribution Pie Chart'))
    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that ***there are no significant difference in age range between customers whose payment status is default and non-default***.')

    st.markdown('---')

    # EDA 4: limit-balance box plot
    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')
    choice_eda4 = st.selectbox('Default:  ', status_options)
    flag_eda4 = default_id(choice_eda4)
    limit = df.loc[df[target] == flag_eda4, 'limit_balance']
    # chart title for each value of the default flag
    dict_eda4 = {
        0: 'Limit Balance Distribution for Non-Default Payment Status',
        1: 'Limit Balance Distribution for Default Payment Status',
    }
    fig4, ax4 = plt.subplots(figsize=[15, 5])
    ax4.boxplot(limit)
    ax4.set_title(dict_eda4[flag_eda4])
    # show box plot
    _show_figure(fig4)
    st.write('From the box plots, it can be understood that ***customers with payment status of non-default have wider range of limit balance*** (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')

    st.markdown('---')

    # EDA 5: sex distribution
    st.write('### Sex Distribution for Default and Non-Default Payment Status')
    choice_eda5 = st.selectbox('Default: ', status_options)
    sex_counts = df.loc[df[target] == default_id(choice_eda5), 'sex'].value_counts(dropna=False)
    # NOTE(review): these labels are assigned positionally, so 'Male' goes
    # to whichever code is most frequent in the selected rows - confirm
    # this matches the dataset's sex coding.
    _show_figure(_pie_figure(sex_counts, ['Male', 'Female'], 'Sex Distribution Pie Chart'))
    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that ***there are no significant difference in sex distribution between customers whose payment status is default and non-default***.')

    st.markdown('---')

    # EDA 6: average bill statement per month
    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')
    choice_eda6 = st.selectbox('Default:         ', status_options)
    # bill columns listed to line up with the month labels below
    bill_cols = ['bill_amt_6', 'bill_amt_5', 'bill_amt_4', 'bill_amt_3', 'bill_amt_2', 'bill_amt_1']
    months = ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005']
    eda6 = df.loc[df[target] == default_id(choice_eda6), bill_cols]
    plot_data6 = pd.DataFrame({
        'Bill Statement': months,
        'Mean': [eda6[col].mean() for col in bill_cols],
    })
    fig6, ax6 = plt.subplots(figsize=[15, 5])
    ax6.plot(plot_data6['Bill Statement'], plot_data6['Mean'])
    _show_figure(fig6)
    # insight for the bill-statement line chart
    st.write('From the line charts, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that ***there are no significant difference in average bill statement between customers whose payment status is default and non-default***. Other than that, it was found that ***as time goes by, the average bill statement tends to increase***.')

    st.markdown('---')

    # EDA 7: marital-status distribution
    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')
    choice_eda7 = st.selectbox('Default:    ', status_options)
    # rows with marital_status 0 are left out of this chart
    eda7 = df[df['marital_status'] != 0]
    marital_counts = eda7.loc[eda7[target] == default_id(choice_eda7), 'marital_status'].value_counts(dropna=False)
    # NOTE(review): these labels are assigned positionally in descending
    # count order and assume exactly three categories are present -
    # confirm this matches the dataset's marital_status coding.
    _show_figure(_pie_figure(marital_counts, ['Married', 'Single', 'Others'], 'Marital Status Distribution Pie Chart'))
    # insight for the marital-status pie chart
    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that ***there are no significant difference in marital status distribution between customers whose payment status is default and non-default***.')

# render the page when this file is run as a script
if __name__ == '__main__':
    run()