# Spaces status: Sleeping (page-header residue captured with the source; not part of the program)
# import libraries
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
# set page title (shown in the browser tab)
st.set_page_config(
    page_title='GC5'
)
# make function to convert text to correlating digit
def default_id(i: str) -> int:
    """Convert a payment-status label into its numeric code.

    Args:
        i: Status label, e.g. the value picked in a 'Default'/'Non-Default'
            select box.

    Returns:
        1 when ``i`` is exactly ``'Default'``; 0 for any other value.
    """
    return 1 if i == 'Default' else 0
# column compared against default_id(): 1 = default, 0 = non-default
_TARGET = 'default_payment_next_month'
# monthly payment-status columns, in the order offered to the user
_PAY_COLUMNS = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']


# make helper to load and clean the dataset
def _load_dataset(path='df_ori.csv'):
    """Read the CSV at ``path`` and apply the page's data-cleaning steps."""
    df = pd.read_csv(path)
    # drop unused columns (tolerate a CSV that was saved without the index column)
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    # replace "6" as "5" in "education_level" column
    df['education_level'] = df['education_level'].replace(6, 5)
    # replace "0.0" as "-1.0" in "pay_[i]" columns
    df[_PAY_COLUMNS] = df[_PAY_COLUMNS].replace(0.0, -1.0)
    return df


# make helper to classify age
def _age_group(age):
    """Return 'Youth' (24 and under), 'Adult' (25-64) or 'Senior' (65 and over).

    Both upper bounds are inclusive, so ages 24 and 64 no longer fall through
    to 'Senior'. Ages under 15 are grouped with 'Youth'. A missing age (NaN)
    fails both comparisons and ends up in 'Senior'.
    """
    if age <= 24:
        return 'Youth'
    if age <= 64:
        return 'Adult'
    return 'Senior'


# make helper to label value_counts() results
def _named_labels(counts, names):
    """Return one label per entry of ``counts``, looked up by its index value.

    ``value_counts()`` orders entries by frequency, so labels must follow the
    index instead of a fixed position. Values missing from ``names`` are
    shown as their plain string form.
    """
    return [names.get(value, str(value)) for value in counts.index]


# make helper to render a figure
def _show_figure(fig):
    """Send ``fig`` to the page, then close it so reruns do not pile up figures."""
    st.pyplot(fig)
    plt.close(fig)


# make helper to draw a pie chart
def _show_pie(counts, labels, title=None):
    """Draw ``counts`` as a percentage pie chart with the given slice labels."""
    fig, ax = plt.subplots(figsize=[15, 5])
    ax.pie(counts, autopct='%1.1f%%', labels=labels, shadow=True)
    if title is not None:
        ax.set_title(title)
    ax.axis('equal')
    _show_figure(fig)


# make function run()
def run():
    """Render the credit card EDA page.

    Loads and cleans ``df_ori.csv``, shows the dataset, then draws seven
    sections (EDA 1-7), each with a chart and a written insight. Every
    select box has its own ``key`` because several share the same label and
    options, which Streamlit otherwise rejects as duplicate widgets.
    """
    # make title
    st.title('Credit Card Data EDA')
    # make description
    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')
    # insert image
    st.image('https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg', caption='Credit Cards (https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg)')
    # draw a horizontal rule
    st.markdown('---')

    # show cleaned dataset
    st.write('### Dataset')
    df = _load_dataset()
    st.dataframe(df)
    # make border
    st.write('')
    st.markdown('---')
    st.write('')

    # EDA 1
    st.write('### Default Payment Pie Chart')
    default_counts = df[_TARGET].value_counts(dropna=False)
    # label each slice with its own class value
    _show_pie(default_counts, _named_labels(default_counts, {0: '0', 1: '1'}))
    # show insight
    st.write('From the pie chart, it was found that ***data is imbalanced***. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')
    # make border
    st.markdown('---')

    # EDA 2
    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')
    # pay_[i] meaning
    dict1_eda2 = {
        'pay_0': 'Payment Status (Sept 2005)',
        'pay_2': 'Payment Status (Aug 2005)',
        'pay_3': 'Payment Status (Jul 2005)',
        'pay_4': 'Payment Status (Jun 2005)',
        'pay_5': 'Payment Status (May 2005)',
        'pay_6': 'Payment Status (Apr 2005)',
    }
    # unique values meaning
    dict2_eda2 = {
        '-2.0': 'No transaction',
        '-1.0': 'Paid duly',
        '1.0': 'Payment delay (1 mo)',
        '2.0': 'Payment delay (2 mo)',
        '3.0': 'Payment delay (3 mo)',
        '4.0': 'Payment delay (4 mo)',
        '5.0': 'Payment delay (5 mo)',
        '6.0': 'Payment delay (6 mo)',
        '7.0': 'Payment delay (7 mo)',
        '8.0': 'Payment delay (8 mo)',
        '9.0': 'Payment delay (9 mo)',
    }
    # define choice_eda2 -> user input
    choice_eda2 = st.selectbox('Pilih Feature:', _PAY_COLUMNS, key='choice_eda2')
    # keep defaulted customers only; copy so the shared dataframe is not modified
    eda2 = df.loc[df[_TARGET] == 1, [choice_eda2, _TARGET]].copy()
    # change data type and replace value according to dictionary
    eda2[choice_eda2] = eda2[choice_eda2].astype(str).replace(dict2_eda2)
    plot_data = (
        eda2.groupby(choice_eda2)
        .count()
        .sort_values(_TARGET, ascending=False)
        .reset_index()
    )
    # create plot
    fig2, ax2 = plt.subplots(figsize=[15, 5])
    plot_data.plot(kind='bar', x=choice_eda2, y=_TARGET, xlabel='', ylabel='Count', legend=False, title=f'{dict1_eda2[choice_eda2]} Distribution', ax=ax2)
    _show_figure(fig2)
    # show insight
    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default ***mostly have paid duly***. ')
    # make border
    st.markdown('---')

    # EDA 3
    st.write('### Age Group Distribution for Default and Non-Default Payment Status')
    # create new column
    df['age_group'] = df['age'].map(_age_group)
    # define choice_eda3 -> user input
    choice_eda3 = st.selectbox('Default:', ['Non-Default', 'Default'], key='choice_eda3')
    agegroup_counts = df.loc[df[_TARGET] == default_id(choice_eda3), 'age_group'].value_counts(dropna=False)
    # the index already holds the group names, so use it for the labels
    _show_pie(agegroup_counts, _named_labels(agegroup_counts, {}), title='Age Distribution Pie Chart')
    # show insight
    # NOTE(review): written when slice labels were positional; re-check against the corrected chart
    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that ***there are no significant difference in age range between customers whose payment status is default and non-default***.')
    # make border
    st.markdown('---')

    # EDA 4
    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')
    # define choice_eda4 -> user input
    choice_eda4 = st.selectbox('Default: ', ['Non-Default', 'Default'], key='choice_eda4')
    status_eda4 = default_id(choice_eda4)
    # drop missing values and pass a plain array so the box plot does not depend on the filtered index
    limit = df.loc[df[_TARGET] == status_eda4, 'limit_balance'].dropna().to_numpy()
    # create dictionary
    dict_eda4 = {
        0: 'Limit Balance Distribution for Non-Default Payment Status',
        1: 'Limit Balance Distribution for Default Payment Status',
    }
    # create box plot
    fig4, ax4 = plt.subplots(figsize=[15, 5])
    ax4.boxplot(limit)
    ax4.set_title(dict_eda4[status_eda4])
    _show_figure(fig4)
    # show insight
    st.write('From the box plots, it can be understood that ***customers with payment status of non-default have wider range of limit balance*** (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')
    # make border
    st.markdown('---')

    # EDA 5
    st.write('### Sex Distribution for Default and Non-Default Payment Status')
    # define choice_eda5 -> user input
    choice_eda5 = st.selectbox('Default: ', ['Non-Default', 'Default'], key='choice_eda5')
    sex_counts = df.loc[df[_TARGET] == default_id(choice_eda5), 'sex'].value_counts(dropna=False)
    # assumes the encoding 1 = male, 2 = female (order of the original label list) - TODO confirm against df_ori.csv
    _show_pie(sex_counts, _named_labels(sex_counts, {1: 'Male', 2: 'Female'}), title='Sex Distribution Pie Chart')
    # show insight
    # NOTE(review): written when slice labels were positional; re-check against the corrected chart
    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that ***there are no significant difference in sex distribution between customers whose payment status is default and non-default***.')
    # make border
    st.markdown('---')

    # EDA 6
    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')
    # define choice_eda6 -> user input
    choice_eda6 = st.selectbox('Default: ', ['Non-Default', 'Default'], key='choice_eda6')
    # bill_amt_6 is the oldest statement (Apr 2005), bill_amt_1 the newest (Sept 2005)
    bill_columns = ['bill_amt_6', 'bill_amt_5', 'bill_amt_4', 'bill_amt_3', 'bill_amt_2', 'bill_amt_1']
    bill_months = ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005']
    bill_means = df.loc[df[_TARGET] == default_id(choice_eda6), bill_columns].mean()
    # create plot
    fig6, ax6 = plt.subplots(figsize=[15, 5])
    ax6.plot(bill_months, bill_means.to_numpy())
    _show_figure(fig6)
    # show insight (this text used to sit under the marital status chart)
    st.write('From the line charts, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that ***there are no significant difference in average bill statement between customers whose payment status is default and non-default***. Other than that, it was found that ***as time goes by, the average bill statement tends to increase***.')
    # make border
    st.markdown('---')

    # EDA 7
    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')
    # define choice_eda7 -> user input
    choice_eda7 = st.selectbox('Default: ', ['Non-Default', 'Default'], key='choice_eda7')
    # leave out rows whose marital status is 0
    eda7 = df[df['marital_status'] != 0]
    marital_counts = eda7.loc[eda7[_TARGET] == default_id(choice_eda7), 'marital_status'].value_counts(dropna=False)
    # assumes the encoding 1 = married, 2 = single, 3 = others (order of the original label list) - TODO confirm
    _show_pie(marital_counts, _named_labels(marital_counts, {1: 'Married', 2: 'Single', 3: 'Others'}), title='Marital Status Distribution Pie Chart')
    # show insight (this text used to sit under the bill statement chart)
    # NOTE(review): written when slice labels were positional; re-check against the corrected chart
    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that ***there are no significant difference in marital status distribution between customers whose payment status is default and non-default***.')
# execute file: build the page only when run as a script, not on import
if __name__ == '__main__':
    run()