# celineclarissa's picture
# Upload eda.py
# 8adcb68 verified
# raw
# history blame contribute delete
# No virus
# 10.6 kB
# import libraries
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
# set the page title (passed to Streamlit's page configuration)
st.set_page_config(page_title='GC5')
# make function to convert text to correlating digit
def default_id(i):
    """Translate a payment-status label into its numeric code.

    Returns 1 when ``i`` equals the string 'Default' and 0 for every
    other value.
    """
    return 1 if i == 'Default' else 0
# make function run()
# Display names for the coded categorical columns.
# NOTE(review): the codes are assumed to follow the UCI "default of credit
# card clients" data dictionary (sex: 1 = male, 2 = female; marital status:
# 1 = married, 2 = single, 3 = others) -- confirm df_ori.csv uses the same
# coding. Unknown codes are shown as their raw value.
_SEX_LABELS = {1: 'Male', 2: 'Female'}
_MARITAL_LABELS = {1: 'Married', 2: 'Single', 3: 'Others'}

# Monthly payment-status columns (there is no 'pay_1' in this list).
_PAY_COLUMNS = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']

# Options offered by every "Default / Non-Default" selector.
_DEFAULT_OPTIONS = ['Non-Default', 'Default']


def _age_group(age):
    """Classify an age as 'Youth' (<= 24), 'Adult' (25-64) or 'Senior'.

    Values that compare false against both bounds (e.g. NaN) end up in
    'Senior', which is the fall-through branch.
    """
    if age <= 24:
        return 'Youth'
    if age <= 64:
        return 'Adult'
    return 'Senior'


def _category_labels(index, names):
    """Return one display label per value of ``index``.

    ``names`` maps raw values to labels; values missing from it fall back
    to their string form so the label count always matches the data.
    """
    return [names.get(value, str(value)) for value in index]


def _show_figure(fig):
    """Render ``fig`` in the page, then close it.

    Closing matters because Streamlit re-executes the script on every
    interaction; unclosed figures would keep accumulating in pyplot.
    """
    st.pyplot(fig)
    plt.close(fig)


def run():
    """Render the credit card EDA page.

    Reads 'df_ori.csv' from the working directory, applies a few cleaning
    steps, shows the cleaned table and then draws seven exploratory
    sections (default ratio, payment status, age group, limit balance,
    sex, average bill statement, marital status), each followed by a
    written insight.
    """
    # make title
    st.title('Credit Card Data EDA')
    # make description
    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')
    # insert image
    st.image('https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg', caption='Credit Cards (https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg)')
    # draw a horizontal rule
    st.markdown('---')

    # Show dataframe
    st.write('### Dataset')
    df = pd.read_csv('df_ori.csv')
    # data cleaning
    # drop unused columns
    df = df.drop(columns='Unnamed: 0')
    # replace "6" as "5" in "education_level" column
    df['education_level'] = df['education_level'].replace(6, 5)
    # replace "0.0" as "-1.0" in "pay_[i]" columns
    df[_PAY_COLUMNS] = df[_PAY_COLUMNS].replace(0.0, -1.0)
    # show cleaned dataset
    st.dataframe(df)
    # make border
    st.write('')
    st.markdown('---')
    st.write('')

    # EDA 1
    st.write('### Default Payment Pie Chart')
    default_counts = df['default_payment_next_month'].value_counts(dropna=False)
    fig1, ax1 = plt.subplots(figsize=[15, 5])
    # value_counts() orders by frequency, so the labels are taken from its
    # own index instead of assuming a fixed order.
    ax1.pie(default_counts, autopct='%1.1f%%', labels=[str(value) for value in default_counts.index], shadow=True)
    ax1.axis('equal')
    _show_figure(fig1)
    # show insight
    st.write('From the pie chart, it was found that ***data is imbalanced***. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')
    # make border
    st.markdown('---')

    # EDA 2
    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')
    # pay_[i] meaning
    dict1_eda2 = {
        'pay_0': 'Payment Status (Sept 2005)',
        'pay_2': 'Payment Status (Aug 2005)',
        'pay_3': 'Payment Status (Jul 2005)',
        'pay_4': 'Payment Status (Jun 2005)',
        'pay_5': 'Payment Status (May 2005)',
        'pay_6': 'Payment Status (Apr 2005)',
    }
    # unique values meaning (keys are the string form of the float codes)
    dict2_eda2 = {
        '-2.0': 'No transaction',
        '-1.0': 'Paid duly',
        '1.0': 'Payment delay (1 mo)',
        '2.0': 'Payment delay (2 mo)',
        '3.0': 'Payment delay (3 mo)',
        '4.0': 'Payment delay (4 mo)',
        '5.0': 'Payment delay (5 mo)',
        '6.0': 'Payment delay (6 mo)',
        '7.0': 'Payment delay (7 mo)',
        '8.0': 'Payment delay (8 mo)',
        '9.0': 'Payment delay (9 mo)',
    }
    # work on a copy so the relabelled column does not leak into df
    eda2 = df.copy()
    # user input
    choice_eda2 = st.selectbox('Pilih Feature:', _PAY_COLUMNS, key='choice_eda2')
    # change data type and replace value according to dictionary
    eda2[choice_eda2] = eda2[choice_eda2].astype(str).replace(dict2_eda2)
    # count the defaulting customers per payment status, most frequent first
    plot_data = (
        eda2.loc[eda2['default_payment_next_month'] == 1, [choice_eda2, 'default_payment_next_month']]
        .groupby(choice_eda2)
        .count()
        .sort_values('default_payment_next_month', ascending=False)
        .reset_index()
    )
    fig2, ax2 = plt.subplots(figsize=[15, 5])
    plot_data.plot(kind='bar', x=choice_eda2, y='default_payment_next_month', xlabel='', ylabel='Count', legend=False, title=f'{dict1_eda2[choice_eda2]} Distribution', ax=ax2)
    _show_figure(fig2)
    # show insight
    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default ***mostly have paid duly***. ')
    # make border
    st.markdown('---')

    # EDA 3
    st.write('### Age Group Distribution for Default and Non-Default Payment Status')
    # create new column with the age group of every customer
    df['age_group'] = [_age_group(age) for age in df['age']]
    # user input
    choice_eda3 = st.selectbox('Default:', _DEFAULT_OPTIONS, key='choice_eda3')
    agegroup_counts = df[df['default_payment_next_month'] == default_id(choice_eda3)]['age_group'].value_counts(dropna=False)
    fig3, ax3 = plt.subplots(figsize=[15, 5])
    # the index already holds the group names, in the same order as the counts
    ax3.pie(agegroup_counts, autopct='%1.1f%%', labels=list(agegroup_counts.index), shadow=True)
    ax3.set_title('Age Distribution Pie Chart')
    ax3.axis('equal')
    _show_figure(fig3)
    # show insight
    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that ***there are no significant difference in age range between customers whose payment status is default and non-default***.')
    # make border
    st.markdown('---')

    # EDA 4
    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')
    # user input; the explicit key keeps this selector distinct from the
    # other selectors that share the same label and options
    choice_eda4 = st.selectbox('Default: ', _DEFAULT_OPTIONS, key='choice_eda4')
    limit = df[df['default_payment_next_month'] == default_id(choice_eda4)]['limit_balance']
    dict_eda4 = {
        0: 'Limit Balance Distribution for Non-Default Payment Status',
        1: 'Limit Balance Distribution for Default Payment Status',
    }
    fig4, ax4 = plt.subplots(figsize=[15, 5])
    ax4.boxplot(limit)
    ax4.set_title(dict_eda4[default_id(choice_eda4)])
    _show_figure(fig4)
    # show insight
    st.write('From the box plots, it can be understood that ***customers with payment status of non-default have wider range of limit balance*** (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')
    # make border
    st.markdown('---')

    # EDA 5
    st.write('### Sex Distribution for Default and Non-Default Payment Status')
    # user input
    choice_eda5 = st.selectbox('Default: ', _DEFAULT_OPTIONS, key='choice_eda5')
    sex_counts = df[df['default_payment_next_month'] == default_id(choice_eda5)]['sex'].value_counts(dropna=False)
    fig5, ax5 = plt.subplots(figsize=[15, 5])
    # label each wedge by the code it actually represents
    ax5.pie(sex_counts, autopct='%1.1f%%', labels=_category_labels(sex_counts.index, _SEX_LABELS), shadow=True)
    ax5.set_title('Sex Distribution Pie Chart')
    ax5.axis('equal')
    _show_figure(fig5)
    # show insight
    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that ***there are no significant difference in sex distribution between customers whose payment status is default and non-default***.')
    # make border
    st.markdown('---')

    # EDA 6
    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')
    # user input
    choice_eda6 = st.selectbox('Default: ', _DEFAULT_OPTIONS, key='choice_eda6')
    eda6 = df.loc[df['default_payment_next_month'] == default_id(choice_eda6), ['bill_amt_1', 'bill_amt_2', 'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6', 'default_payment_next_month']]
    # bill_amt_6 is paired with the earliest month and bill_amt_1 with the
    # latest, so the columns are listed in chronological order
    bill_columns = ['bill_amt_6', 'bill_amt_5', 'bill_amt_4', 'bill_amt_3', 'bill_amt_2', 'bill_amt_1']
    plot_data6 = pd.DataFrame({
        'Bill Statement': ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005'],
        'Mean': [eda6[column].mean() for column in bill_columns],
    })
    fig6, ax6 = plt.subplots(figsize=[15, 5])
    ax6.plot(plot_data6['Bill Statement'], plot_data6['Mean'])
    _show_figure(fig6)
    # show insight (this text describes the bill statement plot above)
    st.write('From the line plots, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that ***there are no significant difference in average bill statement between customers whose payment status is default and non-default***. Other than that, it was found that ***as time goes by, the average bill statement tends to increase***.')
    # make border
    st.markdown('---')

    # EDA 7
    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')
    # user input
    choice_eda7 = st.selectbox('Default: ', _DEFAULT_OPTIONS, key='choice_eda7')
    # rows whose marital_status is 0 are left out of this chart
    eda7 = df[df.marital_status != 0]
    marital_counts = eda7[eda7['default_payment_next_month'] == default_id(choice_eda7)]['marital_status'].value_counts(dropna=False)
    fig7, ax7 = plt.subplots(figsize=[15, 5])
    marital_counts.plot.pie(ax=ax7, autopct='%1.1f%%', shadow=True, labels=_category_labels(marital_counts.index, _MARITAL_LABELS))
    ax7.set_title('Marital Status Distribution Pie Chart')
    ax7.axis('equal')
    _show_figure(fig7)
    # show insight (this text describes the marital status chart above)
    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that ***there are no significant difference in marital status distribution between customers whose payment status is default and non-default***.')
# execute file: call run() only when this module is executed directly,
# not when it is imported by another module
if __name__=='__main__':
    run()