"""Streamlit page showing exploratory data analysis of the credit card default dataset."""

# import libraries
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# set page title
st.set_page_config(
    page_title='GC5'
)

# columns holding the monthly repayment status
PAY_COLUMNS = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']

# options offered by every default / non-default selector
DEFAULT_CHOICES = ['Non-Default', 'Default']

# NOTE(review): the code -> label mappings below assume the usual encoding of
# this dataset (sex: 1=male, 2=female; marital status: 1=married, 2=single,
# 3=others) -- confirm against the data dictionary of df_ori.csv.
SEX_LABELS = {1: 'Male', 2: 'Female'}
MARITAL_LABELS = {1: 'Married', 2: 'Single', 3: 'Others'}


def default_id(i):
    """Convert a selector choice to the target value: 1 for 'Default', else 0."""
    return 1 if i == 'Default' else 0


def _age_group(age):
    """Classify an age as 'Youth' (under 25), 'Adult' (25-64) or 'Senior'.

    Comparisons are used instead of ``range`` membership so that the bucket
    edges (24 and 64) and non-integer ages land in the intended group.
    Missing ages fall through to 'Senior', as every comparison is False.
    """
    if age < 25:
        return 'Youth'
    if age < 65:
        return 'Adult'
    return 'Senior'


def _labels_for(counts, names):
    """Return one pie label per entry of *counts*, looked up by index value.

    ``value_counts`` orders its result by frequency, so labels must follow
    the index rather than a fixed position. Unknown codes are shown as-is.
    """
    return [names.get(value, str(value)) for value in counts.index]


def _show(fig):
    """Render *fig* in the page, then close it.

    Streamlit re-executes the script on every interaction; without closing,
    each rerun would leave another set of figures registered in pyplot.
    """
    st.pyplot(fig)
    plt.close(fig)


def _pie(counts, labels, title=None):
    """Draw and display a pie chart of *counts* with the given *labels*."""
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.pie(counts, autopct='%1.1f%%', labels=labels, shadow=True)
    if title is not None:
        ax.set_title(title)
    ax.axis('equal')
    _show(fig)


def run():
    """Build the EDA page: load and clean the dataset, then render each chart."""
    # make title
    st.title('Credit Card Data EDA')

    # make description
    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')

    # insert image
    st.image('https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg',
             caption='Credit Cards (https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg)')

    # draw a horizontal rule
    st.markdown('---')

    # Show dataframe
    st.write('### Dataset')
    df = pd.read_csv('df_ori.csv')

    # data cleaning
    # drop unused columns
    df = df.drop(columns='Unnamed: 0')

    # replace "6" as "5" in "education_level" column
    df['education_level'] = df['education_level'].replace(6, 5)

    # replace "0.0" as "-1.0" in "pay_[i]" column
    df[PAY_COLUMNS] = df[PAY_COLUMNS].replace(0.0, -1.0)

    # show cleaned dataset
    st.dataframe(df)

    # make border
    st.write('')
    st.markdown('---')
    st.write('')

    # NOTE(review): the insight paragraphs on this page were written against
    # charts whose labels were assigned by position; now that labels follow
    # the actual categories, re-check the wording against the rendered charts.

    # EDA 1
    st.write('### Default Payment Pie Chart')

    # count value
    default_counts = df['default_payment_next_month'].value_counts(dropna=False)

    # create and show pie chart, labelled by the actual target values
    _pie(default_counts, list(default_counts.index))

    # show insight
    st.write('From the pie chart, it was found that ***data is imbalanced***. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')

    # make border
    st.markdown('---')

    # EDA 2
    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')

    # pay_[i] meaning
    dict1_eda2 = {
        'pay_0': 'Payment Status (Sept 2005)',
        'pay_2': 'Payment Status (Aug 2005)',
        'pay_3': 'Payment Status (Jul 2005)',
        'pay_4': 'Payment Status (Jun 2005)',
        'pay_5': 'Payment Status (May 2005)',
        'pay_6': 'Payment Status (Apr 2005)',
    }

    # unique values meaning
    dict2_eda2 = {
        '-2.0': 'No transaction',
        '-1.0': 'Paid duly',
        '1.0': 'Payment delay (1 mo)',
        '2.0': 'Payment delay (2 mo)',
        '3.0': 'Payment delay (3 mo)',
        '4.0': 'Payment delay (4 mo)',
        '5.0': 'Payment delay (5 mo)',
        '6.0': 'Payment delay (6 mo)',
        '7.0': 'Payment delay (7 mo)',
        '8.0': 'Payment delay (8 mo)',
        '9.0': 'Payment delay (9 mo)',
    }

    # work on a copy so the status codes in df stay numeric
    eda2 = df.copy()

    # define choice_eda2 -> user input
    choice_eda2 = st.selectbox('Pilih Feature:', PAY_COLUMNS)

    # change data type and replace value according to dictionary
    eda2[choice_eda2] = eda2[choice_eda2].astype(str).replace(dict2_eda2)

    # count defaulting customers per payment status, most frequent first
    plot_data = (
        eda2.loc[eda2['default_payment_next_month'] == 1,
                 [choice_eda2, 'default_payment_next_month']]
        .groupby(choice_eda2)
        .count()
        .sort_values('default_payment_next_month', ascending=False)
        .reset_index()
    )

    # create and show plot
    fig2, ax2 = plt.subplots(figsize=(15, 5))
    plot_data.plot(kind='bar', x=choice_eda2, y='default_payment_next_month',
                   xlabel='', ylabel='Count', legend=False,
                   title=f'{dict1_eda2[choice_eda2]} Distribution', ax=ax2)
    _show(fig2)

    # show insight
    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default ***mostly have paid duly***. ')

    # make border
    st.markdown('---')

    # EDA 3
    st.write('### Age Group Distribution for Default and Non-Default Payment Status')

    # create new column
    df['age_group'] = df['age'].map(_age_group)

    # define choice_eda3 -> user input
    choice_eda3 = st.selectbox('Default:', DEFAULT_CHOICES, key='eda3_default')

    # count age groups for the selected payment status
    agegroup_counts = df[df['default_payment_next_month'] == default_id(choice_eda3)]['age_group'].value_counts(dropna=False)

    # create and show pie chart, labelled by the groups actually present
    _pie(agegroup_counts, list(agegroup_counts.index), title='Age Distribution Pie Chart')

    # show insight
    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that ***there are no significant difference in age range between customers whose payment status is default and non-default***.')

    # make border
    st.markdown('---')

    # EDA 4
    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')

    # define choice_eda4 -> user input
    # (explicit keys: selectors with identical label and options would
    # otherwise collide on their auto-generated widget id)
    choice_eda4 = st.selectbox('Default: ', DEFAULT_CHOICES, key='eda4_default')

    # limit balance of the selected payment status
    limit = df[df['default_payment_next_month'] == default_id(choice_eda4)]['limit_balance']

    # chart title per payment status
    dict_eda4 = {0: 'Limit Balance Distribution for Non-Default Payment Status',
                 1: 'Limit Balance Distribution for Default Payment Status'}

    # create and show box plot
    fig4, ax4 = plt.subplots(figsize=(15, 5))
    ax4.boxplot(limit)
    ax4.set_title(dict_eda4[default_id(choice_eda4)])
    _show(fig4)

    # show insight
    st.write('From the box plots, it can be understood that ***customers with payment status of non-default have wider range of limit balance*** (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')

    # make border
    st.markdown('---')

    # EDA 5
    st.write('### Sex Distribution for Default and Non-Default Payment Status')

    # define choice_eda5 -> user input
    choice_eda5 = st.selectbox('Default: ', DEFAULT_CHOICES, key='eda5_default')

    # make query
    sex_counts = df[df['default_payment_next_month'] == default_id(choice_eda5)]['sex'].value_counts(dropna=False)

    # create and show pie chart
    _pie(sex_counts, _labels_for(sex_counts, SEX_LABELS), title='Sex Distribution Pie Chart')

    # show insight
    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that ***there are no significant difference in sex distribution between customers whose payment status is default and non-default***.')

    # make border
    st.markdown('---')

    # EDA 6
    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')

    # define choice_eda6 -> user input
    choice_eda6 = st.selectbox('Default: ', DEFAULT_CHOICES, key='eda6_default')

    # make query
    eda6 = df[df['default_payment_next_month'] == default_id(choice_eda6)]

    # bill_amt_6 is the oldest statement (Apr) and bill_amt_1 the newest (Sept),
    # so walk the columns backwards to plot in chronological order
    plot_data6 = pd.DataFrame({
        'Bill Statement': ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005'],
        'Mean': [eda6[f'bill_amt_{month}'].mean() for month in range(6, 0, -1)],
    })

    # create and show plot
    fig6, ax6 = plt.subplots(figsize=(15, 5))
    ax6.plot(plot_data6['Bill Statement'], plot_data6['Mean'])
    _show(fig6)

    # show insight (belongs to the bill statement chart above)
    st.write('From the line charts, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that ***there are no significant difference in average bill statement between customers whose payment status is default and non-default***. Other than that, it was found that ***as time goes by, the average bill statement tends to increase***.')

    # make border
    st.markdown('---')

    # EDA 7
    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')

    # define choice_eda7 -> user input
    choice_eda7 = st.selectbox('Default: ', DEFAULT_CHOICES, key='eda7_default')

    # leave out rows whose marital status is coded 0
    eda7 = df[df.marital_status != 0]

    # make query
    marital_counts = eda7[eda7['default_payment_next_month'] == default_id(choice_eda7)]['marital_status'].value_counts(dropna=False)

    # create and show pie chart
    fig7, ax7 = plt.subplots(figsize=(15, 5))
    marital_counts.plot.pie(autopct='%1.1f%%', shadow=True,
                            labels=_labels_for(marital_counts, MARITAL_LABELS), ax=ax7)
    ax7.set_title('Marital Status Distribution Pie Chart')
    ax7.axis('equal')
    _show(fig7)

    # show insight (belongs to the marital status chart above)
    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that ***there are no significant difference in marital status distribution between customers whose payment status is default and non-default***.')


# execute file
if __name__ == '__main__':
    run()