# import libraries
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Column that every chart aggregates. The charts sum and average it, so it is
# presumably a 0/1 "defaulted" flag -- TODO confirm against eda_data.csv.
TARGET_COL = 'default_payment_next_month'

# Display names for the education codes used as tick labels in vis_1.
EDUCATION_LABELS = {
    1: 'graduate_school',
    2: 'university',
    3: 'highschool',
    4: 'others',
}

# Lower bound for the x axis of vis_3; the axis grows past it when needed.
MIN_LIMIT_GROUP_XLIM = 550


def app() -> None:
    """Render the Exploratory Data Analysis page.

    Reads ``eda_data.csv`` (path relative to the working directory), shows
    the raw table, then renders one chart per analysis question.
    """
    st.title('Exploratory Data Analysis')

    df = pd.read_csv('eda_data.csv')

    st.subheader('Dataset Preview')
    st.write(df)

    st.subheader('Data Analysis Questions')

    st.write('How is the percentage of default payment as education level increases?')
    vis_1(df)

    st.write('How is the contribution of each gender to default payment?')
    vis_2(df)

    st.write('Which one got more into default payment, customers with limit balance above or below average?')
    vis_3(df)

    st.write('How does the average of default payment changes as the total late payment rises?')
    vis_4(df)

    st.write('How is the contribution of each marital status to default payment?')
    vis_5(df)


def _show(fig) -> None:
    """Render *fig* on the Streamlit page, then close it.

    Each chart owns its figure instead of sharing pyplot's implicit current
    figure; closing it afterwards keeps figures from piling up across reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


def _positional_labels(groups, names):
    """Return *names* when there is exactly one per group, else the raw keys.

    *names* is matched to *groups* purely by position, i.e. it assumes the
    sorted group keys line up with the order of *names* -- TODO confirm the
    coding of the underlying column. When the counts differ, the keys
    themselves are used so the pie chart does not fail on a length mismatch.
    """
    if len(groups) == len(names):
        return list(names)
    return [str(group) for group in groups]


def _pie_of_defaults(totals: pd.Series, title: str, names) -> None:
    """Draw a pie chart of per-group default totals.

    Args:
        totals: Sum of ``TARGET_COL`` per group, indexed by group key.
        title: Chart title.
        names: Wedge labels, applied positionally (see ``_positional_labels``).
    """
    # A pie needs a positive total; say so instead of plotting nothing useful.
    if totals.empty or totals.sum() <= 0:
        st.info(f'{title}: no default payments to plot.')
        return

    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pie(
        totals.to_numpy(),
        labels=_positional_labels(totals.index, names),
        autopct='%1.1f%%',
    )
    _show(fig)


def vis_1(df: pd.DataFrame) -> None:
    """Bar chart: percentage of rows in default for each education level."""
    # mean * 100 equals sum / count * 100, rounded to two decimals.
    rates = df.groupby('education_level')[TARGET_COL].mean().mul(100).round(2)

    # Bars are placed by position so every level present gets its own tick.
    positions = list(range(len(rates)))
    tick_labels = [EDUCATION_LABELS.get(level, str(level)) for level in rates.index]

    fig, ax = plt.subplots()
    ax.set_title('Default Payment Percentage for each Education Level')
    bars = ax.bar(positions, rates.to_numpy())

    ax.set_ylim([0, 100])
    ax.set_xlabel('education level')
    ax.set_ylabel('default payment percentage')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)

    # Annotate each bar with its percentage.
    ax.bar_label(bars, labels=[f'{rate}%' for rate in rates], padding=3)

    _show(fig)


def vis_2(df: pd.DataFrame) -> None:
    """Pie chart: share of total defaults contributed by each ``sex`` group."""
    totals = df.groupby('sex')[TARGET_COL].sum()
    _pie_of_defaults(
        totals,
        'Gender Contribution to Default Payment',
        ['male', 'female'],
    )


def vis_3(df: pd.DataFrame) -> None:
    """Horizontal bars: default totals for below- vs above-average limits."""
    limits = df['limit_balance']

    # Strictly below the mean is 'below_average'; everything else (including
    # values equal to the mean) is 'above_average'.
    limit_group = np.where(limits < limits.mean(), 'below_average', 'above_average')
    totals = df[TARGET_COL].groupby(limit_group).sum()

    fig, ax = plt.subplots()
    ax.set_title('Default Payment Amount Categorized by Limit Group')
    bars = ax.barh(
        list(totals.index),
        totals.to_numpy(),
        label='default_payment_next_month',
    )

    ax.set_xlabel('default payment amount')
    ax.set_ylabel('limit balance group')

    # Keep the original 550 as a minimum, but widen the axis when a bar would
    # otherwise be clipped (leaving ~10% headroom for the value label).
    peak = totals.max() if not totals.empty else 0
    ax.set_xlim([0, max(MIN_LIMIT_GROUP_XLIM, 1.1 * peak)])

    ax.bar_label(bars)

    _show(fig)


def vis_4(df: pd.DataFrame) -> None:
    """Line chart: mean of ``TARGET_COL`` per total of ``pay_1`` .. ``pay_6``."""
    pay_cols = [f'pay_{i}' for i in range(1, 7)]

    # Row-wise total; missing values count as 0 unless every column is
    # missing, in which case the row total is NaN and drops out of the groupby.
    total_late_payment = df[pay_cols].sum(axis=1, min_count=1)

    # Average (not sum) of the default flag for each distinct total.
    avg_default = df[TARGET_COL].groupby(total_late_payment.to_numpy()).mean()

    fig, ax = plt.subplots()
    ax.set_title('The Effect of Late Payment to Default Payment')
    ax.plot(avg_default.index, avg_default.to_numpy())

    ax.set_xlabel('total late payment (month)')
    ax.set_ylabel('average default payment')

    _show(fig)


def vis_5(df: pd.DataFrame) -> None:
    """Pie chart: share of total defaults per ``marital_status`` group."""
    totals = df.groupby('marital_status')[TARGET_COL].sum()
    _pie_of_defaults(
        totals,
        'Contribution to Default Payment by Marital Status ',
        ['others', 'married', 'single', 'divorced'],
    )