weewoo2636's picture
Upload 6 files
45c8de4 verified
raw
history blame contribute delete
No virus
6.14 kB
# import libraries
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def app():
    """Render the Exploratory Data Analysis page.

    Reads ``eda_data.csv``, shows the loaded table, then writes each
    analysis question followed by the chart produced by its ``vis_*``
    function.
    """
    st.title('Exploratory Data Analysis')
    dataset = pd.read_csv('eda_data.csv')

    st.subheader('Dataset Preview')
    st.write(dataset)

    st.subheader('Data Analysis Questions')
    # each question is paired with the function that draws its chart
    sections = (
        ('How is the percentage of default payment as education level increases?', vis_1),
        ('How is the contribution of each gender to default payment?', vis_2),
        ('Which one got more into default payment, customers with limit balance above or below average?', vis_3),
        ('How does the average of default payment changes as the total late payment rises?', vis_4),
        ('How is the contribution of each marital status to default payment?', vis_5),
    )
    for question, draw_chart in sections:
        st.write(question)
        draw_chart(dataset)
def vis_1(df):
    """Draw a bar chart of the default payment percentage per education level.

    For every ``education_level`` group the percentage is
    ``sum(default_payment_next_month) / count * 100`` rounded to two
    decimals. The chart is handed to Streamlit and the current pyplot
    figure is cleared afterwards.

    Args:
        df: DataFrame with ``education_level`` and
            ``default_payment_next_month`` columns.
    """
    # default payment sum and row count of each education level, in one pass
    per_level = df.groupby('education_level')['default_payment_next_month'].agg(['sum', 'count'])

    # default payment percentage of each education level
    percentage = (per_level['sum'] / per_level['count'] * 100).round(2)
    chart_data = percentage.rename('default_payment_percentage').reset_index()

    ax = plt.gca()
    ax.set_title('Default Payment Percentage for each Education Level')
    ax.bar(x='education_level', height='default_payment_percentage', data=chart_data)

    # percentages live on a fixed 0-100 scale
    ax.set_ylim([0, 100])
    ax.set_xlabel('education level')
    ax.set_ylabel('default payment percentage')

    # NOTE(review): ticks and names are hard-coded; this assumes the
    # education levels present are exactly 1, 2, 3 and 4 - confirm against the dataset
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(['graduate_school', 'university', 'highschool', 'others'])

    # write the value slightly above the top of every bar
    for bar in ax.patches:
        value = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            1.05 * value,
            f'{value}%',
            ha='center',
            va='bottom',
        )

    # show the plot, then reset the shared figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
def vis_2(df):
    """Draw a pie chart of summed default payments per ``sex`` group.

    The chart is handed to Streamlit and the current pyplot figure is
    cleared afterwards.

    Args:
        df: DataFrame with ``sex`` and ``default_payment_next_month`` columns.
    """
    # total default payments of each sex group
    by_sex = df.groupby('sex')['default_payment_next_month'].sum().reset_index()

    ax = plt.gca()
    ax.set_title('Gender Contribution to Default Payment')
    # NOTE(review): slice labels are positional; this assumes exactly two
    # sex groups with the male code sorting first - confirm against the dataset
    ax.pie(
        x='default_payment_next_month',
        labels=['male', 'female'],
        autopct='%1.1f%%',
        data=by_sex,
    )

    # show the plot, then reset the shared figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
def vis_3(df):
    """Draw a horizontal bar chart of default payments by limit balance group.

    Each row is labelled ``below_average`` when its ``limit_balance`` is
    strictly less than the column mean and ``above_average`` otherwise;
    ``default_payment_next_month`` is then summed per label. The chart is
    handed to Streamlit and the current pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with ``limit_balance`` and
            ``default_payment_next_month`` columns.
    """
    labelled = df.copy()

    # label every row relative to the average limit balance
    mean_limit = labelled['limit_balance'].mean()
    labelled['limit_group'] = np.where(
        labelled['limit_balance'] < mean_limit,
        'below_average',
        'above_average',
    )

    # total default payments of each limit group
    totals = labelled.groupby('limit_group')['default_payment_next_month'].sum().reset_index()

    ax = plt.gca()
    ax.set_title('Default Payment Amount Categorized by Limit Group')
    bars = ax.barh(
        y='limit_group',
        width='default_payment_next_month',
        label='default_payment_next_month',
        data=totals,
    )

    ax.set_xlabel('default payment amount')
    ax.set_ylabel('limit balance group')
    # NOTE(review): the x axis is pinned to 0-550; totals above that would
    # be cut off - confirm the range fits the dataset
    ax.set_xlim([0, 550])

    # print each total next to its bar
    ax.bar_label(bars)

    # show the plot, then reset the shared figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
def vis_4(df):
    """Draw a line chart of the mean default payment per total late payment.

    The ``pay_1`` .. ``pay_6`` columns are summed row-wise into a
    ``total_late_payment`` value, rows are grouped by that total, and the
    mean of ``default_payment_next_month`` is plotted for every total. The
    chart is handed to Streamlit and the current pyplot figure is cleared
    afterwards.

    Args:
        df: DataFrame with ``pay_1`` .. ``pay_6`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    pay_columns = [f'pay_{i}' for i in range(1, 7)]

    # Row-wise numeric sum. The previous accumulation started from a
    # dtype-less empty pd.Series(), which is deprecated in pandas 1.x and
    # object dtype in pandas 2.x. min_count=1 keeps the earlier
    # fill_value=0 behaviour: missing values count as 0, and a row stays
    # NaN only when all six columns are missing.
    # NOTE(review): presumably each pay_i holds months of payment delay -
    # confirm against the dataset description
    total_late_payment = df[pay_columns].sum(axis=1, min_count=1)
    df_eda_4 = df.assign(total_late_payment=total_late_payment)

    # group by total late payment and average the default payment flag
    df_eda_4_grouped = (
        df_eda_4.groupby(by='total_late_payment')['default_payment_next_month']
        .mean()
        .reset_index()
    )

    # set plot's title
    plt.title('The Effect of Late Payment to Default Payment')
    # define plot
    plt.plot(df_eda_4_grouped['total_late_payment'], df_eda_4_grouped['default_payment_next_month'])
    # add axis label
    ax = plt.gca()
    ax.set_xlabel('total late payment (month)')
    ax.set_ylabel('average default payment')

    # show the plot, then reset the shared figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
def vis_5(df):
    """Draw a pie chart of summed default payments per ``marital_status`` group.

    The chart is handed to Streamlit and the current pyplot figure is
    cleared afterwards.

    Args:
        df: DataFrame with ``marital_status`` and
            ``default_payment_next_month`` columns.
    """
    # total default payments of each marital status group
    by_status = df.groupby('marital_status')['default_payment_next_month'].sum().reset_index()

    ax = plt.gca()
    ax.set_title('Contribution to Default Payment by Marital Status ')
    # NOTE(review): slice labels are positional; this assumes exactly four
    # marital status groups sorting as others, married, single, divorced -
    # confirm against the dataset
    ax.pie(
        x='default_payment_next_month',
        labels=['others', 'married', 'single', 'divorced'],
        autopct='%1.1f%%',
        data=by_status,
    )

    # show the plot, then reset the shared figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()