Spaces:

celineclarissa
/

GC5_Credit_Card_Data

Sleeping

App Files Files Community

GC5_Credit_Card_Data / eda.py

celineclarissa

Upload eda.py

8adcb68 verified about 1 year ago

raw

history blame contribute delete

10.6 kB

	# import libraries
	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt

	# set page title
	st.set_page_config(page_title='GC5')

	# make function to convert text to correlating digit
	def default_id(i):
	    """Map a payment-status label to its numeric code.

	    Returns 1 when ``i`` equals the string 'Default' and 0 for any other value.
	    """
	    return 1 if i == 'Default' else 0

	# EDA page: lookup tables, plotting helpers and the run() entry point

	# Display names for the coded categorical values shown in the pie charts.
	# NOTE(review): assumes 1=male/2=female and 1=married/2=single/3=others, i.e. the
	# order of the labels that used to be hard-coded -- confirm against the data dictionary.
	_SEX_LABELS = {1: 'Male', 2: 'Female'}
	_MARITAL_LABELS = {1: 'Married', 2: 'Single', 3: 'Others'}

	# the two target classes offered by every "Default" selectbox
	_DEFAULT_CHOICES = ['Non-Default', 'Default']


	def _age_group(age):
	    """Classify an age as 'Youth' (15-24), 'Adult' (25-64) or 'Senior' (anything else)."""
	    # half-open comparisons so that 24 and 64 land in the bucket they belong to
	    if 15 <= age < 25:
	        return 'Youth'
	    if 25 <= age < 65:
	        return 'Adult'
	    return 'Senior'


	def _render(fig):
	    """Send a matplotlib figure to the page, then close it so reruns do not pile up figures."""
	    st.pyplot(fig)
	    plt.close(fig)


	def _show_pie(counts, title=None, label_map=None):
	    """Draw the value counts in ``counts`` as a pie chart.

	    Slice labels are taken from ``counts.index`` (translated through ``label_map``
	    when given, falling back to the raw value), so each label always matches its
	    slice regardless of how the counts are ordered or which categories are present.
	    """
	    if label_map is None:
	        labels = list(counts.index)
	    else:
	        labels = [label_map.get(value, value) for value in counts.index]
	    fig, ax = plt.subplots(figsize=[15, 5])
	    ax.pie(counts, autopct='%1.1f%%', labels=labels, shadow=True)
	    if title is not None:
	        ax.set_title(title)
	    ax.axis('equal')
	    _render(fig)


	def run():
	    """Render the credit card EDA page.

	    Reads 'df_ori.csv' from the working directory, applies the cleaning steps,
	    shows the cleaned table and then seven EDA sections, several of which are
	    driven by a selectbox.
	    """
	    pay_columns = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
	    target = 'default_payment_next_month'

	    # page header: title, description and image
	    st.title('Credit Card Data EDA')
	    st.write('This page was made to predict whether the user will pay on time next month or fail to do so (default).')
	    st.image('https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg', caption='Credit Cards (https://i.pinimg.com/originals/88/38/57/8838578a62903ed1aa389d199d173317.jpg)')

	    # draw a horizontal rule
	    st.markdown('---')

	    # load the dataset
	    st.write('### Dataset')
	    df = pd.read_csv('df_ori.csv')

	    # data cleaning
	    # drop unused column
	    df = df.drop(columns='Unnamed: 0')
	    # replace "6" with "5" in the "education_level" column
	    df['education_level'] = df['education_level'].replace(6, 5)
	    # replace "0.0" with "-1.0" in the "pay_[i]" columns
	    df[pay_columns] = df[pay_columns].replace(0.0, -1.0)

	    # show cleaned dataset
	    st.dataframe(df)

	    # make border
	    st.write('')
	    st.markdown('---')
	    st.write('')

	    # EDA 1: share of default vs. non-default rows
	    st.write('### Default Payment Pie Chart')
	    default_counts = df[target].value_counts(dropna=False)
	    _show_pie(default_counts)
	    st.write('From the pie chart, it was found that *data is imbalanced*. Less than a quarter of recorded credit card users are default (failed to do minimum payment in period of time).')

	    # make border
	    st.markdown('---')

	    # EDA 2: payment-status values among default rows, for a chosen month
	    st.write('### Distribution of Unique Values in Pay_[i] for Default Payment Status')
	    # pay_[i] meaning
	    dict1_eda2 = {
	        'pay_0': 'Payment Status (Sept 2005)',
	        'pay_2': 'Payment Status (Aug 2005)',
	        'pay_3': 'Payment Status (Jul 2005)',
	        'pay_4': 'Payment Status (Jun 2005)',
	        'pay_5': 'Payment Status (May 2005)',
	        'pay_6': 'Payment Status (Apr 2005)',
	    }
	    # unique values meaning (keys are the stringified column values)
	    dict2_eda2 = {
	        '-2.0': 'No transaction',
	        '-1.0': 'Paid duly',
	        '1.0': 'Payment delay (1 mo)',
	        '2.0': 'Payment delay (2 mo)',
	        '3.0': 'Payment delay (3 mo)',
	        '4.0': 'Payment delay (4 mo)',
	        '5.0': 'Payment delay (5 mo)',
	        '6.0': 'Payment delay (6 mo)',
	        '7.0': 'Payment delay (7 mo)',
	        '8.0': 'Payment delay (8 mo)',
	        '9.0': 'Payment delay (9 mo)',
	    }
	    # user input
	    choice_eda2 = st.selectbox('Pilih Feature:', pay_columns, key='eda2_feature')
	    # work on a copy of just the two needed columns so df itself keeps numeric codes
	    eda2 = df[[choice_eda2, target]].copy()
	    eda2[choice_eda2] = eda2[choice_eda2].astype(str).replace(dict2_eda2)
	    plot_data = (
	        eda2.loc[eda2[target] == 1]
	        .groupby(choice_eda2)
	        .count()
	        .sort_values(target, ascending=False)
	        .reset_index()
	    )
	    fig2, ax2 = plt.subplots(figsize=[15, 5])
	    plot_data.plot(kind='bar', x=choice_eda2, y=target, xlabel='', ylabel='Count', legend=False, title=f'{dict1_eda2[choice_eda2]} Distribution', ax=ax2)
	    _render(fig2)
	    st.write('From the plots, it can be understood that most customers from April-September 2005 whose payment status is default *mostly have paid duly*. ')

	    # make border
	    st.markdown('---')

	    # EDA 3: age groups per payment status
	    st.write('### Age Group Distribution for Default and Non-Default Payment Status')
	    df['age_group'] = df['age'].map(_age_group)
	    # every selectbox gets an explicit key: Streamlit rejects identical widgets without one
	    choice_eda3 = st.selectbox('Default:', _DEFAULT_CHOICES, key='eda3_default')
	    agegroup_counts = df[df[target] == default_id(choice_eda3)]['age_group'].value_counts(dropna=False)
	    _show_pie(agegroup_counts, title='Age Distribution Pie Chart')
	    st.write('From the pie charts, it can be understood that the age distribution for customers whose payment status is default and non-default are quite similar with the most being adults, followed by youth, and seniors. So, it can be concluded that *there are no significant difference in age range between customers whose payment status is default and non-default*.')

	    # make border
	    st.markdown('---')

	    # EDA 4: limit balance per payment status
	    st.write('### Limit Balance Distribution for Default and Non-Default Payment Status')
	    choice_eda4 = st.selectbox('Default: ', _DEFAULT_CHOICES, key='eda4_default')
	    status_eda4 = default_id(choice_eda4)
	    limit = df[df[target] == status_eda4]['limit_balance']
	    dict_eda4 = {
	        0: 'Limit Balance Distribution for Non-Default Payment Status',
	        1: 'Limit Balance Distribution for Default Payment Status',
	    }
	    fig4, ax4 = plt.subplots(figsize=[15, 5])
	    ax4.boxplot(limit)
	    ax4.set_title(dict_eda4[status_eda4])
	    # show box plot
	    _render(fig4)
	    st.write('From the box plots, it can be understood that *customers with payment status of non-default have wider range of limit balance* (less than 100,000 to around 800,000). Meanwhile, customers with payment status default have limit balance range of less than 100,000 to around 500,000.')

	    # make border
	    st.markdown('---')

	    # EDA 5: sex per payment status
	    st.write('### Sex Distribution for Default and Non-Default Payment Status')
	    choice_eda5 = st.selectbox('Default: ', _DEFAULT_CHOICES, key='eda5_default')
	    sex_counts = df[df[target] == default_id(choice_eda5)]['sex'].value_counts(dropna=False)
	    _show_pie(sex_counts, title='Sex Distribution Pie Chart', label_map=_SEX_LABELS)
	    # NOTE(review): this insight was written when slice labels were assigned by position;
	    # re-check it against the chart now that labels follow the data.
	    st.write('From the pie charts, it can be understood that the sex distribution for customers whose payment status is default and non-default are quite similar with the most being males. So, it can be concluded that *there are no significant difference in sex distribution between customers whose payment status is default and non-default*.')

	    # make border
	    st.markdown('---')

	    # EDA 6: mean bill statement per month for the chosen payment status
	    st.write('### Average Bill Statement in Each Month for Default and Non-Default Payment Status')
	    choice_eda6 = st.selectbox('Default: ', _DEFAULT_CHOICES, key='eda6_default')
	    # bill_amt_6 is the oldest month (Apr) and bill_amt_1 the newest (Sept)
	    bill_columns = ['bill_amt_6', 'bill_amt_5', 'bill_amt_4', 'bill_amt_3', 'bill_amt_2', 'bill_amt_1']
	    bill_months = ['Apr 2005', 'May 2005', 'Jun 2005', 'Jul 2005', 'Aug 2005', 'Sept 2005']
	    eda6 = df.loc[df[target] == default_id(choice_eda6), bill_columns]
	    plot_data6 = pd.DataFrame({
	        'Bill Statement': bill_months,
	        'Mean': [eda6[column].mean() for column in bill_columns],
	    })
	    fig6, ax6 = plt.subplots(figsize=[15, 5])
	    ax6.plot(plot_data6['Bill Statement'], plot_data6['Mean'])
	    _render(fig6)
	    # insight for the bill statement line chart (previously shown under the marital status section)
	    st.write('From the line charts, it can be understood that the average bill statement in each month for customers whose payment status is default and non-default are quite similar. In 5 months, the average bill statement gradually rose. Both default and non-default customers have highest average bill statement in September 2005. So, it can be concluded that *there are no significant difference in average bill statement between customers whose payment status is default and non-default. Other than that, it was found that as time goes by, the average bill statement tends to increase*.')

	    # make border
	    st.markdown('---')

	    # EDA 7: marital status per payment status
	    st.write('### Marital Status Distribution for Default and Non-Default Payment Status')
	    choice_eda7 = st.selectbox('Default: ', _DEFAULT_CHOICES, key='eda7_default')
	    # rows with marital_status 0 are left out of this chart
	    eda7 = df[df.marital_status != 0]
	    marital_counts = eda7[eda7[target] == default_id(choice_eda7)]['marital_status'].value_counts(dropna=False)
	    _show_pie(marital_counts, title='Marital Status Distribution Pie Chart', label_map=_MARITAL_LABELS)
	    # insight for the marital status pie chart (previously shown under the bill statement section)
	    # NOTE(review): written when slice labels were assigned by position; re-check against the chart.
	    st.write('From the pie charts, it can be understood that the marital status distribution for customers whose payment status is default and non-default are quite similar with the most being married, followed by single, and unknown. So, it can be concluded that *there are no significant difference in marital status distribution between customers whose payment status is default and non-default*.')

	# execute file
	if __name__ == '__main__':
	    # build the page when this file is executed as a script
	    run()