weewoo2636 committed on
Commit
30452f4
1 Parent(s): 816ff37

Upload 6 files

Browse files
Files changed (2) hide show
  1. eda.py +182 -3
  2. eda_data.csv +0 -0
eda.py CHANGED
@@ -7,6 +7,185 @@ import matplotlib.pyplot as plt
7
  def app():
8
  st.title('Exploratory Data Analysis')
9
 
10
- df = pd.read_csv('P1G5_Set_1_wilson.csv')
11
- st.write('dataset overview')
12
- st.write(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def app():
    """Render the exploratory-data-analysis page.

    Shows the page title and a preview of the dataset read from
    ``deployment/eda_data.csv`` (a path relative to the working directory),
    then writes each analysis question followed by the chart that answers it.
    """
    st.title('Exploratory Data Analysis')

    eda_frame = pd.read_csv('deployment/eda_data.csv')
    st.write('Dataset Preview')
    st.write(eda_frame)

    # every question is paired with the function that draws its chart;
    # the sections are rendered top to bottom in this order
    sections = (
        ('How is the percentage of default payment as education level increases?', vis_1),
        ('How is the contribution of each gender to default payment?', vis_2),
        ('Which one got more into default payment, customers with limit balance above or below average?', vis_3),
        ('How does the average of default payment changes as the total late payment rises?', vis_4),
        ('How is the contribution of each marital status to default payment?', vis_5),
    )
    for question, render_chart in sections:
        st.write(question)
        render_chart(eda_frame)


def vis_1(df):
    """Draw a bar chart of the default-payment percentage per education level.

    For every ``education_level`` group the percentage is the sum of
    ``default_payment_next_month`` divided by the number of non-null values
    in that group, multiplied by 100 and rounded to two decimals. The chart
    is rendered with ``st.pyplot`` and the current figure is cleared
    afterwards.

    Args:
        df: DataFrame with ``education_level`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    defaults = df.groupby('education_level')['default_payment_next_month']

    # sum / count for every level at once, instead of walking the grouped
    # frames by row position
    percentage_by_level = (defaults.sum() / defaults.count() * 100).round(2)

    # set plot's title
    plt.title('Default Payment Percentage for each Education Level')

    # define plot: one bar per education level
    bars = plt.bar(percentage_by_level.index, percentage_by_level.to_numpy())

    # percentages live on a fixed 0-100 scale
    ax = plt.gca()
    ax.set_ylim([0, 100])

    # add axis label
    ax.set_xlabel('education level')
    ax.set_ylabel('default payment percentage')

    # NOTE(review): ticks and labels are fixed, which assumes education_level
    # is coded 1-4 in exactly this order; confirm against the dataset
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(['graduate_school', 'university', 'highschool', 'others'])

    # write the percentage slightly above each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            1.05 * height,
            f'{height}%',
            ha='center',
            va='bottom',
        )

    # show plot, then clear the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()


def vis_2(df):
    """Draw a pie chart of the default-payment total contributed by each sex.

    Sums ``default_payment_next_month`` per ``sex`` group, renders the pie
    with ``st.pyplot`` and clears the current figure afterwards.

    Args:
        df: DataFrame with ``sex`` and ``default_payment_next_month``
            columns. It is not modified.
    """
    # NOTE(review): labels are applied in the order groupby returns the
    # groups (sorted by key); this assumes exactly two groups with the
    # male code sorting first; confirm against the dataset
    slice_labels = ['male', 'female']

    defaults_by_sex = df.groupby('sex', as_index=False)['default_payment_next_month'].sum()

    plt.title('Gender Contribution to Default Payment')
    plt.pie(
        defaults_by_sex['default_payment_next_month'],
        labels=slice_labels,
        autopct='%1.1f%%',
    )

    # hand the figure to Streamlit, then clear it for the next chart
    figure = plt.gcf()
    st.pyplot(figure)
    plt.clf()


def vis_3(df):
    """Draw a horizontal bar chart of default payments per limit-balance group.

    Every row is labelled ``'below_average'`` when its ``limit_balance`` is
    strictly below the mean of that column and ``'above_average'`` otherwise.
    ``default_payment_next_month`` is then summed per label. The chart is
    rendered with ``st.pyplot`` and the current figure is cleared afterwards.

    Args:
        df: DataFrame with ``limit_balance`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    # get the average limit_balance
    average_limit = df['limit_balance'].mean()

    # label every row in one vectorised step; values that do not compare as
    # lower than the average (including NaN) fall into 'above_average'
    limit_group = (
        df['limit_balance']
        .lt(average_limit)
        .map({True: 'below_average', False: 'above_average'})
        .rename('limit_group')
    )

    # sum the default payments of each limit group
    defaults_by_group = (
        df['default_payment_next_month'].groupby(limit_group).sum().reset_index()
    )

    # set plot's title
    plt.title('Default Payment Amount Categorized by Limit Group')

    # define plot
    bars = plt.barh(
        defaults_by_group['limit_group'],
        defaults_by_group['default_payment_next_month'],
    )

    # add axis label
    ax = plt.gca()
    ax.set_xlabel('default payment amount')
    ax.set_ylabel('limit balance group')

    # keep the established 0-550 range, but widen it when the data outgrows
    # it so the longest bar and its label are not clipped
    x_limit = 550
    longest_bar = defaults_by_group['default_payment_next_month'].max()
    if longest_bar * 1.1 > x_limit:
        x_limit = longest_bar * 1.1
    ax.set_xlim([0, x_limit])

    # print each bar's value at its end
    ax.bar_label(bars)

    # show plot, then clear the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()


def vis_4(df):
    """Draw a line chart of the mean default payment per total late payment.

    ``total late payment`` is the row-wise sum of the columns ``pay_1`` to
    ``pay_6``. Rows are grouped by that total and the mean of
    ``default_payment_next_month`` is plotted for every total. The chart is
    rendered with ``st.pyplot`` and the current figure is cleared afterwards.

    Args:
        df: DataFrame with ``pay_1`` ... ``pay_6`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    pay_columns = [f'pay_{n}' for n in range(1, 7)]

    # row-wise total in a single numeric step; min_count=1 leaves a row with
    # no pay value at all as NaN, so it is dropped from the grouping instead
    # of being counted as a total of 0
    total_late_payment = (
        df[pay_columns].sum(axis=1, min_count=1).rename('total_late_payment')
    )

    # average default payment for every distinct total late payment
    average_default = (
        df['default_payment_next_month'].groupby(total_late_payment).mean()
    )

    # set plot's title
    plt.title('The Effect of Late Payment to Default Payment')

    # define plot
    plt.plot(average_default.index, average_default.to_numpy())

    # add axis label
    ax = plt.gca()
    ax.set_xlabel('total late payment (month)')
    ax.set_ylabel('average default payment')

    # show plot, then clear the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()


def vis_5(df):
    """Draw a pie chart of the default-payment total per marital status.

    Sums ``default_payment_next_month`` per ``marital_status`` group, renders
    the pie with ``st.pyplot`` and clears the current figure afterwards.

    Args:
        df: DataFrame with ``marital_status`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    # NOTE(review): labels follow the order groupby returns the groups
    # (sorted by key); this assumes exactly four marital_status codes in
    # this order; confirm against the dataset
    status_labels = ['others', 'married', 'single', 'divorced']

    totals = df.groupby('marital_status', as_index=False)['default_payment_next_month'].sum()
    default_totals = totals['default_payment_next_month']

    plt.title('Contribution to Default Payment by Marital Status ')
    plt.pie(default_totals, labels=status_labels, autopct='%1.1f%%')

    # hand the figure to Streamlit, then clear it for the next chart
    current_figure = plt.gcf()
    st.pyplot(current_figure)
    plt.clf()
eda_data.csv ADDED
The diff for this file is too large to render. See raw diff