ardifarizky committed on
Commit
f7e19bd
1 Parent(s): 4c37034

Update eda.py

Browse files
Files changed (1) hide show
  1. eda.py +93 -1
eda.py CHANGED
@@ -28,6 +28,77 @@ def run():
28
  # st.subheader('Heart Failure Prediction Exploratory Data Analysis')
29
  # #Show Dataframe
30
  d = pd.read_csv('hotel_bookings.csv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
32
 
33
  sns.histplot(data=d, x='lead_time', hue='hotel', multiple='stack', bins=20, ax=ax[0, 0], palette='Set1')
@@ -45,7 +116,28 @@ def run():
45
 
46
  plt.tight_layout()
47
 
48
- st.pyplot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # st.write('#### scatterplot berdasarkan Input User')
50
  # pilihan1 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=1)
51
  # pilihan2 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=2)
 
28
  # st.subheader('Heart Failure Prediction Exploratory Data Analysis')
29
  # #Show Dataframe
30
  d = pd.read_csv('hotel_bookings.csv')
31
+ corr = d.corr()
32
def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient between two samples.

    Args:
        x: One-dimensional sequence of numeric observations.
        y: One-dimensional sequence of numeric observations, same length
            as ``x``.

    Returns:
        The correlation coefficient ``r`` in ``[-1, 1]``, or NaN when
        either input has zero variance (the coefficient is undefined).
    """
    # np.cov uses the sample estimator (ddof=1) by default, whereas np.std
    # defaults to the population estimator (ddof=0). Both must use the same
    # normalisation, otherwise r is inflated by a factor of n / (n - 1).
    covariance = np.cov(x, y)[0, 1]
    x_std = np.std(x, ddof=1)
    y_std = np.std(y, ddof=1)

    denominator = x_std * y_std
    if denominator == 0:
        # A constant input has no spread, so correlation is undefined.
        return np.nan

    return covariance / denominator
49
+
50
+ mask = np.zeros_like(corr)
51
+ mask[np.triu_indices_from(mask)] = True
52
+
53
+ sns.set(style='white')
54
+ fig, ax = plt.subplots(figsize=(12, 9))
55
+ cmap = sns.diverging_palette(220, 10, as_cmap=True)
56
+
57
+ sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
58
+ square=True, linewidths=.5, cbar_kws={"shrink": .5})
59
+
60
+ plt.title('Data Correlation')
61
+ st.pyplot(fig)
62
+
63
+ fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
64
+
65
+ sns.histplot(data=d, x='lead_time', hue='is_canceled',
66
+ kde=True, ax=ax[0][0], palette='Set1').set_title("distribution of Lead Time")
67
+
68
+ sns.histplot(data=d, x='booking_changes', hue='is_canceled',
69
+ ax=ax[0][1], palette='Set1').set_title("distribution of Booking Changes")
70
+
71
+ sns.histplot(data=d, x='deposit_type', hue='is_canceled',
72
+ ax=ax[1][0], palette='Set1').set_title("distribution of Deposit Type")
73
+
74
+ plt.tight_layout()
75
+ st.pyplot(fig)
76
+
77
+ booking_counts = d.groupby(['arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'hotel']).size().reset_index(name='booking_count')
78
+
79
+ pivot_table = booking_counts.pivot_table(index=['arrival_date_month', 'arrival_date_week_number'], columns=['arrival_date_year', 'hotel'], values='booking_count', fill_value=0)
80
+
81
+ plt.figure(figsize=(12, 10))
82
+ pivot_table.plot(kind='line')
83
+ plt.title('Seasonal Booking Trends')
84
+ plt.xlabel('Month and Week Number')
85
+ plt.ylabel('Booking Count')
86
+ plt.legend(title='Hotel Type')
87
+ plt.xticks(rotation=45)
88
+ plt.tight_layout()
89
+ st.pyplot(fig)
90
+
91
+ demographics_counts = d[['babies', 'adults', 'children']].sum()
92
+
93
+ # creating the pie chart
94
+ plt.figure(figsize=(8, 8))
95
+ plt.pie(demographics_counts, labels=demographics_counts.index, autopct='%1.1f%%', startangle=140)
96
+ plt.title('Distribution of Guest Demographics')
97
+ plt.axis('equal')
98
+
99
+ st.pyplot(fig)
100
+
101
+
102
  fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
103
 
104
  sns.histplot(data=d, x='lead_time', hue='hotel', multiple='stack', bins=20, ax=ax[0, 0], palette='Set1')
 
116
 
117
  plt.tight_layout()
118
 
119
+ st.pyplot(fig)
120
+
121
+ plt.figure(figsize=(12, 6))
122
+ sns.countplot(data=d, x='market_segment', palette='Set3')
123
+ plt.title('Distribution of Market Segmentation')
124
+ plt.xlabel('Market Segment')
125
+ plt.ylabel('Count')
126
+ plt.xticks(rotation=45, ha='right')
127
+ plt.tight_layout()
128
+
129
+ plt.show()
130
+
131
+ # create a count plot for distribution channels
132
+ plt.figure(figsize=(10, 6))
133
+ sns.countplot(data=d, x='distribution_channel', palette='Set2')
134
+ plt.title('Distribution of Distribution Channels')
135
+ plt.xlabel('Distribution Channel')
136
+ plt.ylabel('Count')
137
+ plt.tight_layout()
138
+
139
+ st.pyplot(fig)
140
+
141
  # st.write('#### scatterplot berdasarkan Input User')
142
  # pilihan1 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=1)
143
  # pilihan2 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=2)