Spaces:

ardifarizky
/

milestone2

Runtime error

App Files Files Community

ardifarizky committed on Aug 17, 2023

Commit

f7e19bd

•

1 Parent(s): 4c37034

Update eda.py

Browse files

Files changed (1) hide show

eda.py +93 -1

eda.py CHANGED Viewed

@@ -28,6 +28,77 @@ def run():
     # st.subheader('Heart Failure Prediction Exploratory Data Analysis')
     # #Show Dataframe
     d = pd.read_csv('hotel_bookings.csv')
     fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
     sns.histplot(data=d, x='lead_time', hue='hotel', multiple='stack', bins=20, ax=ax[0, 0], palette='Set1')
@@ -45,7 +116,28 @@ def run():
     plt.tight_layout()
-    st.pyplot()
     # st.write('#### scatterplot berdasarkan Input User')
     # pilihan1 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=1)
     # pilihan2 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=2)

     # st.subheader('Heart Failure Prediction Exploratory Data Analysis')
     # #Show Dataframe
     d = pd.read_csv('hotel_bookings.csv')
+    corr = d.corr()
+    def pearson_correlation(x, y):  # returns covariance(x, y) / (std(x) * std(y)) for two array-like inputs
+      # find the mean of each array (NOTE(review): x_mean and y_mean are never used in the formula below)
+      x_mean = np.mean(x)
+      y_mean = np.mean(y)
+      # find the covariance of the two arrays: entry [0, 1] of the matrix returned by np.cov
+      covariance = np.cov(x, y)[0, 1]
+      # find the standard deviation of each array (NOTE(review): np.std defaults to ddof=0 while np.cov normalizes by N-1 by default, so r would be scaled by N/(N-1) - confirm and align ddof)
+      x_std = np.std(x)
+      y_std = np.std(y)
+      # calculate the Pearson correlation coefficient: covariance divided by the product of the standard deviations
+      r = covariance / (x_std * y_std)
+      return r
+    mask = np.zeros_like(corr)
+    mask[np.triu_indices_from(mask)] = True
+    sns.set(style='white')
+    fig, ax = plt.subplots(figsize=(12, 9))
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
+                square=True, linewidths=.5, cbar_kws={"shrink": .5})
+    plt.title('Data Correlation')
+    st.pyplot(fig)
+    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
+    sns.histplot(data=d, x='lead_time', hue='is_canceled',
+                 kde=True, ax=ax[0][0], palette='Set1').set_title("distribution of Lead Time")
+    sns.histplot(data=d, x='booking_changes', hue='is_canceled',
+                 ax=ax[0][1], palette='Set1').set_title("distribution of Booking Changes")
+    sns.histplot(data=d, x='deposit_type', hue='is_canceled',
+                 ax=ax[1][0], palette='Set1').set_title("distribution of Deposit Type")
+    plt.tight_layout()
+    st.pyplot(fig)
+    booking_counts = d.groupby(['arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'hotel']).size().reset_index(name='booking_count')
+    pivot_table = booking_counts.pivot_table(index=['arrival_date_month', 'arrival_date_week_number'], columns=['arrival_date_year', 'hotel'], values='booking_count', fill_value=0)
+    plt.figure(figsize=(12, 10))
+    pivot_table.plot(kind='line')
+    plt.title('Seasonal Booking Trends')
+    plt.xlabel('Month and Week Number')
+    plt.ylabel('Booking Count')
+    plt.legend(title='Hotel Type')
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    st.pyplot(fig)
+    demographics_counts = d[['babies', 'adults', 'children']].sum()
+    # creating the pie chart
+    plt.figure(figsize=(8, 8))
+    plt.pie(demographics_counts, labels=demographics_counts.index, autopct='%1.1f%%', startangle=140)
+    plt.title('Distribution of Guest Demographics')
+    plt.axis('equal')
+    st.pyplot(fig)
     fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
     sns.histplot(data=d, x='lead_time', hue='hotel', multiple='stack', bins=20, ax=ax[0, 0], palette='Set1')
     plt.tight_layout()
+    st.pyplot(fig)
+    plt.figure(figsize=(12, 6))
+    sns.countplot(data=d, x='market_segment', palette='Set3')
+    plt.title('Distribution of Market Segmentation')
+    plt.xlabel('Market Segment')
+    plt.ylabel('Count')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.show()
+    # create a count plot for distribution channels
+    plt.figure(figsize=(10, 6))
+    sns.countplot(data=d, x='distribution_channel', palette='Set2')
+    plt.title('Distribution of Distribution Channels')
+    plt.xlabel('Distribution Channel')
+    plt.ylabel('Count')
+    plt.tight_layout()
+    st.pyplot(fig)
     # st.write('#### scatterplot berdasarkan Input User')
     # pilihan1 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=1)
     # pilihan2 = st.selectbox('Pilih column : ', ('age', 'creatinine_phosphokinase','ejection_fraction', 'platelets','serum_creatinine', 'serum_sodium', 'time'),key=2)