milestone2 / eda.py
ardifarizky's picture
Update eda.py
f7e19bd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from PIL import Image
# Page-level Streamlit settings: page title, full-width layout and a sidebar
# that starts expanded.
# NOTE(review): the title 'FIFA 2022' does not seem to match the hotel-bookings
# data this page loads -- confirm the intended title.
st.set_page_config(page_title='FIFA 2022', layout='wide', initial_sidebar_state='expanded')

# CSS rules that hide the elements matched by `#MainMenu` and `footer`.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""

# unsafe_allow_html=True lets the <style> tag through as raw HTML.
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def _render(fig):
    """Draw *fig* on the Streamlit page, then close it.

    Closing matters because Streamlit re-executes the script on every
    interaction; figures that are never closed pile up in pyplot's registry.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_correlation(bookings):
    """Return a heatmap figure of the pairwise correlation of numeric columns."""
    # Restrict to numeric/bool columns before correlating: on pandas >= 2.0
    # DataFrame.corr() raises if the frame contains non-numeric columns, while
    # older versions silently dropped them. Selecting explicitly behaves the
    # same on both.
    corr = bookings.select_dtypes(include=['number', 'bool']).corr()

    # Mask the upper triangle (diagonal included); it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    sns.set(style='white')
    fig, ax = plt.subplots(figsize=(12, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        vmax=1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=ax,
    )
    ax.set_title('Data Correlation')
    return fig


def _plot_cancellation_distributions(bookings):
    """Return histograms of three booking columns, coloured by 'is_canceled'."""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
    sns.histplot(
        data=bookings, x='lead_time', hue='is_canceled',
        kde=True, ax=axes[0][0], palette='Set1',
    ).set_title("distribution of Lead Time")
    sns.histplot(
        data=bookings, x='booking_changes', hue='is_canceled',
        ax=axes[0][1], palette='Set1',
    ).set_title("distribution of Booking Changes")
    sns.histplot(
        data=bookings, x='deposit_type', hue='is_canceled',
        ax=axes[1][0], palette='Set1',
    ).set_title("distribution of Deposit Type")
    # Only three panels are populated; hide the fourth instead of showing an
    # empty frame.
    axes[1][1].set_visible(False)
    fig.tight_layout()
    return fig


def _plot_seasonal_trends(bookings):
    """Return a line chart of booking counts per month/week, by year and hotel."""
    booking_counts = (
        bookings
        .groupby(['arrival_date_year', 'arrival_date_month',
                  'arrival_date_week_number', 'hotel'])
        .size()
        .reset_index(name='booking_count')
    )
    # NOTE(review): if 'arrival_date_month' holds month names, the pivot index
    # is sorted alphabetically rather than in calendar order -- confirm against
    # the data and reorder if a calendar-ordered x axis is wanted.
    pivot_table = booking_counts.pivot_table(
        index=['arrival_date_month', 'arrival_date_week_number'],
        columns=['arrival_date_year', 'hotel'],
        values='booking_count',
        fill_value=0,
    )

    # Pass the axes explicitly: DataFrame.plot() opens its own figure when no
    # `ax` is given, which would leave this one blank.
    fig, ax = plt.subplots(figsize=(12, 10))
    pivot_table.plot(kind='line', ax=ax)
    ax.set_title('Seasonal Booking Trends')
    ax.set_xlabel('Month and Week Number')
    ax.set_ylabel('Booking Count')
    ax.legend(title='Hotel Type')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    return fig


def _plot_guest_demographics(bookings):
    """Return a pie chart of the summed 'babies', 'adults' and 'children' columns."""
    demographics_counts = bookings[['babies', 'adults', 'children']].sum()
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(
        demographics_counts,
        labels=demographics_counts.index,
        autopct='%1.1f%%',
        startangle=140,
    )
    ax.set_title('Distribution of Guest Demographics')
    ax.axis('equal')  # equal aspect ratio keeps the pie circular
    return fig


def _plot_hotel_comparison(bookings):
    """Return a 2x2 grid of charts that break the data down by 'hotel'."""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

    sns.histplot(data=bookings, x='lead_time', hue='hotel', multiple='stack',
                 bins=20, ax=axes[0, 0], palette='Set1')
    axes[0, 0].set_title("Booking Behavior by Hotel Type (Lead Time)")

    sns.barplot(data=bookings, x='hotel', y='is_canceled', ax=axes[0, 1], palette='Set1')
    axes[0, 1].set_title("Cancellation Rate by Hotel Type")

    sns.countplot(data=bookings, x='booking_changes', hue='hotel',
                  ax=axes[1, 0], palette='Set1')
    axes[1, 0].set_title("Booking Changes by Hotel Type")

    sns.countplot(data=bookings, x='hotel', ax=axes[1, 1], palette='Set1')
    axes[1, 1].set_title("Total Bookings by Hotel Type")

    fig.tight_layout()
    return fig


def _plot_market_segments(bookings):
    """Return a count plot of the 'market_segment' column."""
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=bookings, x='market_segment', palette='Set3', ax=ax)
    ax.set_title('Distribution of Market Segmentation')
    ax.set_xlabel('Market Segment')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    return fig


def _plot_distribution_channels(bookings):
    """Return a count plot of the 'distribution_channel' column."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=bookings, x='distribution_channel', palette='Set2', ax=ax)
    ax.set_title('Distribution of Distribution Channels')
    ax.set_xlabel('Distribution Channel')
    ax.set_ylabel('Count')
    fig.tight_layout()
    return fig


def run():
    """Render the exploratory-data-analysis page.

    Reads 'hotel_bookings.csv' from the current working directory and draws
    seven charts on the Streamlit page, in this order: correlation heatmap,
    cancellation histograms, seasonal booking trends, guest demographics,
    per-hotel comparison, market segments and distribution channels.

    Every chart is built on its own figure, and that same figure is handed to
    Streamlit, so each chart is rendered exactly once. Errors from
    ``pd.read_csv`` (e.g. a missing file) are not caught here.
    """
    # NOTE(review): this title does not seem to match the hotel-bookings data
    # loaded below -- confirm the intended page title.
    st.title('Heart Failure Prediction')

    d = pd.read_csv('hotel_bookings.csv')

    chart_builders = (
        _plot_correlation,
        _plot_cancellation_distributions,
        _plot_seasonal_trends,
        _plot_guest_demographics,
        _plot_hotel_comparison,
        _plot_market_segments,
        _plot_distribution_channels,
    )
    for build_chart in chart_builders:
        _render(build_chart(d))
# Render the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()