# milestone2 / eda.py
# ardifarizky's picture
# Update eda.py
# 44ff538
# raw
# history blame
# 3.46 kB
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# Page-level settings for this app: title 'EDA', wide layout, sidebar expanded.
st.set_page_config(page_title='EDA', layout='wide', initial_sidebar_state='expanded')
# CSS snippet that sets the `#MainMenu` element and the page `footer` to
# hidden. Built from adjacent literals; the resulting string is unchanged.
hide_streamlit_style = (
    "\n<style>\n"
    "#MainMenu {visibility: hidden;}\n"
    "footer {visibility: hidden;}\n"
    "</style>\n"
)
# The snippet is raw HTML, so it is injected with unsafe_allow_html=True.
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def _render(fig) -> None:
    """Draw *fig* on the Streamlit page, then close it.

    The figure is handed to ``st.pyplot`` explicitly instead of relying on
    pyplot's implicit "current figure", and is closed afterwards so figures
    do not accumulate in pyplot's registry every time the page runs.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_cancellation_histograms(df: pd.DataFrame) -> None:
    """Histograms of lead time, booking changes and deposit type, coloured by ``is_canceled``."""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
    sns.histplot(data=df, x='lead_time', hue='is_canceled',
                 kde=True, ax=axes[0, 0], palette='Set1')
    axes[0, 0].set_title("distribution of Lead Time")
    sns.histplot(data=df, x='booking_changes', hue='is_canceled',
                 ax=axes[0, 1], palette='Set1')
    axes[0, 1].set_title("distribution of Booking Changes")
    sns.histplot(data=df, x='deposit_type', hue='is_canceled',
                 ax=axes[1, 0], palette='Set1')
    axes[1, 0].set_title("distribution of Deposit Type")
    # Only three panels are used; hide the fourth so the page does not show
    # an empty framed plot.
    axes[1, 1].set_axis_off()
    fig.tight_layout()
    _render(fig)


def _plot_seasonal_trends(df: pd.DataFrame) -> None:
    """Line chart of booking counts per (month, week), one line per (year, hotel)."""
    group_keys = ['arrival_date_year', 'arrival_date_month',
                  'arrival_date_week_number', 'hotel']
    booking_counts = df.groupby(group_keys).size().reset_index(name='booking_count')
    pivot_table = booking_counts.pivot_table(
        index=['arrival_date_month', 'arrival_date_week_number'],
        columns=['arrival_date_year', 'hotel'],
        values='booking_count',
        fill_value=0,
    )
    # NOTE(review): the pivot index is sorted by pandas; if arrival_date_month
    # holds month *names*, the x-axis is alphabetical rather than calendar
    # order -- confirm against the CSV before reading trends off this chart.

    # DataFrame.plot() opens its own figure unless it is given an Axes, which
    # made the requested 12x10 size a no-op. Create the Axes first and pass it.
    fig, ax = plt.subplots(figsize=(12, 10))
    pivot_table.plot(kind='line', ax=ax)
    ax.set_title('Seasonal Booking Trends')
    ax.set_xlabel('Month and Week Number')
    ax.set_ylabel('Booking Count')
    ax.legend(title='Hotel Type')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    _render(fig)


def _plot_guest_demographics(df: pd.DataFrame) -> None:
    """Pie chart of the column totals of ``babies``, ``adults`` and ``children``."""
    demographics_counts = df[['babies', 'adults', 'children']].sum()
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(demographics_counts, labels=demographics_counts.index,
           autopct='%1.1f%%', startangle=140)
    ax.set_title('Distribution of Guest Demographics')
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    _render(fig)


def _plot_hotel_comparison(df: pd.DataFrame) -> None:
    """2x2 grid comparing lead time, cancellations, booking changes and volume by ``hotel``."""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    sns.histplot(data=df, x='lead_time', hue='hotel', multiple='stack',
                 bins=20, ax=axes[0, 0], palette='Set1')
    axes[0, 0].set_title("Booking Behavior by Hotel Type (Lead Time)")
    sns.barplot(data=df, x='hotel', y='is_canceled', ax=axes[0, 1], palette='Set1')
    axes[0, 1].set_title("Cancellation Rate by Hotel Type")
    sns.countplot(data=df, x='booking_changes', hue='hotel', ax=axes[1, 0], palette='Set1')
    axes[1, 0].set_title("Booking Changes by Hotel Type")
    sns.countplot(data=df, x='hotel', ax=axes[1, 1], palette='Set1')
    axes[1, 1].set_title("Total Bookings by Hotel Type")
    fig.tight_layout()
    _render(fig)


def _plot_market_segments(df: pd.DataFrame) -> None:
    """Count plot of ``market_segment``."""
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, x='market_segment', palette='Set3', ax=ax)
    ax.set_title('Distribution of Market Segmentation')
    ax.set_xlabel('Market Segment')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    # Previously this chart ended in plt.show(), so it was never sent to the
    # Streamlit page; render it like the other charts.
    _render(fig)


def _plot_distribution_channels(df: pd.DataFrame) -> None:
    """Count plot of ``distribution_channel``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='distribution_channel', palette='Set2', ax=ax)
    ax.set_title('Distribution of Distribution Channels')
    ax.set_xlabel('Distribution Channel')
    ax.set_ylabel('Count')
    fig.tight_layout()
    _render(fig)


def run() -> None:
    """Render the EDA page.

    Reads ``hotel_bookings.csv`` from the current working directory and draws
    six charts on the Streamlit page, in order: cancellation histograms,
    seasonal booking trends, guest demographics, per-hotel comparison, market
    segments and distribution channels.

    Raises:
        FileNotFoundError: if ``hotel_bookings.csv`` is not present.
        KeyError: if the CSV lacks one of the columns the charts use.
    """
    st.title('EDA')
    df = pd.read_csv('hotel_bookings.csv')

    _plot_cancellation_histograms(df)
    _plot_seasonal_trends(df)
    _plot_guest_demographics(df)
    _plot_hotel_comparison(df)
    _plot_market_segments(df)
    _plot_distribution_channels(df)


if __name__ == '__main__':
    run()