# project/app.py — commit b965b34 ("Add application file"), mayankraghav, 6.19 kB
# app.py
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from pandas.tseries.offsets import MonthEnd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
# Load data
# NOTE: the pickled forecast model ('./revenue_forcast.pkl') is not loaded here;
# nothing in the visible code references it, and the Revenue Forecasting page
# fits its own SARIMA model from the uploaded CSV.
DATASET_PATH = './Dataset/hotel_booking.csv'
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    # Without this fallback `df` would be undefined and the whole script would
    # die with a NameError before any page renders. An empty frame keeps the
    # pages that do not need the bundled dataset (e.g. the upload-driven
    # forecasting page) usable.
    st.sidebar.warning(
        f'Dataset not found at {DATASET_PATH}; pages that rely on it are unavailable.'
    )
    df = pd.DataFrame()

# Preprocess data for Streamlit: remember which columns are numeric and which
# are text, and store the text columns as pandas categoricals.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns
df = df.astype({col: 'category' for col in categorical_cols})
# Main page heading shown above every page.
st.title('Hotel Booking Analysis')

# Sidebar navigation: the label picked here selects which page is rendered by
# the if/elif dispatch that follows.
page_names = [
    'Overview',
    'Revenue Forecasting',
    'Predict Booking Cancellations',
    'Market Segmentation',
    'Customer Lifetime Value',
]
st.sidebar.title('Navigation')
options = st.sidebar.radio('Select a page:', page_names)

if options == 'Overview':
    # Static landing page: a header plus a one-line description.
    st.header('Overview')
    st.write('This app provides insights and predictions for hotel bookings.')
elif options == 'Revenue Forecasting':
# Streamlit app title
st.title('Hotel Booking Revenue Forecasting with SARIMA')
# File uploader
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
if uploaded_file is not None:
# Load the dataset
data = pd.read_csv(uploaded_file)
# Display the first few rows of the dataset
st.write("## Dataset Preview")
st.write(data.head())
# Convert arrival_date_year and arrival_date_month to a datetime format
data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
data['arrival_date_month'].astype(str) + '-01')
data['arrival_date'] += MonthEnd(0)
# Calculate monthly revenue
monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()
# Plot monthly revenue
st.write("## Monthly Revenue")
plt.figure(figsize=(12, 6))
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
plt.title('Monthly Revenue')
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Check for stationarity
result = adfuller(monthly_revenue['adr'])
st.write(f'## ADF Statistic: {result[0]}')
st.write(f'## p-value: {result[1]}')
# If the series is not stationary, take the first difference
monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()
# Model parameters
p = st.slider('AR order (p)', 0, 5, 1)
d = st.slider('Differencing order (d)', 0, 2, 1)
q = st.slider('MA order (q)', 0, 5, 1)
P = st.slider('Seasonal AR order (P)', 0, 2, 1)
D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)
# Fit the SARIMA model with user-defined parameters
model = SARIMAX(monthly_revenue['adr'],
order=(p, d, q),
seasonal_order=(P, D, Q, 12))
model_fit = model.fit(disp=False)
# Make predictions
forecast_steps = 12 # Forecast for the next 12 months
forecast = model_fit.get_forecast(steps=forecast_steps)
forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max(),
periods=forecast_steps, freq='M')
forecast_df = pd.DataFrame({'arrival_date': forecast_index,
'forecast': forecast.predicted_mean})
# Plot the results
st.write("## Revenue Forecast")
plt.figure(figsize=(12, 6))
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
sns.lineplot(x='arrival_date', y='forecast', data=forecast_df, label='Forecasted Revenue')
plt.title('Revenue Forecast')
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
st.pyplot(plt)
# Display forecasted values
st.write("## Forecasted Revenue for the Next 12 Months")
st.write(forecast_df.set_index('arrival_date'))
elif options == 'Predict Booking Cancellations':
st.header('Predict Booking Cancellations')
st.write('Provide input data to predict if a booking will be canceled.')
input_data = {}
for col in df.drop(columns=['is_canceled']).columns:
input_data[col] = st.text_input(f'{col}:', value='0')
input_df = pd.DataFrame(input_data, index=[0])
prediction = random_forest_model.predict(input_df)
st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')
elif options == 'Market Segmentation':
st.header('Market Segmentation')
segmentation_features = df[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
scaler = StandardScaler()
segmentation_features_scaled = scaler.fit_transform(segmentation_features)
kmeans = KMeans(n_clusters=4, random_state=42)
df['customer_segment'] = kmeans.fit_predict(segmentation_features_scaled)
plt.figure(figsize=(10, 5))
sns.scatterplot(x=segmentation_features_scaled[:, 0], y=segmentation_features_scaled[:, 1], hue=df['customer_segment'], palette='viridis')
plt.title('Customer Segmentation')
plt.xlabel('Total Guests (Standardized)')
plt.ylabel('Total Special Requests (Standardized)')
st.pyplot(plt)
elif options == 'Customer Lifetime Value':
st.header('Customer Lifetime Value')
clv_df = df.groupby('customer_id')['revenue'].sum().reset_index()
clv_df.columns = ['customer_id', 'lifetime_value']
plt.figure(figsize=(10, 5))
sns.histplot(clv_df['lifetime_value'], kde=True)
plt.title('Customer Lifetime Value Distribution')
plt.xlabel('Lifetime Value')
plt.ylabel('Frequency')
st.pyplot(plt)