Spaces:
Runtime error
Runtime error
mayankraghav
committed on
Commit
•
b965b34
1
Parent(s):
8f57fb2
Add application file
Browse files
app.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import pickle
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import seaborn as sns
|
8 |
+
from datetime import timedelta
|
9 |
+
from pandas.tseries.offsets import MonthEnd
|
10 |
+
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
11 |
+
from statsmodels.tsa.stattools import adfuller
|
12 |
+
|
13 |
+
# Location of the bundled booking dataset. Later pages in this script read the
# resulting module-level ``df`` (cancellation prediction, market segmentation,
# customer lifetime value).
DATA_PATH = './Dataset/hotel_booking.csv'

# NOTE(review): loading the pickled forecasting model is left disabled, as in
# the original; no visible code reads ``arima_model``. If it is re-enabled,
# only unpickle files from a trusted source -- ``pickle.load`` can execute
# arbitrary code.
# with open('./revenue_forcast.pkl', 'rb') as file:
#     arima_model = pickle.load(file)

# Load data. The load used to be commented out, which left ``df`` undefined and
# made the preprocessing below fail with NameError at startup. Fall back to an
# empty frame when the file is absent so pages that do not need the dataset
# (for example the upload-driven forecasting page) can still render.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    st.sidebar.warning(
        f'Dataset not found at {DATA_PATH}; pages that rely on it are unavailable.'
    )
    df = pd.DataFrame()

# Preprocess data for Streamlit: record which columns are numeric and convert
# every object-typed column to pandas' categorical dtype.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].astype('category')
|
26 |
+
|
27 |
+
# Main heading rendered at the top of the app.
st.title('Hotel Booking Analysis')

# Sidebar navigation: the selected label is stored in ``options`` and drives
# the page dispatch further down the script.
page_names = [
    'Overview',
    'Revenue Forecasting',
    'Predict Booking Cancellations',
    'Market Segmentation',
    'Customer Lifetime Value',
]
st.sidebar.title('Navigation')
options = st.sidebar.radio('Select a page:', page_names)
|
33 |
+
|
34 |
+
def _has_columns(frame, required):
    """Return True when *frame* has every column in *required*.

    When columns are missing, an error listing them is shown with
    ``st.error`` and False is returned so the caller can stop rendering.
    """
    missing = [name for name in required if name not in frame.columns]
    if missing:
        st.error('The dataset is missing required column(s): ' + ', '.join(missing))
        return False
    return True


def _render_overview():
    """Render the static landing page."""
    st.header('Overview')
    st.write('This app provides insights and predictions for hotel bookings.')


def _render_revenue_forecasting():
    """Fit a SARIMA model on an uploaded CSV and plot a 12-month forecast.

    The upload must contain ``arrival_date_year``, ``arrival_date_month``,
    ``is_canceled`` and ``adr``. Revenue is the sum of ``adr`` over
    non-canceled rows, grouped by arrival month.
    """
    st.title('Hotel Booking Revenue Forecasting with SARIMA')

    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    data = pd.read_csv(uploaded_file)

    st.write("## Dataset Preview")
    st.write(data.head())

    if not _has_columns(
        data, ['arrival_date_year', 'arrival_date_month', 'is_canceled', 'adr']
    ):
        return

    # Build a timestamp for the first day of the arrival month, then roll it
    # forward to that month's last day.
    data['arrival_date'] = pd.to_datetime(
        data['arrival_date_year'].astype(str) + '-'
        + data['arrival_date_month'].astype(str) + '-01'
    )
    data['arrival_date'] += MonthEnd(0)

    # Monthly revenue over bookings that were not canceled.
    monthly_revenue = (
        data[data['is_canceled'] == 0]
        .groupby('arrival_date')['adr']
        .sum()
        .reset_index()
    )
    if monthly_revenue.empty:
        st.error('No non-canceled bookings were found in the uploaded file.')
        return

    st.write("## Monthly Revenue")
    # Draw on an explicit figure instead of handing the global pyplot module
    # to Streamlit, and close it afterwards so reruns do not pile up figures.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, ax=ax)
    ax.set_title('Monthly Revenue')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)

    # Stationarity check; very short series make the test raise ValueError.
    try:
        result = adfuller(monthly_revenue['adr'])
    except ValueError as exc:
        st.warning(f'Could not run the ADF stationarity test: {exc}')
    else:
        st.write(f'## ADF Statistic: {result[0]}')
        st.write(f'## p-value: {result[1]}')

    # The previously computed ``adr_diff`` column was never used; differencing
    # is handled by the model through the ``d`` / ``D`` orders chosen below.
    p = st.slider('AR order (p)', 0, 5, 1)
    d = st.slider('Differencing order (d)', 0, 2, 1)
    q = st.slider('MA order (q)', 0, 5, 1)
    P = st.slider('Seasonal AR order (P)', 0, 2, 1)
    D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
    Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)

    # Fit the SARIMA model with the user-selected orders (seasonal period 12).
    try:
        model = SARIMAX(
            monthly_revenue['adr'],
            order=(p, d, q),
            seasonal_order=(P, D, Q, 12),
        )
        model_fit = model.fit(disp=False)
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f'Could not fit the SARIMA model with these settings: {exc}')
        return

    forecast_steps = 12  # Forecast for the next 12 months
    forecast = model_fit.get_forecast(steps=forecast_steps)

    # Start one month AFTER the last observed month; starting at the last
    # observed month itself would date every forecast one month too early.
    last_observed = monthly_revenue['arrival_date'].max()
    forecast_index = pd.date_range(
        start=last_observed + MonthEnd(1),
        periods=forecast_steps,
        freq=MonthEnd(),
    )
    forecast_df = pd.DataFrame({
        'arrival_date': forecast_index,
        'forecast': forecast.predicted_mean.to_numpy(),
    })

    st.write("## Revenue Forecast")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue,
                 label='Historical Revenue', ax=ax)
    sns.lineplot(x='arrival_date', y='forecast', data=forecast_df,
                 label='Forecasted Revenue', ax=ax)
    ax.set_title('Revenue Forecast')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)

    st.write("## Forecasted Revenue for the Next 12 Months")
    st.write(forecast_df.set_index('arrival_date'))


def _render_cancellation_prediction(frame):
    """Collect one booking's fields and predict whether it will be canceled.

    Uses a module-level ``random_forest_model`` if one has been defined.
    Nothing visible in this script defines it, so its absence is reported
    instead of raising NameError.
    """
    st.header('Predict Booking Cancellations')
    st.write('Provide input data to predict if a booking will be canceled.')

    if not _has_columns(frame, ['is_canceled']):
        return

    model = globals().get('random_forest_model')
    if model is None:
        st.error('No cancellation model is loaded (random_forest_model is not defined).')
        return

    input_data = {}
    for col in frame.drop(columns=['is_canceled']).columns:
        input_data[col] = st.text_input(f'{col}:', value='0')

    input_df = pd.DataFrame(input_data, index=[0])

    # text_input always returns strings; cast fields whose dataset column is
    # numeric so the model is not handed text for numeric features.
    numeric_fields = [
        col for col in input_df.columns
        if pd.api.types.is_numeric_dtype(frame[col])
    ]
    for col in numeric_fields:
        input_df[col] = pd.to_numeric(input_df[col], errors='coerce')
    if numeric_fields and input_df[numeric_fields].isna().any().any():
        st.error('Numeric fields must contain valid numbers.')
        return

    prediction = model.predict(input_df)
    st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')


def _render_market_segmentation(frame):
    """Cluster bookings into four segments and plot the first two features.

    Features are standardized to zero mean and unit variance, then clustered
    with k-means. NumPy and SciPy are used because scikit-learn's
    ``StandardScaler`` / ``KMeans`` were referenced but never imported. The
    labels are written to ``frame['customer_segment']``; rows with missing
    feature values are left unassigned.
    """
    # Function-scope import keeps the new dependency local to this page.
    from scipy.cluster.vq import kmeans2

    st.header('Market Segmentation')

    feature_names = ['total_guests', 'total_of_special_requests',
                     'lead_time', 'is_repeated_guest']
    n_clusters = 4
    if not _has_columns(frame, feature_names):
        return

    segmentation_features = frame[feature_names].astype(float).dropna()
    if len(segmentation_features) < n_clusters:
        st.error('Not enough complete rows to build customer segments.')
        return

    values = segmentation_features.to_numpy()
    spread = values.std(axis=0)
    spread[spread == 0] = 1.0  # constant columns: avoid division by zero
    segmentation_features_scaled = (values - values.mean(axis=0)) / spread

    _, labels = kmeans2(segmentation_features_scaled, n_clusters,
                        minit='++', seed=42)
    # Align on the index so rows dropped for missing values stay NaN.
    frame['customer_segment'] = pd.Series(labels, index=segmentation_features.index)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(
        x=segmentation_features_scaled[:, 0],
        y=segmentation_features_scaled[:, 1],
        hue=pd.Series(labels, name='customer_segment'),
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Customer Segmentation')
    ax.set_xlabel('Total Guests (Standardized)')
    ax.set_ylabel('Total Special Requests (Standardized)')
    st.pyplot(fig)
    plt.close(fig)


def _render_customer_lifetime_value(frame):
    """Plot the distribution of summed ``revenue`` per ``customer_id``."""
    st.header('Customer Lifetime Value')

    if not _has_columns(frame, ['customer_id', 'revenue']):
        return

    clv_df = frame.groupby('customer_id')['revenue'].sum().reset_index()
    clv_df.columns = ['customer_id', 'lifetime_value']

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(clv_df['lifetime_value'], kde=True, ax=ax)
    ax.set_title('Customer Lifetime Value Distribution')
    ax.set_xlabel('Lifetime Value')
    ax.set_ylabel('Frequency')
    st.pyplot(fig)
    plt.close(fig)


# Dispatch on the page chosen in the sidebar. ``df`` is only evaluated in the
# branches that need the bundled dataset.
if options == 'Overview':
    _render_overview()
elif options == 'Revenue Forecasting':
    _render_revenue_forecasting()
elif options == 'Predict Booking Cancellations':
    _render_cancellation_prediction(df)
elif options == 'Market Segmentation':
    _render_market_segmentation(df)
elif options == 'Customer Lifetime Value':
    _render_customer_lifetime_value(df)
|