Spaces:
Runtime error
Runtime error
mayankraghav
committed on
Commit
•
aea9fdb
1
Parent(s):
9bd2d1f
modified app file
Browse files
app.py
CHANGED
@@ -9,6 +9,9 @@ from datetime import timedelta
|
|
9 |
from pandas.tseries.offsets import MonthEnd
|
10 |
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
11 |
from statsmodels.tsa.stattools import adfuller
|
|
|
|
|
|
|
12 |
from sklearn.cluster import KMeans
|
13 |
from sklearn.preprocessing import StandardScaler
|
14 |
|
@@ -24,29 +27,21 @@ if options == 'Overview':
|
|
24 |
st.write('This app provides insights and predictions for hotel bookings.')
|
25 |
|
26 |
elif options == 'Revenue Forecasting':
|
27 |
-
|
28 |
-
st.title('Hotel Booking Revenue Forecasting with SARIMA')
|
29 |
|
30 |
-
# File uploader
|
31 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
32 |
|
33 |
if uploaded_file is not None:
|
34 |
-
# Load the dataset
|
35 |
data = pd.read_csv(uploaded_file)
|
36 |
-
|
37 |
-
# Display the first few rows of the dataset
|
38 |
st.write("## Dataset Preview")
|
39 |
st.write(data.head())
|
40 |
|
41 |
-
# Convert arrival_date_year and arrival_date_month to a datetime format
|
42 |
data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
|
43 |
data['arrival_date_month'].astype(str) + '-01')
|
44 |
data['arrival_date'] += MonthEnd(0)
|
45 |
|
46 |
-
# Calculate monthly revenue
|
47 |
monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()
|
48 |
|
49 |
-
# Plot monthly revenue
|
50 |
st.write("## Monthly Revenue")
|
51 |
plt.figure(figsize=(12, 6))
|
52 |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
|
@@ -57,16 +52,13 @@ elif options == 'Revenue Forecasting':
|
|
57 |
plt.tight_layout()
|
58 |
st.pyplot(plt)
|
59 |
|
60 |
-
# Check for stationarity
|
61 |
result = adfuller(monthly_revenue['adr'])
|
62 |
st.write(f'## ADF Statistic: {result[0]}')
|
63 |
st.write(f'## p-value: {result[1]}')
|
64 |
|
65 |
-
# If the series is not stationary, take the first difference
|
66 |
if result[1] > 0.05:
|
67 |
monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()
|
68 |
|
69 |
-
# Model parameters
|
70 |
p = st.slider('AR order (p)', 0, 5, 1)
|
71 |
d = st.slider('Differencing order (d)', 0, 2, 1)
|
72 |
q = st.slider('MA order (q)', 0, 5, 1)
|
@@ -74,14 +66,12 @@ elif options == 'Revenue Forecasting':
|
|
74 |
D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
|
75 |
Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)
|
76 |
|
77 |
-
# Fit the SARIMA model with user-defined parameters
|
78 |
model = SARIMAX(monthly_revenue['adr'],
|
79 |
order=(p, d, q),
|
80 |
seasonal_order=(P, D, Q, 12))
|
81 |
model_fit = model.fit(disp=False)
|
82 |
|
83 |
-
|
84 |
-
forecast_steps = 12 # Forecast for the next 12 months
|
85 |
forecast = model_fit.get_forecast(steps=forecast_steps)
|
86 |
forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1),
|
87 |
periods=forecast_steps, freq='M')
|
@@ -89,7 +79,6 @@ elif options == 'Revenue Forecasting':
|
|
89 |
forecast_df = pd.DataFrame({'arrival_date': forecast_index,
|
90 |
'forecast': forecast.predicted_mean})
|
91 |
|
92 |
-
# Plot the results
|
93 |
st.write("## Revenue Forecast")
|
94 |
plt.figure(figsize=(12, 6))
|
95 |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
|
@@ -102,7 +91,6 @@ elif options == 'Revenue Forecasting':
|
|
102 |
plt.tight_layout()
|
103 |
st.pyplot(plt)
|
104 |
|
105 |
-
# Display forecasted values
|
106 |
st.write("## Forecasted Revenue for the Next 12 Months")
|
107 |
st.write(forecast_df.set_index('arrival_date'))
|
108 |
|
@@ -112,20 +100,35 @@ elif options == 'Predict Booking Cancellations':
|
|
112 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
113 |
|
114 |
if uploaded_file is not None:
|
115 |
-
# Load the dataset
|
116 |
data = pd.read_csv(uploaded_file)
|
117 |
st.write("## Dataset Preview")
|
118 |
st.write(data.head())
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
input_data = {}
|
127 |
-
for col in
|
128 |
-
input_data[col] = st.text_input(f'{col}:', value='0')
|
129 |
|
130 |
input_df = pd.DataFrame(input_data, index=[0])
|
131 |
prediction = model.predict(input_df)
|
@@ -137,11 +140,11 @@ elif options == 'Market Segmentation':
|
|
137 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
138 |
|
139 |
if uploaded_file is not None:
|
140 |
-
# Load the dataset
|
141 |
data = pd.read_csv(uploaded_file)
|
142 |
st.write("## Dataset Preview")
|
143 |
st.write(data.head())
|
144 |
|
|
|
145 |
segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
|
146 |
scaler = StandardScaler()
|
147 |
segmentation_features_scaled = scaler.fit_transform(segmentation_features)
|
@@ -165,12 +168,12 @@ elif options == 'Customer Lifetime Value':
|
|
165 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
166 |
|
167 |
if uploaded_file is not None:
|
168 |
-
# Load the dataset
|
169 |
data = pd.read_csv(uploaded_file)
|
170 |
st.write("## Dataset Preview")
|
171 |
st.write(data.head())
|
172 |
|
173 |
-
|
|
|
174 |
clv_df.columns = ['customer_id', 'lifetime_value']
|
175 |
|
176 |
st.write("## Customer Lifetime Value Distribution")
|
|
|
9 |
from pandas.tseries.offsets import MonthEnd
|
10 |
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
11 |
from statsmodels.tsa.stattools import adfuller
|
12 |
+
from sklearn.ensemble import RandomForestClassifier
|
13 |
+
from sklearn.model_selection import train_test_split
|
14 |
+
from sklearn.metrics import accuracy_score
|
15 |
from sklearn.cluster import KMeans
|
16 |
from sklearn.preprocessing import StandardScaler
|
17 |
|
|
|
27 |
st.write('This app provides insights and predictions for hotel bookings.')
|
28 |
|
29 |
elif options == 'Revenue Forecasting':
|
30 |
+
st.header('Hotel Booking Revenue Forecasting with SARIMA')
|
|
|
31 |
|
|
|
32 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
33 |
|
34 |
if uploaded_file is not None:
|
|
|
35 |
data = pd.read_csv(uploaded_file)
|
|
|
|
|
36 |
st.write("## Dataset Preview")
|
37 |
st.write(data.head())
|
38 |
|
|
|
39 |
data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
|
40 |
data['arrival_date_month'].astype(str) + '-01')
|
41 |
data['arrival_date'] += MonthEnd(0)
|
42 |
|
|
|
43 |
monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()
|
44 |
|
|
|
45 |
st.write("## Monthly Revenue")
|
46 |
plt.figure(figsize=(12, 6))
|
47 |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
|
|
|
52 |
plt.tight_layout()
|
53 |
st.pyplot(plt)
|
54 |
|
|
|
55 |
result = adfuller(monthly_revenue['adr'])
|
56 |
st.write(f'## ADF Statistic: {result[0]}')
|
57 |
st.write(f'## p-value: {result[1]}')
|
58 |
|
|
|
59 |
if result[1] > 0.05:
|
60 |
monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()
|
61 |
|
|
|
62 |
p = st.slider('AR order (p)', 0, 5, 1)
|
63 |
d = st.slider('Differencing order (d)', 0, 2, 1)
|
64 |
q = st.slider('MA order (q)', 0, 5, 1)
|
|
|
66 |
D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
|
67 |
Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)
|
68 |
|
|
|
69 |
model = SARIMAX(monthly_revenue['adr'],
|
70 |
order=(p, d, q),
|
71 |
seasonal_order=(P, D, Q, 12))
|
72 |
model_fit = model.fit(disp=False)
|
73 |
|
74 |
+
forecast_steps = 12
|
|
|
75 |
forecast = model_fit.get_forecast(steps=forecast_steps)
|
76 |
forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1),
|
77 |
periods=forecast_steps, freq='M')
|
|
|
79 |
forecast_df = pd.DataFrame({'arrival_date': forecast_index,
|
80 |
'forecast': forecast.predicted_mean})
|
81 |
|
|
|
82 |
st.write("## Revenue Forecast")
|
83 |
plt.figure(figsize=(12, 6))
|
84 |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
|
|
|
91 |
plt.tight_layout()
|
92 |
st.pyplot(plt)
|
93 |
|
|
|
94 |
st.write("## Forecasted Revenue for the Next 12 Months")
|
95 |
st.write(forecast_df.set_index('arrival_date'))
|
96 |
|
|
|
100 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
101 |
|
102 |
if uploaded_file is not None:
|
|
|
103 |
data = pd.read_csv(uploaded_file)
|
104 |
st.write("## Dataset Preview")
|
105 |
st.write(data.head())
|
106 |
|
107 |
+
features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
|
108 |
+
'arrival_date_day_of_month', 'stays_in_weekend_nights',
|
109 |
+
'stays_in_week_nights', 'adults', 'children', 'babies',
|
110 |
+
'is_repeated_guest', 'previous_cancellations',
|
111 |
+
'previous_bookings_not_canceled', 'booking_changes',
|
112 |
+
'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
|
113 |
+
'total_of_special_requests']
|
114 |
+
|
115 |
+
X = data[features]
|
116 |
+
y = data['is_canceled']
|
117 |
+
|
118 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
119 |
|
120 |
+
model = RandomForestClassifier(n_estimators=100, random_state=42)
|
121 |
+
model.fit(X_train, y_train)
|
122 |
+
|
123 |
+
y_pred = model.predict(X_test)
|
124 |
+
accuracy = accuracy_score(y_test, y_pred)
|
125 |
+
st.write(f'Model Accuracy: {accuracy:.2f}')
|
126 |
+
|
127 |
+
st.write("## Predict Booking Cancellation for New Data")
|
128 |
+
|
129 |
input_data = {}
|
130 |
+
for col in features:
|
131 |
+
input_data[col] = float(st.text_input(f'{col}:', value='0'))
|
132 |
|
133 |
input_df = pd.DataFrame(input_data, index=[0])
|
134 |
prediction = model.predict(input_df)
|
|
|
140 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
141 |
|
142 |
if uploaded_file is not None:
|
|
|
143 |
data = pd.read_csv(uploaded_file)
|
144 |
st.write("## Dataset Preview")
|
145 |
st.write(data.head())
|
146 |
|
147 |
+
data['total_guests'] = data['adults'] + data['children'] + data['babies']
|
148 |
segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
|
149 |
scaler = StandardScaler()
|
150 |
segmentation_features_scaled = scaler.fit_transform(segmentation_features)
|
|
|
168 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
169 |
|
170 |
if uploaded_file is not None:
|
|
|
171 |
data = pd.read_csv(uploaded_file)
|
172 |
st.write("## Dataset Preview")
|
173 |
st.write(data.head())
|
174 |
|
175 |
+
data['customer_id'] = data['lead_time'].astype(str) + '_' + data['arrival_date_year'].astype(str) + '_' + data['arrival_date_month'].astype(str) + '_' + data['arrival_date_day_of_month'].astype(str)
|
176 |
+
clv_df = data.groupby('customer_id')['adr'].sum().reset_index()
|
177 |
clv_df.columns = ['customer_id', 'lifetime_value']
|
178 |
|
179 |
st.write("## Customer Lifetime Value Distribution")
|