mayankraghav committed on
Commit
2c2ffac
1 Parent(s): aea9fdb

Changes to hotel booking prediction

Browse files
Files changed (2) hide show
  1. app.py +128 -35
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,24 +1,35 @@
1
- # app.py
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
  import pickle
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
 
 
8
  from datetime import timedelta
9
  from pandas.tseries.offsets import MonthEnd
10
  from statsmodels.tsa.statespace.sarimax import SARIMAX
11
  from statsmodels.tsa.stattools import adfuller
12
  from sklearn.ensemble import RandomForestClassifier
13
  from sklearn.model_selection import train_test_split
14
- from sklearn.metrics import accuracy_score
15
  from sklearn.cluster import KMeans
16
  from sklearn.preprocessing import StandardScaler
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Streamlit app
19
  st.title('Hotel Booking Analysis')
20
 
21
- # Navigation
22
  st.sidebar.title('Navigation')
23
  options = st.sidebar.radio('Select a page:', ['Overview', 'Revenue Forecasting', 'Predict Booking Cancellations', 'Market Segmentation', 'Customer Lifetime Value'])
24
 
@@ -29,10 +40,21 @@ if options == 'Overview':
29
  elif options == 'Revenue Forecasting':
30
  st.header('Hotel Booking Revenue Forecasting with SARIMA')
31
 
32
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
 
33
 
34
- if uploaded_file is not None:
35
- data = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
36
  st.write("## Dataset Preview")
37
  st.write(data.head())
38
 
@@ -97,53 +119,112 @@ elif options == 'Revenue Forecasting':
97
  elif options == 'Predict Booking Cancellations':
98
  st.header('Predict Booking Cancellations')
99
 
100
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
101
-
102
- if uploaded_file is not None:
103
- data = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
104
  st.write("## Dataset Preview")
105
  st.write(data.head())
106
 
107
  features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
108
  'arrival_date_day_of_month', 'stays_in_weekend_nights',
109
  'stays_in_week_nights', 'adults', 'children', 'babies',
110
- 'is_repeated_guest', 'previous_cancellations',
111
- 'previous_bookings_not_canceled', 'booking_changes',
112
- 'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
113
- 'total_of_special_requests']
114
-
115
  X = data[features]
116
  y = data['is_canceled']
117
 
 
118
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
119
 
120
- model = RandomForestClassifier(n_estimators=100, random_state=42)
121
- model.fit(X_train, y_train)
 
122
 
123
- y_pred = model.predict(X_test)
124
- accuracy = accuracy_score(y_test, y_pred)
125
- st.write(f'Model Accuracy: {accuracy:.2f}')
126
 
127
- st.write("## Predict Booking Cancellation for New Data")
 
 
 
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  input_data = {}
130
- for col in features:
131
- input_data[col] = float(st.text_input(f'{col}:', value='0'))
 
 
 
 
 
 
 
132
 
133
- input_df = pd.DataFrame(input_data, index=[0])
134
- prediction = model.predict(input_df)
135
- st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')
 
 
 
 
 
 
 
 
136
 
137
  elif options == 'Market Segmentation':
138
  st.header('Market Segmentation')
139
 
140
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
141
-
142
- if uploaded_file is not None:
143
- data = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
144
  st.write("## Dataset Preview")
145
  st.write(data.head())
146
 
 
147
  data['total_guests'] = data['adults'] + data['children'] + data['babies']
148
  segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
149
  scaler = StandardScaler()
@@ -165,10 +246,21 @@ elif options == 'Market Segmentation':
165
  elif options == 'Customer Lifetime Value':
166
  st.header('Customer Lifetime Value')
167
 
168
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
169
-
170
- if uploaded_file is not None:
171
- data = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
172
  st.write("## Dataset Preview")
173
  st.write(data.head())
174
 
@@ -183,3 +275,4 @@ elif options == 'Customer Lifetime Value':
183
  plt.xlabel('Lifetime Value')
184
  plt.ylabel('Frequency')
185
  st.pyplot(plt)
 
 
1
+ import os
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
  import pickle
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
8
+ import boto3
9
+ from io import StringIO
10
  from datetime import timedelta
11
  from pandas.tseries.offsets import MonthEnd
12
  from statsmodels.tsa.statespace.sarimax import SARIMAX
13
  from statsmodels.tsa.stattools import adfuller
14
  from sklearn.ensemble import RandomForestClassifier
15
  from sklearn.model_selection import train_test_split
16
+ from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
17
  from sklearn.cluster import KMeans
18
  from sklearn.preprocessing import StandardScaler
19
+ import joblib
20
+
21
+
22
# Copy the credentials stored under the environment variables `getdata` and
# `getdatake` into the standard AWS variable names that boto3 looks up.
# os.getenv returns None for an unset variable, and assigning None into
# os.environ raises TypeError, so a missing secret is skipped here instead of
# crashing the whole app at startup (the CSV-upload path needs no AWS access).
for _aws_var, _secret_name in (
    ('AWS_ACCESS_KEY_ID', 'getdata'),
    ('AWS_SECRET_ACCESS_KEY', 'getdatake'),
):
    _secret_value = os.getenv(_secret_name)
    if _secret_value is not None:
        os.environ[_aws_var] = _secret_value
24
+
25
def load_data_from_s3(bucket_name, file_key):
    """Fetch a CSV object from S3 and parse it into a pandas DataFrame.

    Args:
        bucket_name: Name of the S3 bucket passed to ``get_object``.
        file_key: Key of the object inside that bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the object's contents.
    """
    client = boto3.client('s3')
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    # The whole object body is read into memory and decoded as UTF-8 text
    # before being handed to pandas through an in-memory text buffer.
    csv_text = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_text))
30
 
 
31
  st.title('Hotel Booking Analysis')
32
 
 
33
  st.sidebar.title('Navigation')
34
  options = st.sidebar.radio('Select a page:', ['Overview', 'Revenue Forecasting', 'Predict Booking Cancellations', 'Market Segmentation', 'Customer Lifetime Value'])
35
 
 
40
  elif options == 'Revenue Forecasting':
41
  st.header('Hotel Booking Revenue Forecasting with SARIMA')
42
 
43
+ # Option to choose data source
44
+ data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
45
 
46
+ if data_source == "Upload CSV":
47
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
48
+ if uploaded_file is not None:
49
+ data = pd.read_csv(uploaded_file)
50
+ else:
51
+ bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
52
+ file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
53
+ if st.button("Load Data"):
54
+ data = load_data_from_s3(bucket_name, file_key)
55
+
56
+ if 'data' in locals():
57
+ # Display the first few rows of the dataset
58
  st.write("## Dataset Preview")
59
  st.write(data.head())
60
 
 
119
  elif options == 'Predict Booking Cancellations':
120
  st.header('Predict Booking Cancellations')
121
 
122
+ # Option to choose data source
123
+ data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
124
+
125
+ if data_source == "Upload CSV":
126
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
127
+ if uploaded_file is not None:
128
+ data = pd.read_csv(uploaded_file)
129
+ else:
130
+ bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
131
+ file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
132
+ if st.button("Load Data"):
133
+ data = load_data_from_s3(bucket_name, file_key)
134
+
135
+ if 'data' in locals():
136
+ # Display the first few rows of the dataset
137
  st.write("## Dataset Preview")
138
  st.write(data.head())
139
 
140
  features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
141
  'arrival_date_day_of_month', 'stays_in_weekend_nights',
142
  'stays_in_week_nights', 'adults', 'children', 'babies',
143
+ 'previous_cancellations', 'previous_bookings_not_canceled',
144
+ 'booking_changes', 'days_in_waiting_list', 'adr',
145
+ 'required_car_parking_spaces', 'total_of_special_requests']
146
+
147
+ data = data.dropna(subset=features + ['is_canceled'])
148
  X = data[features]
149
  y = data['is_canceled']
150
 
151
+ # Split the data into training and testing sets
152
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
153
 
154
+ # Train the Random Forest Classifier
155
+ rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
156
+ rf_model.fit(X_train, y_train)
157
 
158
+ # Make predictions
159
+ y_pred = rf_model.predict(X_test)
 
160
 
161
+ # Display model performance metrics
162
+ st.write("## Model Performance Metrics")
163
+ st.write("### Confusion Matrix")
164
+ cm = confusion_matrix(y_test, y_pred)
165
+ st.write(cm)
166
 
167
+ st.write("### Classification Report")
168
+ cr = classification_report(y_test, y_pred, output_dict=True)
169
+ st.write(pd.DataFrame(cr).transpose())
170
+
171
+ st.write("### Accuracy Score")
172
+ acc = accuracy_score(y_test, y_pred)
173
+ st.write(acc)
174
+
175
+ # Save the model to a file
176
+ joblib.dump(rf_model, 'rf_model.pkl')
177
+ st.write("Model saved as rf_model.pkl")
178
+
179
+ st.write("## Predict Booking Cancellation")
180
+ st.write("Enter the details to predict if a booking will be canceled:")
181
+
182
+ # Collect user input for prediction
183
  input_data = {}
184
+ for feature in features:
185
+ input_data[feature] = st.number_input(f"Enter {feature}:", min_value=0.0)
186
+
187
+ if st.button("Predict"):
188
+ input_df = pd.DataFrame([input_data])
189
+
190
+ # Ensure the input data has the correct data types
191
+ for feature in features:
192
+ input_df[feature] = input_df[feature].astype(X[feature].dtype)
193
 
194
+ prediction = rf_model.predict(input_df)
195
+ prediction_proba = rf_model.predict_proba(input_df)
196
+
197
+ st.write(f"Input Data: {input_df}")
198
+ st.write(f"Prediction: {prediction}")
199
+ st.write(f"Prediction Probability: {prediction_proba}")
200
+
201
+ if prediction[0] == 1:
202
+ st.write("Prediction: The booking is likely to be canceled.")
203
+ else:
204
+ st.write("Prediction: The booking is not likely to be canceled.")
205
 
206
  elif options == 'Market Segmentation':
207
  st.header('Market Segmentation')
208
 
209
+ # Option to choose data source
210
+ data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
211
+
212
+ if data_source == "Upload CSV":
213
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
214
+ if uploaded_file is not None:
215
+ data = pd.read_csv(uploaded_file)
216
+ else:
217
+ bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
218
+ file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
219
+ if st.button("Load Data"):
220
+ data = load_data_from_s3(bucket_name, file_key)
221
+
222
+ if 'data' in locals():
223
+ # Display the first few rows of the dataset
224
  st.write("## Dataset Preview")
225
  st.write(data.head())
226
 
227
+
228
  data['total_guests'] = data['adults'] + data['children'] + data['babies']
229
  segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
230
  scaler = StandardScaler()
 
246
  elif options == 'Customer Lifetime Value':
247
  st.header('Customer Lifetime Value')
248
 
249
+ # Option to choose data source
250
+ data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
251
+
252
+ if data_source == "Upload CSV":
253
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
254
+ if uploaded_file is not None:
255
+ data = pd.read_csv(uploaded_file)
256
+ else:
257
+ bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
258
+ file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
259
+ if st.button("Load Data"):
260
+ data = load_data_from_s3(bucket_name, file_key)
261
+
262
+ if 'data' in locals():
263
+ # Display the first few rows of the dataset
264
  st.write("## Dataset Preview")
265
  st.write(data.head())
266
 
 
275
  plt.xlabel('Lifetime Value')
276
  plt.ylabel('Frequency')
277
  st.pyplot(plt)
278
+
requirements.txt CHANGED
@@ -4,4 +4,6 @@ numpy
4
  matplotlib
5
  seaborn
6
  scikit-learn
7
- statsmodels
 
 
 
4
  matplotlib
5
  seaborn
6
  scikit-learn
7
+ statsmodels
8
+ boto3
9
+ joblib