mayankraghav committed on
Commit
aea9fdb
1 Parent(s): 9bd2d1f

modified app file

Browse files
Files changed (1) hide show
  1. app.py +31 -28
app.py CHANGED
@@ -9,6 +9,9 @@ from datetime import timedelta
9
  from pandas.tseries.offsets import MonthEnd
10
  from statsmodels.tsa.statespace.sarimax import SARIMAX
11
  from statsmodels.tsa.stattools import adfuller
 
 
 
12
  from sklearn.cluster import KMeans
13
  from sklearn.preprocessing import StandardScaler
14
 
@@ -24,29 +27,21 @@ if options == 'Overview':
24
  st.write('This app provides insights and predictions for hotel bookings.')
25
 
26
  elif options == 'Revenue Forecasting':
27
- # Streamlit app title
28
- st.title('Hotel Booking Revenue Forecasting with SARIMA')
29
 
30
- # File uploader
31
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
32
 
33
  if uploaded_file is not None:
34
- # Load the dataset
35
  data = pd.read_csv(uploaded_file)
36
-
37
- # Display the first few rows of the dataset
38
  st.write("## Dataset Preview")
39
  st.write(data.head())
40
 
41
- # Convert arrival_date_year and arrival_date_month to a datetime format
42
  data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
43
  data['arrival_date_month'].astype(str) + '-01')
44
  data['arrival_date'] += MonthEnd(0)
45
 
46
- # Calculate monthly revenue
47
  monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()
48
 
49
- # Plot monthly revenue
50
  st.write("## Monthly Revenue")
51
  plt.figure(figsize=(12, 6))
52
  sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
@@ -57,16 +52,13 @@ elif options == 'Revenue Forecasting':
57
  plt.tight_layout()
58
  st.pyplot(plt)
59
 
60
- # Check for stationarity
61
  result = adfuller(monthly_revenue['adr'])
62
  st.write(f'## ADF Statistic: {result[0]}')
63
  st.write(f'## p-value: {result[1]}')
64
 
65
- # If the series is not stationary, take the first difference
66
  if result[1] > 0.05:
67
  monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()
68
 
69
- # Model parameters
70
  p = st.slider('AR order (p)', 0, 5, 1)
71
  d = st.slider('Differencing order (d)', 0, 2, 1)
72
  q = st.slider('MA order (q)', 0, 5, 1)
@@ -74,14 +66,12 @@ elif options == 'Revenue Forecasting':
74
  D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
75
  Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)
76
 
77
- # Fit the SARIMA model with user-defined parameters
78
  model = SARIMAX(monthly_revenue['adr'],
79
  order=(p, d, q),
80
  seasonal_order=(P, D, Q, 12))
81
  model_fit = model.fit(disp=False)
82
 
83
- # Make predictions
84
- forecast_steps = 12 # Forecast for the next 12 months
85
  forecast = model_fit.get_forecast(steps=forecast_steps)
86
  forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1),
87
  periods=forecast_steps, freq='M')
@@ -89,7 +79,6 @@ elif options == 'Revenue Forecasting':
89
  forecast_df = pd.DataFrame({'arrival_date': forecast_index,
90
  'forecast': forecast.predicted_mean})
91
 
92
- # Plot the results
93
  st.write("## Revenue Forecast")
94
  plt.figure(figsize=(12, 6))
95
  sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
@@ -102,7 +91,6 @@ elif options == 'Revenue Forecasting':
102
  plt.tight_layout()
103
  st.pyplot(plt)
104
 
105
- # Display forecasted values
106
  st.write("## Forecasted Revenue for the Next 12 Months")
107
  st.write(forecast_df.set_index('arrival_date'))
108
 
@@ -112,20 +100,35 @@ elif options == 'Predict Booking Cancellations':
112
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
113
 
114
  if uploaded_file is not None:
115
- # Load the dataset
116
  data = pd.read_csv(uploaded_file)
117
  st.write("## Dataset Preview")
118
  st.write(data.head())
119
 
120
- # Load the trained model
121
- with open('random_forest_model.pkl', 'rb') as file:
122
- model = pickle.load(file)
123
-
124
- st.write("## Provide input data to predict if a booking will be canceled.")
 
 
 
 
 
 
 
125
 
 
 
 
 
 
 
 
 
 
126
  input_data = {}
127
- for col in data.columns:
128
- input_data[col] = st.text_input(f'{col}:', value='0')
129
 
130
  input_df = pd.DataFrame(input_data, index=[0])
131
  prediction = model.predict(input_df)
@@ -137,11 +140,11 @@ elif options == 'Market Segmentation':
137
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
138
 
139
  if uploaded_file is not None:
140
- # Load the dataset
141
  data = pd.read_csv(uploaded_file)
142
  st.write("## Dataset Preview")
143
  st.write(data.head())
144
 
 
145
  segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
146
  scaler = StandardScaler()
147
  segmentation_features_scaled = scaler.fit_transform(segmentation_features)
@@ -165,12 +168,12 @@ elif options == 'Customer Lifetime Value':
165
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
166
 
167
  if uploaded_file is not None:
168
- # Load the dataset
169
  data = pd.read_csv(uploaded_file)
170
  st.write("## Dataset Preview")
171
  st.write(data.head())
172
 
173
- clv_df = data.groupby('customer_id')['revenue'].sum().reset_index()
 
174
  clv_df.columns = ['customer_id', 'lifetime_value']
175
 
176
  st.write("## Customer Lifetime Value Distribution")
 
9
  from pandas.tseries.offsets import MonthEnd
10
  from statsmodels.tsa.statespace.sarimax import SARIMAX
11
  from statsmodels.tsa.stattools import adfuller
12
+ from sklearn.ensemble import RandomForestClassifier
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.metrics import accuracy_score
15
  from sklearn.cluster import KMeans
16
  from sklearn.preprocessing import StandardScaler
17
 
 
27
  st.write('This app provides insights and predictions for hotel bookings.')
28
 
29
  elif options == 'Revenue Forecasting':
30
+ st.header('Hotel Booking Revenue Forecasting with SARIMA')
 
31
 
 
32
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
33
 
34
  if uploaded_file is not None:
 
35
  data = pd.read_csv(uploaded_file)
 
 
36
  st.write("## Dataset Preview")
37
  st.write(data.head())
38
 
 
39
  data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
40
  data['arrival_date_month'].astype(str) + '-01')
41
  data['arrival_date'] += MonthEnd(0)
42
 
 
43
  monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()
44
 
 
45
  st.write("## Monthly Revenue")
46
  plt.figure(figsize=(12, 6))
47
  sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
 
52
  plt.tight_layout()
53
  st.pyplot(plt)
54
 
 
55
  result = adfuller(monthly_revenue['adr'])
56
  st.write(f'## ADF Statistic: {result[0]}')
57
  st.write(f'## p-value: {result[1]}')
58
 
 
59
  if result[1] > 0.05:
60
  monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()
61
 
 
62
  p = st.slider('AR order (p)', 0, 5, 1)
63
  d = st.slider('Differencing order (d)', 0, 2, 1)
64
  q = st.slider('MA order (q)', 0, 5, 1)
 
66
  D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
67
  Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)
68
 
 
69
  model = SARIMAX(monthly_revenue['adr'],
70
  order=(p, d, q),
71
  seasonal_order=(P, D, Q, 12))
72
  model_fit = model.fit(disp=False)
73
 
74
+ forecast_steps = 12
 
75
  forecast = model_fit.get_forecast(steps=forecast_steps)
76
  forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1),
77
  periods=forecast_steps, freq='M')
 
79
  forecast_df = pd.DataFrame({'arrival_date': forecast_index,
80
  'forecast': forecast.predicted_mean})
81
 
 
82
  st.write("## Revenue Forecast")
83
  plt.figure(figsize=(12, 6))
84
  sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
 
91
  plt.tight_layout()
92
  st.pyplot(plt)
93
 
 
94
  st.write("## Forecasted Revenue for the Next 12 Months")
95
  st.write(forecast_df.set_index('arrival_date'))
96
 
 
100
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
101
 
102
  if uploaded_file is not None:
 
103
  data = pd.read_csv(uploaded_file)
104
  st.write("## Dataset Preview")
105
  st.write(data.head())
106
 
107
+ features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
108
+ 'arrival_date_day_of_month', 'stays_in_weekend_nights',
109
+ 'stays_in_week_nights', 'adults', 'children', 'babies',
110
+ 'is_repeated_guest', 'previous_cancellations',
111
+ 'previous_bookings_not_canceled', 'booking_changes',
112
+ 'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
113
+ 'total_of_special_requests']
114
+
115
+ X = data[features]
116
+ y = data['is_canceled']
117
+
118
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
119
 
120
+ model = RandomForestClassifier(n_estimators=100, random_state=42)
121
+ model.fit(X_train, y_train)
122
+
123
+ y_pred = model.predict(X_test)
124
+ accuracy = accuracy_score(y_test, y_pred)
125
+ st.write(f'Model Accuracy: {accuracy:.2f}')
126
+
127
+ st.write("## Predict Booking Cancellation for New Data")
128
+
129
  input_data = {}
130
+ for col in features:
131
+ input_data[col] = float(st.text_input(f'{col}:', value='0'))
132
 
133
  input_df = pd.DataFrame(input_data, index=[0])
134
  prediction = model.predict(input_df)
 
140
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
141
 
142
  if uploaded_file is not None:
 
143
  data = pd.read_csv(uploaded_file)
144
  st.write("## Dataset Preview")
145
  st.write(data.head())
146
 
147
+ data['total_guests'] = data['adults'] + data['children'] + data['babies']
148
  segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
149
  scaler = StandardScaler()
150
  segmentation_features_scaled = scaler.fit_transform(segmentation_features)
 
168
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
169
 
170
  if uploaded_file is not None:
 
171
  data = pd.read_csv(uploaded_file)
172
  st.write("## Dataset Preview")
173
  st.write(data.head())
174
 
175
+ data['customer_id'] = data['lead_time'].astype(str) + '_' + data['arrival_date_year'].astype(str) + '_' + data['arrival_date_month'].astype(str) + '_' + data['arrival_date_day_of_month'].astype(str)
176
+ clv_df = data.groupby('customer_id')['adr'].sum().reset_index()
177
  clv_df.columns = ['customer_id', 'lifetime_value']
178
 
179
  st.write("## Customer Lifetime Value Distribution")