# app.py
"""Streamlit dashboard for hotel booking analysis.

Pages: an overview, SARIMA revenue forecasting on an uploaded CSV, booking
cancellation prediction, K-means market segmentation, and a customer
lifetime value histogram.
"""
import pickle
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from pandas.tseries.offsets import MonthEnd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

# Bookings file backing the cancellation, segmentation and CLV pages.
DATA_PATH = './Dataset/hotel_booking.csv'

PAGES = [
    'Overview',
    'Revenue Forecasting',
    'Predict Booking Cancellations',
    'Market Segmentation',
    'Customer Lifetime Value',
]

FORECAST_STEPS = 12   # months forecast past the last observed month
SEASONAL_PERIOD = 12  # monthly data with a yearly season

REVENUE_COLUMNS = ['arrival_date_year', 'arrival_date_month', 'is_canceled', 'adr']
SEGMENTATION_COLUMNS = [
    'total_guests',
    'total_of_special_requests',
    'lead_time',
    'is_repeated_guest',
]
CLV_COLUMNS = ['customer_id', 'revenue']

# TODO: nothing in this file loads a trained cancellation classifier. Assign
# one here (any object exposing ``predict``) to enable the prediction page;
# while it is None the page explains the situation instead of raising
# NameError. Only unpickle model files from a trusted source.
random_forest_model = None

# NOTE: the pickled forecaster './revenue_forcast.pkl' is intentionally not
# loaded; the forecasting page fits a fresh SARIMA model on the uploaded data.


def load_bookings(path=DATA_PATH):
    """Read the bookings CSV and cast its object columns to ``category``.

    Args:
        path: Location of the bookings CSV.

    Returns:
        The bookings DataFrame, or ``None`` when the file does not exist.
    """
    try:
        bookings = pd.read_csv(path)
    except FileNotFoundError:
        return None
    for col in bookings.select_dtypes(include=['object']).columns:
        bookings[col] = bookings[col].astype('category')
    return bookings


def build_monthly_revenue(data):
    """Aggregate ``adr`` of non-cancelled bookings per arrival month.

    Adds an ``arrival_date`` column (the month-end of each booking's arrival
    month) to ``data`` and returns a frame with ``arrival_date`` and the
    summed ``adr``, ordered by date.

    NOTE(review): "revenue" here is the plain sum of ``adr``; it is not
    multiplied by the length of stay -- confirm that this is intended.
    """
    data['arrival_date'] = pd.to_datetime(
        data['arrival_date_year'].astype(str)
        + '-'
        + data['arrival_date_month'].astype(str)
        + '-01'
    )
    # MonthEnd(0) rolls each first-of-month date forward to its month end.
    data['arrival_date'] += MonthEnd(0)
    kept = data[data['is_canceled'] == 0]
    return kept.groupby('arrival_date')['adr'].sum().reset_index()


def forecast_dates(last_observed, steps):
    """Return ``steps`` month-end dates strictly after ``last_observed``.

    The range starts one month past the last observation so that the first
    forecast is not labelled with a month that already has historical data.
    """
    return pd.date_range(
        start=last_observed + MonthEnd(1),
        periods=steps,
        freq=MonthEnd(),  # offset object instead of the deprecated 'M' alias
    )


def _has_columns(frame, required):
    """Report missing ``required`` columns via ``st.error``; True if none."""
    missing = [col for col in required if col not in frame.columns]
    if missing:
        st.error(f"Missing required column(s): {', '.join(missing)}")
        return False
    return True


def _bookings_or_error():
    """Load the bookings data, showing an error when it is unavailable."""
    bookings = load_bookings()
    if bookings is None:
        st.error(f'Bookings data not found at {DATA_PATH}.')
    return bookings


def _show_figure(fig):
    """Render ``fig`` in the app and close it.

    Streamlit re-runs the script on every interaction, so figures left open
    would accumulate in pyplot's global registry.
    """
    st.pyplot(fig)
    plt.close(fig)


def render_overview():
    """Render the landing page."""
    st.header('Overview')
    st.write('This app provides insights and predictions for hotel bookings.')


def render_revenue_forecasting():
    """Fit a SARIMA model on an uploaded bookings CSV and plot a forecast."""
    st.title('Hotel Booking Revenue Forecasting with SARIMA')

    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    data = pd.read_csv(uploaded_file)
    st.write("## Dataset Preview")
    st.write(data.head())

    if not _has_columns(data, REVENUE_COLUMNS):
        return

    monthly_revenue = build_monthly_revenue(data)
    if monthly_revenue.empty:
        st.warning('No non-cancelled bookings found; nothing to forecast.')
        return

    st.write("## Monthly Revenue")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, ax=ax)
    ax.set_title('Monthly Revenue')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    _show_figure(fig)

    # Stationarity check (informational only; differencing is chosen below).
    try:
        result = adfuller(monthly_revenue['adr'])
    except ValueError as exc:
        st.warning(f'ADF test could not be computed: {exc}')
    else:
        st.write(f'## ADF Statistic: {result[0]}')
        st.write(f'## p-value: {result[1]}')

    # Model parameters
    p = st.slider('AR order (p)', 0, 5, 1)
    d = st.slider('Differencing order (d)', 0, 2, 1)
    q = st.slider('MA order (q)', 0, 5, 1)
    P = st.slider('Seasonal AR order (P)', 0, 2, 1)
    D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
    Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)

    # Some order combinations cannot be estimated on a short series; report
    # the failure instead of crashing the page.
    try:
        model = SARIMAX(
            monthly_revenue['adr'],
            order=(p, d, q),
            seasonal_order=(P, D, Q, SEASONAL_PERIOD),
        )
        model_fit = model.fit(disp=False)
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f'Could not fit the SARIMA model with these parameters: {exc}')
        return

    forecast = model_fit.get_forecast(steps=FORECAST_STEPS)
    forecast_df = pd.DataFrame({
        'arrival_date': forecast_dates(
            monthly_revenue['arrival_date'].max(), FORECAST_STEPS
        ),
        # Plain array: avoid aligning on the forecast's integer index.
        'forecast': np.asarray(forecast.predicted_mean),
    })

    st.write("## Revenue Forecast")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue,
                 label='Historical Revenue', ax=ax)
    sns.lineplot(x='arrival_date', y='forecast', data=forecast_df,
                 label='Forecasted Revenue', ax=ax)
    ax.set_title('Revenue Forecast')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    fig.tight_layout()
    _show_figure(fig)

    st.write("## Forecasted Revenue for the Next 12 Months")
    st.write(forecast_df.set_index('arrival_date'))


def render_cancellation_prediction():
    """Collect one booking's fields and show the model's cancellation call."""
    st.header('Predict Booking Cancellations')
    st.write('Provide input data to predict if a booking will be canceled.')

    if random_forest_model is None:
        st.warning('No cancellation model is loaded; prediction is unavailable.')
        return
    df = _bookings_or_error()
    if df is None:
        return

    # One text box per feature column (every column except the target).
    input_data = {}
    for col in df.columns:
        if col == 'is_canceled':
            continue
        input_data[col] = st.text_input(f'{col}:', value='0')

    # NOTE(review): text inputs are strings and are passed to the model
    # unconverted -- confirm the model accepts that.
    input_df = pd.DataFrame(input_data, index=[0])
    prediction = random_forest_model.predict(input_df)
    st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')


def render_market_segmentation():
    """Cluster bookings into four segments and plot the first two features."""
    # Imported here so a missing scikit-learn only affects this page.
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    st.header('Market Segmentation')
    df = _bookings_or_error()
    if df is None or not _has_columns(df, SEGMENTATION_COLUMNS):
        return

    segmentation_features = df[SEGMENTATION_COLUMNS]
    segmentation_features_scaled = StandardScaler().fit_transform(
        segmentation_features
    )
    kmeans = KMeans(n_clusters=4, random_state=42)
    df['customer_segment'] = kmeans.fit_predict(segmentation_features_scaled)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(
        x=segmentation_features_scaled[:, 0],
        y=segmentation_features_scaled[:, 1],
        hue=df['customer_segment'],
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Customer Segmentation')
    ax.set_xlabel('Total Guests (Standardized)')
    ax.set_ylabel('Total Special Requests (Standardized)')
    _show_figure(fig)


def render_customer_lifetime_value():
    """Plot the distribution of total revenue per customer."""
    st.header('Customer Lifetime Value')
    df = _bookings_or_error()
    if df is None or not _has_columns(df, CLV_COLUMNS):
        return

    clv_df = df.groupby('customer_id')['revenue'].sum().reset_index()
    clv_df.columns = ['customer_id', 'lifetime_value']

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(clv_df['lifetime_value'], kde=True, ax=ax)
    ax.set_title('Customer Lifetime Value Distribution')
    ax.set_xlabel('Lifetime Value')
    ax.set_ylabel('Frequency')
    _show_figure(fig)


def main():
    """Draw the title and sidebar navigation, then render the chosen page."""
    st.title('Hotel Booking Analysis')

    st.sidebar.title('Navigation')
    options = st.sidebar.radio('Select a page:', PAGES)

    if options == 'Overview':
        render_overview()
    elif options == 'Revenue Forecasting':
        render_revenue_forecasting()
    elif options == 'Predict Booking Cancellations':
        render_cancellation_prediction()
    elif options == 'Market Segmentation':
        render_market_segmentation()
    elif options == 'Customer Lifetime Value':
        render_customer_lifetime_value()


# `streamlit run` executes the script with __name__ == "__main__".
if __name__ == "__main__":
    main()