"""Streamlit app for hotel-booking analysis.

Pages: revenue forecasting (SARIMA), booking-cancellation prediction
(random forest), market segmentation (k-means), and a simple customer
lifetime value view. Data comes either from an uploaded CSV or from S3.
"""

import os
import pickle
from datetime import timedelta
from io import StringIO

import boto3
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from pandas.tseries.offsets import MonthEnd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

# AWS credentials are sourced from the host environment under the
# project-specific names "getdata"/"getdatake". Only forward them when
# actually set: `os.environ[...] = None` raises TypeError at import time.
_aws_key = os.getenv("getdata")
_aws_secret = os.getenv("getdatake")
if _aws_key is not None:
    os.environ['AWS_ACCESS_KEY_ID'] = _aws_key
if _aws_secret is not None:
    os.environ['AWS_SECRET_ACCESS_KEY'] = _aws_secret


def load_data_from_s3(bucket_name, file_key):
    """Download a CSV object from S3 and return it as a DataFrame.

    Assumes the object is UTF-8 encoded CSV and that AWS credentials are
    available in the environment.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    return pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))


def _choose_data_source():
    """Render the shared upload-vs-S3 widgets.

    Returns the loaded DataFrame, or None while the user has not yet
    provided/loaded any data. Replaces the copy-pasted
    ``if 'data' in locals()`` pattern with an explicit sentinel.
    """
    data = None
    data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
    if data_source == "Upload CSV":
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        if uploaded_file is not None:
            data = pd.read_csv(uploaded_file)
    else:
        bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
        file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
        if st.button("Load Data"):
            data = load_data_from_s3(bucket_name, file_key)
    return data


def _show_preview(data):
    """Display the first few rows of the dataset."""
    st.write("## Dataset Preview")
    st.write(data.head())


def _page_overview():
    """Static landing page."""
    st.header('Overview')
    st.write('This app provides insights and predictions for hotel bookings.')


def _page_revenue_forecasting():
    """Aggregate monthly revenue and fit a user-configured SARIMA model."""
    st.header('Hotel Booking Revenue Forecasting with SARIMA')
    data = _choose_data_source()
    if data is None:
        return
    _show_preview(data)

    # Month-end timestamp for each booking's arrival month.
    data['arrival_date'] = pd.to_datetime(
        data['arrival_date_year'].astype(str) + '-'
        + data['arrival_date_month'].astype(str) + '-01')
    data['arrival_date'] += MonthEnd(0)

    # Revenue proxy: sum of ADR over non-cancelled bookings per month.
    # NOTE(review): summing ADR (a daily rate) ignores length of stay —
    # confirm this is the intended revenue definition.
    monthly_revenue = (data[data['is_canceled'] == 0]
                       .groupby('arrival_date')['adr'].sum().reset_index())

    st.write("## Monthly Revenue")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, ax=ax)
    ax.set_title('Monthly Revenue')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    st.pyplot(fig)

    # Stationarity diagnostic (informational only): differencing, when
    # needed, is expressed through the d/D orders chosen below — the
    # model is always fit on the raw series.
    result = adfuller(monthly_revenue['adr'])
    st.write(f'## ADF Statistic: {result[0]}')
    st.write(f'## p-value: {result[1]}')

    p = st.slider('AR order (p)', 0, 5, 1)
    d = st.slider('Differencing order (d)', 0, 2, 1)
    q = st.slider('MA order (q)', 0, 5, 1)
    P = st.slider('Seasonal AR order (P)', 0, 2, 1)
    D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
    Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)

    model = SARIMAX(monthly_revenue['adr'], order=(p, d, q),
                    seasonal_order=(P, D, Q, 12))  # 12-month seasonality
    model_fit = model.fit(disp=False)

    forecast_steps = 12
    forecast = model_fit.get_forecast(steps=forecast_steps)
    forecast_index = pd.date_range(
        start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1),
        periods=forecast_steps, freq='M')
    forecast_df = pd.DataFrame({'arrival_date': forecast_index,
                                'forecast': forecast.predicted_mean})

    st.write("## Revenue Forecast")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue,
                 label='Historical Revenue', ax=ax)
    sns.lineplot(x='arrival_date', y='forecast', data=forecast_df,
                 label='Forecasted Revenue', ax=ax)
    ax.set_title('Revenue Forecast')
    ax.set_xlabel('Month')
    ax.set_ylabel('Revenue')
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.legend()
    fig.tight_layout()
    st.pyplot(fig)

    st.write("## Forecasted Revenue for the Next 12 Months")
    st.write(forecast_df.set_index('arrival_date'))


def _page_predict_cancellations():
    """Train a random-forest cancellation classifier and predict interactively."""
    st.header('Predict Booking Cancellations')
    data = _choose_data_source()
    if data is None:
        return
    _show_preview(data)

    features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
                'arrival_date_day_of_month', 'stays_in_weekend_nights',
                'stays_in_week_nights', 'adults', 'children', 'babies',
                'previous_cancellations', 'previous_bookings_not_canceled',
                'booking_changes', 'days_in_waiting_list', 'adr',
                'required_car_parking_spaces', 'total_of_special_requests']
    # Drop rows missing any feature or the label (`children` is known to
    # contain NaN in this dataset).
    data = data.dropna(subset=features + ['is_canceled'])
    X = data[features]
    y = data['is_canceled']

    # Hold out 20% for evaluation; fixed seed for reproducible splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # NOTE(review): the model is retrained on every Streamlit rerun;
    # consider st.cache_resource if this becomes slow.
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    st.write("## Model Performance Metrics")
    st.write("### Confusion Matrix")
    st.write(confusion_matrix(y_test, y_pred))
    st.write("### Classification Report")
    cr = classification_report(y_test, y_pred, output_dict=True)
    st.write(pd.DataFrame(cr).transpose())
    st.write("### Accuracy Score")
    st.write(accuracy_score(y_test, y_pred))

    # Persist the trained model next to the app.
    joblib.dump(rf_model, 'rf_model.pkl')
    st.write("Model saved as rf_model.pkl")

    st.write("## Predict Booking Cancellation")
    st.write("Enter the details to predict if a booking will be canceled:")
    input_data = {feature: st.number_input(f"Enter {feature}:", min_value=0.0)
                  for feature in features}
    if st.button("Predict"):
        input_df = pd.DataFrame([input_data])
        # number_input yields floats; coerce each column back to the
        # dtype the model was trained on.
        for feature in features:
            input_df[feature] = input_df[feature].astype(X[feature].dtype)
        prediction = rf_model.predict(input_df)
        prediction_proba = rf_model.predict_proba(input_df)
        st.write(f"Input Data: {input_df}")
        st.write(f"Prediction: {prediction}")
        st.write(f"Prediction Probability: {prediction_proba}")
        if prediction[0] == 1:
            st.write("Prediction: The booking is likely to be canceled.")
        else:
            st.write("Prediction: The booking is not likely to be canceled.")


def _page_market_segmentation():
    """Cluster guests with k-means on a few standardized behavioral features."""
    st.header('Market Segmentation')
    data = _choose_data_source()
    if data is None:
        return
    _show_preview(data)

    data['total_guests'] = data['adults'] + data['children'] + data['babies']
    seg_cols = ['total_guests', 'total_of_special_requests',
                'lead_time', 'is_repeated_guest']
    # StandardScaler/KMeans raise on NaN (e.g. missing `children`
    # propagating into total_guests), so drop incomplete rows first.
    data = data.dropna(subset=seg_cols)
    segmentation_features = data[seg_cols]

    scaler = StandardScaler()
    segmentation_features_scaled = scaler.fit_transform(segmentation_features)
    kmeans = KMeans(n_clusters=4, random_state=42)
    data['customer_segment'] = kmeans.fit_predict(segmentation_features_scaled)

    st.write("## Customer Segmentation Results")
    st.write(data[['customer_segment']].head())

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(x=segmentation_features_scaled[:, 0],
                    y=segmentation_features_scaled[:, 1],
                    hue=data['customer_segment'], palette='viridis', ax=ax)
    ax.set_title('Customer Segmentation')
    ax.set_xlabel('Total Guests (Standardized)')
    ax.set_ylabel('Total Special Requests (Standardized)')
    st.pyplot(fig)


def _page_customer_lifetime_value():
    """Plot the distribution of summed ADR per synthetic customer id."""
    st.header('Customer Lifetime Value')
    data = _choose_data_source()
    if data is None:
        return
    _show_preview(data)

    # NOTE(review): this "customer id" is a synthetic proxy built from
    # booking attributes (no real customer key exists in the data), so
    # "lifetime value" here is per-booking-group ADR, not per-person.
    data['customer_id'] = (data['lead_time'].astype(str) + '_'
                           + data['arrival_date_year'].astype(str) + '_'
                           + data['arrival_date_month'].astype(str) + '_'
                           + data['arrival_date_day_of_month'].astype(str))
    clv_df = data.groupby('customer_id')['adr'].sum().reset_index()
    clv_df.columns = ['customer_id', 'lifetime_value']

    st.write("## Customer Lifetime Value Distribution")
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(clv_df['lifetime_value'], kde=True, ax=ax)
    ax.set_title('Customer Lifetime Value Distribution')
    ax.set_xlabel('Lifetime Value')
    ax.set_ylabel('Frequency')
    st.pyplot(fig)


st.title('Accommodation Booking Analysis')
st.sidebar.title('Navigation')
options = st.sidebar.radio('Select a page:',
                           ['Overview', 'Revenue Forecasting',
                            'Predict Booking Cancellations',
                            'Market Segmentation', 'Customer Lifetime Value'])

# Page dispatch table instead of a long if/elif chain.
_PAGES = {
    'Overview': _page_overview,
    'Revenue Forecasting': _page_revenue_forecasting,
    'Predict Booking Cancellations': _page_predict_cancellations,
    'Market Segmentation': _page_market_segmentation,
    'Customer Lifetime Value': _page_customer_lifetime_value,
}
_PAGES[options]()