# Hotel Booking Analysis — Streamlit dashboard.
# (Original file was scraped from a hosted-app page; "Spaces: Running" header
# and per-line " | |" artifacts removed, indentation reconstructed.)
# --- Imports (stdlib / third-party, grouped per PEP 8) -----------------------
import os
import pickle
from datetime import timedelta
from io import StringIO

import boto3
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from pandas.tseries.offsets import MonthEnd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

# AWS credentials are supplied through the environment (e.g. deployment
# secrets named "getdata"/"getdatake").  The original code assigned
# os.getenv(...) straight into os.environ, which raises TypeError when the
# secret is missing (os.environ values must be str, not None) — so only set
# the standard AWS variables when both secrets are actually present.
_aws_access_key = os.getenv("getdata")
_aws_secret_key = os.getenv("getdatake")
if _aws_access_key and _aws_secret_key:
    os.environ['AWS_ACCESS_KEY_ID'] = _aws_access_key
    os.environ['AWS_SECRET_ACCESS_KEY'] = _aws_secret_key
def load_data_from_s3(bucket_name, file_key):
    """Download a CSV object from S3 and return it as a DataFrame.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket to read from.
    file_key : str
        Key (path) of the CSV object inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    # The S3 body is raw bytes; decode to text before handing it to pandas.
    return pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))
# --- Page shell and navigation ----------------------------------------------
st.title('Hotel Booking Analysis')
st.sidebar.title('Navigation')
options = st.sidebar.radio(
    'Select a page:',
    ['Overview', 'Revenue Forecasting', 'Predict Booking Cancellations',
     'Market Segmentation', 'Customer Lifetime Value'])

if options == 'Overview':
    # Landing page: static description only.
    st.header('Overview')
    st.write('This app provides insights and predictions for hotel bookings.')
elif options == 'Revenue Forecasting': | |
st.header('Hotel Booking Revenue Forecasting with SARIMA') | |
# Option to choose data source | |
data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"]) | |
if data_source == "Upload CSV": | |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv") | |
if uploaded_file is not None: | |
data = pd.read_csv(uploaded_file) | |
else: | |
bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank") | |
file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv") | |
if st.button("Load Data"): | |
data = load_data_from_s3(bucket_name, file_key) | |
if 'data' in locals(): | |
# Display the first few rows of the dataset | |
st.write("## Dataset Preview") | |
st.write(data.head()) | |
data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' + | |
data['arrival_date_month'].astype(str) + '-01') | |
data['arrival_date'] += MonthEnd(0) | |
monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index() | |
st.write("## Monthly Revenue") | |
plt.figure(figsize=(12, 6)) | |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue) | |
plt.title('Monthly Revenue') | |
plt.xlabel('Month') | |
plt.ylabel('Revenue') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
result = adfuller(monthly_revenue['adr']) | |
st.write(f'## ADF Statistic: {result[0]}') | |
st.write(f'## p-value: {result[1]}') | |
if result[1] > 0.05: | |
monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna() | |
p = st.slider('AR order (p)', 0, 5, 1) | |
d = st.slider('Differencing order (d)', 0, 2, 1) | |
q = st.slider('MA order (q)', 0, 5, 1) | |
P = st.slider('Seasonal AR order (P)', 0, 2, 1) | |
D = st.slider('Seasonal differencing order (D)', 0, 2, 1) | |
Q = st.slider('Seasonal MA order (Q)', 0, 2, 1) | |
model = SARIMAX(monthly_revenue['adr'], | |
order=(p, d, q), | |
seasonal_order=(P, D, Q, 12)) | |
model_fit = model.fit(disp=False) | |
forecast_steps = 12 | |
forecast = model_fit.get_forecast(steps=forecast_steps) | |
forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max() + pd.DateOffset(months=1), | |
periods=forecast_steps, freq='M') | |
forecast_df = pd.DataFrame({'arrival_date': forecast_index, | |
'forecast': forecast.predicted_mean}) | |
st.write("## Revenue Forecast") | |
plt.figure(figsize=(12, 6)) | |
sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue') | |
sns.lineplot(x='arrival_date', y='forecast', data=forecast_df, label='Forecasted Revenue') | |
plt.title('Revenue Forecast') | |
plt.xlabel('Month') | |
plt.ylabel('Revenue') | |
plt.xticks(rotation=45) | |
plt.legend() | |
plt.tight_layout() | |
st.pyplot(plt) | |
st.write("## Forecasted Revenue for the Next 12 Months") | |
st.write(forecast_df.set_index('arrival_date')) | |
elif options == 'Predict Booking Cancellations': | |
st.header('Predict Booking Cancellations') | |
# Option to choose data source | |
data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"]) | |
if data_source == "Upload CSV": | |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv") | |
if uploaded_file is not None: | |
data = pd.read_csv(uploaded_file) | |
else: | |
bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank") | |
file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv") | |
if st.button("Load Data"): | |
data = load_data_from_s3(bucket_name, file_key) | |
if 'data' in locals(): | |
# Display the first few rows of the dataset | |
st.write("## Dataset Preview") | |
st.write(data.head()) | |
features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number', | |
'arrival_date_day_of_month', 'stays_in_weekend_nights', | |
'stays_in_week_nights', 'adults', 'children', 'babies', | |
'previous_cancellations', 'previous_bookings_not_canceled', | |
'booking_changes', 'days_in_waiting_list', 'adr', | |
'required_car_parking_spaces', 'total_of_special_requests'] | |
data = data.dropna(subset=features + ['is_canceled']) | |
X = data[features] | |
y = data['is_canceled'] | |
# Split the data into training and testing sets | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
# Train the Random Forest Classifier | |
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) | |
rf_model.fit(X_train, y_train) | |
# Make predictions | |
y_pred = rf_model.predict(X_test) | |
# Display model performance metrics | |
st.write("## Model Performance Metrics") | |
st.write("### Confusion Matrix") | |
cm = confusion_matrix(y_test, y_pred) | |
st.write(cm) | |
st.write("### Classification Report") | |
cr = classification_report(y_test, y_pred, output_dict=True) | |
st.write(pd.DataFrame(cr).transpose()) | |
st.write("### Accuracy Score") | |
acc = accuracy_score(y_test, y_pred) | |
st.write(acc) | |
# Save the model to a file | |
joblib.dump(rf_model, 'rf_model.pkl') | |
st.write("Model saved as rf_model.pkl") | |
st.write("## Predict Booking Cancellation") | |
st.write("Enter the details to predict if a booking will be canceled:") | |
# Collect user input for prediction | |
input_data = {} | |
for feature in features: | |
input_data[feature] = st.number_input(f"Enter {feature}:", min_value=0.0) | |
if st.button("Predict"): | |
input_df = pd.DataFrame([input_data]) | |
# Ensure the input data has the correct data types | |
for feature in features: | |
input_df[feature] = input_df[feature].astype(X[feature].dtype) | |
prediction = rf_model.predict(input_df) | |
prediction_proba = rf_model.predict_proba(input_df) | |
st.write(f"Input Data: {input_df}") | |
st.write(f"Prediction: {prediction}") | |
st.write(f"Prediction Probability: {prediction_proba}") | |
if prediction[0] == 1: | |
st.write("Prediction: The booking is likely to be canceled.") | |
else: | |
st.write("Prediction: The booking is not likely to be canceled.") | |
elif options == 'Market Segmentation': | |
st.header('Market Segmentation') | |
# Option to choose data source | |
data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"]) | |
if data_source == "Upload CSV": | |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv") | |
if uploaded_file is not None: | |
data = pd.read_csv(uploaded_file) | |
else: | |
bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank") | |
file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv") | |
if st.button("Load Data"): | |
data = load_data_from_s3(bucket_name, file_key) | |
if 'data' in locals(): | |
# Display the first few rows of the dataset | |
st.write("## Dataset Preview") | |
st.write(data.head()) | |
data['total_guests'] = data['adults'] + data['children'] + data['babies'] | |
segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']] | |
scaler = StandardScaler() | |
segmentation_features_scaled = scaler.fit_transform(segmentation_features) | |
kmeans = KMeans(n_clusters=4, random_state=42) | |
data['customer_segment'] = kmeans.fit_predict(segmentation_features_scaled) | |
st.write("## Customer Segmentation Results") | |
st.write(data[['customer_segment']].head()) | |
plt.figure(figsize=(10, 5)) | |
sns.scatterplot(x=segmentation_features_scaled[:, 0], y=segmentation_features_scaled[:, 1], hue=data['customer_segment'], palette='viridis') | |
plt.title('Customer Segmentation') | |
plt.xlabel('Total Guests (Standardized)') | |
plt.ylabel('Total Special Requests (Standardized)') | |
st.pyplot(plt) | |
elif options == 'Customer Lifetime Value': | |
st.header('Customer Lifetime Value') | |
# Option to choose data source | |
data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"]) | |
if data_source == "Upload CSV": | |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv") | |
if uploaded_file is not None: | |
data = pd.read_csv(uploaded_file) | |
else: | |
bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank") | |
file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv") | |
if st.button("Load Data"): | |
data = load_data_from_s3(bucket_name, file_key) | |
if 'data' in locals(): | |
# Display the first few rows of the dataset | |
st.write("## Dataset Preview") | |
st.write(data.head()) | |
data['customer_id'] = data['lead_time'].astype(str) + '_' + data['arrival_date_year'].astype(str) + '_' + data['arrival_date_month'].astype(str) + '_' + data['arrival_date_day_of_month'].astype(str) | |
clv_df = data.groupby('customer_id')['adr'].sum().reset_index() | |
clv_df.columns = ['customer_id', 'lifetime_value'] | |
st.write("## Customer Lifetime Value Distribution") | |
plt.figure(figsize=(10, 5)) | |
sns.histplot(clv_df['lifetime_value'], kde=True) | |
plt.title('Customer Lifetime Value Distribution') | |
plt.xlabel('Lifetime Value') | |
plt.ylabel('Frequency') | |
st.pyplot(plt) | |