Spaces:

mayankraghav
/

project

Runtime error

App Files Files Community

project / app.py

mayankraghav

Add application file

b965b34 2 months ago

raw

history blame

No virus

6.19 kB

	# app.py
	import streamlit as st
	import pandas as pd
	import numpy as np
	import pickle
	import matplotlib.pyplot as plt
	import seaborn as sns
	from datetime import timedelta
	from pandas.tseries.offsets import MonthEnd
	from statsmodels.tsa.statespace.sarimax import SARIMAX
	from statsmodels.tsa.stattools import adfuller

	# NOTE: the pickled forecast model ('./revenue_forcast.pkl') is deliberately
	# not loaded: nothing in this file reads `arima_model`, and the
	# 'Revenue Forecasting' page fits its own SARIMAX model from the uploaded CSV.
	# If the load is ever restored, only unpickle files from a trusted source.

	# Load the bookings dataset that the cancellation, segmentation and
	# lifetime-value pages read through the module-level name `df`.
	DATA_PATH = './Dataset/hotel_booking.csv'
	try:
	    df = pd.read_csv(DATA_PATH)
	except FileNotFoundError:
	    # Fall back to an empty frame so `df` is always defined and the pages
	    # that do not depend on it (overview, forecasting) can still render.
	    st.warning(f'Dataset not found at {DATA_PATH}; pages that rely on it will be unavailable.')
	    df = pd.DataFrame()

	# Preprocess data for Streamlit
	numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
	categorical_cols = df.select_dtypes(include=['object']).columns
	# Store text columns with pandas' categorical dtype.
	for col in categorical_cols:
	    df[col] = df[col].astype('category')

	# Main page title shown above every page.
	st.title('Hotel Booking Analysis')

	# Sidebar navigation: a radio group whose selected label is kept in
	# `options`; the if/elif chain that follows dispatches on that label.
	page_labels = [
	    'Overview',
	    'Revenue Forecasting',
	    'Predict Booking Cancellations',
	    'Market Segmentation',
	    'Customer Lifetime Value',
	]
	sidebar = st.sidebar
	sidebar.title('Navigation')
	options = sidebar.radio('Select a page:', page_labels)

	if options == 'Overview':
	    # Landing page: a header and a one-line description of the app.
	    st.header('Overview')
	    st.write('This app provides insights and predictions for hotel bookings.')

	elif options == 'Revenue Forecasting':
	# Streamlit app title
	st.title('Hotel Booking Revenue Forecasting with SARIMA')

	# File uploader
	uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

	if uploaded_file is not None:
	# Load the dataset
	data = pd.read_csv(uploaded_file)

	# Display the first few rows of the dataset
	st.write("## Dataset Preview")
	st.write(data.head())

	# Convert arrival_date_year and arrival_date_month to a datetime format
	data['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(str) + '-' +
	data['arrival_date_month'].astype(str) + '-01')
	data['arrival_date'] += MonthEnd(0)

	# Calculate monthly revenue
	monthly_revenue = data[data['is_canceled'] == 0].groupby('arrival_date')['adr'].sum().reset_index()

	# Plot monthly revenue
	st.write("## Monthly Revenue")
	plt.figure(figsize=(12, 6))
	sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue)
	plt.title('Monthly Revenue')
	plt.xlabel('Month')
	plt.ylabel('Revenue')
	plt.xticks(rotation=45)
	plt.tight_layout()
	st.pyplot(plt)

	# Check for stationarity
	result = adfuller(monthly_revenue['adr'])
	st.write(f'## ADF Statistic: {result[0]}')
	st.write(f'## p-value: {result[1]}')

	# If the series is not stationary, take the first difference
	monthly_revenue['adr_diff'] = monthly_revenue['adr'].diff().dropna()

	# Model parameters
	p = st.slider('AR order (p)', 0, 5, 1)
	d = st.slider('Differencing order (d)', 0, 2, 1)
	q = st.slider('MA order (q)', 0, 5, 1)
	P = st.slider('Seasonal AR order (P)', 0, 2, 1)
	D = st.slider('Seasonal differencing order (D)', 0, 2, 1)
	Q = st.slider('Seasonal MA order (Q)', 0, 2, 1)

	# Fit the SARIMA model with user-defined parameters
	model = SARIMAX(monthly_revenue['adr'],
	order=(p, d, q),
	seasonal_order=(P, D, Q, 12))
	model_fit = model.fit(disp=False)

	# Make predictions
	forecast_steps = 12 # Forecast for the next 12 months
	forecast = model_fit.get_forecast(steps=forecast_steps)
	forecast_index = pd.date_range(start=monthly_revenue['arrival_date'].max(),
	periods=forecast_steps, freq='M')

	forecast_df = pd.DataFrame({'arrival_date': forecast_index,
	'forecast': forecast.predicted_mean})

	# Plot the results
	st.write("## Revenue Forecast")
	plt.figure(figsize=(12, 6))
	sns.lineplot(x='arrival_date', y='adr', data=monthly_revenue, label='Historical Revenue')
	sns.lineplot(x='arrival_date', y='forecast', data=forecast_df, label='Forecasted Revenue')
	plt.title('Revenue Forecast')
	plt.xlabel('Month')
	plt.ylabel('Revenue')
	plt.xticks(rotation=45)
	plt.legend()
	plt.tight_layout()
	st.pyplot(plt)

	# Display forecasted values
	st.write("## Forecasted Revenue for the Next 12 Months")
	st.write(forecast_df.set_index('arrival_date'))


	elif options == 'Predict Booking Cancellations':
	st.header('Predict Booking Cancellations')
	st.write('Provide input data to predict if a booking will be canceled.')

	input_data = {}
	for col in df.drop(columns=['is_canceled']).columns:
	input_data[col] = st.text_input(f'{col}:', value='0')

	input_df = pd.DataFrame(input_data, index=[0])
	prediction = random_forest_model.predict(input_df)
	st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')

	elif options == 'Market Segmentation':
	st.header('Market Segmentation')
	segmentation_features = df[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
	scaler = StandardScaler()
	segmentation_features_scaled = scaler.fit_transform(segmentation_features)

	kmeans = KMeans(n_clusters=4, random_state=42)
	df['customer_segment'] = kmeans.fit_predict(segmentation_features_scaled)

	plt.figure(figsize=(10, 5))
	sns.scatterplot(x=segmentation_features_scaled[:, 0], y=segmentation_features_scaled[:, 1], hue=df['customer_segment'], palette='viridis')
	plt.title('Customer Segmentation')
	plt.xlabel('Total Guests (Standardized)')
	plt.ylabel('Total Special Requests (Standardized)')
	st.pyplot(plt)

	elif options == 'Customer Lifetime Value':
	st.header('Customer Lifetime Value')
	clv_df = df.groupby('customer_id')['revenue'].sum().reset_index()
	clv_df.columns = ['customer_id', 'lifetime_value']

	plt.figure(figsize=(10, 5))
	sns.histplot(clv_df['lifetime_value'], kde=True)
	plt.title('Customer Lifetime Value Distribution')
	plt.xlabel('Lifetime Value')
	plt.ylabel('Frequency')
	st.pyplot(plt)