Spaces:

mayankraghav
/

project

Runtime error

App Files Files Community

mayankraghav committed on Jul 21

Commit

2c2ffac

•

1 Parent(s): aea9fdb

Changes to hotel booking prediction

Browse files

Files changed (2) hide show

app.py +128 -35
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,24 +1,35 @@
-# app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
 import matplotlib.pyplot as plt
 import seaborn as sns
 from datetime import timedelta
 from pandas.tseries.offsets import MonthEnd
 from statsmodels.tsa.statespace.sarimax import SARIMAX
 from statsmodels.tsa.stattools import adfuller
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
-# Streamlit app
 st.title('Hotel Booking Analysis')
-# Navigation
 st.sidebar.title('Navigation')
 options = st.sidebar.radio('Select a page:', ['Overview', 'Revenue Forecasting', 'Predict Booking Cancellations', 'Market Segmentation', 'Customer Lifetime Value'])
@@ -29,10 +40,21 @@ if options == 'Overview':
 elif options == 'Revenue Forecasting':
     st.header('Hotel Booking Revenue Forecasting with SARIMA')
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    if uploaded_file is not None:
-        data = pd.read_csv(uploaded_file)
         st.write("## Dataset Preview")
         st.write(data.head())
@@ -97,53 +119,112 @@ elif options == 'Revenue Forecasting':
 elif options == 'Predict Booking Cancellations':
     st.header('Predict Booking Cancellations')
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    if uploaded_file is not None:
-        data = pd.read_csv(uploaded_file)
         st.write("## Dataset Preview")
         st.write(data.head())
         features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
                     'arrival_date_day_of_month', 'stays_in_weekend_nights',
                     'stays_in_week_nights', 'adults', 'children', 'babies',
-                    'is_repeated_guest', 'previous_cancellations',
-                    'previous_bookings_not_canceled', 'booking_changes',
-                    'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
-                    'total_of_special_requests']
         X = data[features]
         y = data['is_canceled']
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-        model = RandomForestClassifier(n_estimators=100, random_state=42)
-        model.fit(X_train, y_train)
-        y_pred = model.predict(X_test)
-        accuracy = accuracy_score(y_test, y_pred)
-        st.write(f'Model Accuracy: {accuracy:.2f}')
-        st.write("## Predict Booking Cancellation for New Data")
         input_data = {}
-        for col in features:
-            input_data[col] = float(st.text_input(f'{col}:', value='0'))
-        input_df = pd.DataFrame(input_data, index=[0])
-        prediction = model.predict(input_df)
-        st.write('Prediction:', 'Canceled' if prediction[0] else 'Not Canceled')
 elif options == 'Market Segmentation':
     st.header('Market Segmentation')
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    if uploaded_file is not None:
-        data = pd.read_csv(uploaded_file)
         st.write("## Dataset Preview")
         st.write(data.head())
         data['total_guests'] = data['adults'] + data['children'] + data['babies']
         segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
         scaler = StandardScaler()
@@ -165,10 +246,21 @@ elif options == 'Market Segmentation':
 elif options == 'Customer Lifetime Value':
     st.header('Customer Lifetime Value')
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    if uploaded_file is not None:
-        data = pd.read_csv(uploaded_file)
         st.write("## Dataset Preview")
         st.write(data.head())
@@ -183,3 +275,4 @@ elif options == 'Customer Lifetime Value':
         plt.xlabel('Lifetime Value')
         plt.ylabel('Frequency')
         st.pyplot(plt)

+import os
 import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
 import matplotlib.pyplot as plt
 import seaborn as sns
+import boto3
+from io import StringIO
 from datetime import timedelta
 from pandas.tseries.offsets import MonthEnd
 from statsmodels.tsa.statespace.sarimax import SARIMAX
 from statsmodels.tsa.stattools import adfuller
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
+import joblib
+os.environ['AWS_ACCESS_KEY_ID'] = os.getenv("getdata")
+os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv("getdatake")
+def load_data_from_s3(bucket_name, file_key):
+    s3 = boto3.client('s3')
+    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
+    data = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))
+    return data
 st.title('Hotel Booking Analysis')
 st.sidebar.title('Navigation')
 options = st.sidebar.radio('Select a page:', ['Overview', 'Revenue Forecasting', 'Predict Booking Cancellations', 'Market Segmentation', 'Customer Lifetime Value'])
 elif options == 'Revenue Forecasting':
     st.header('Hotel Booking Revenue Forecasting with SARIMA')
+    # Option to choose data source
+    data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
+    if data_source == "Upload CSV":
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+        if uploaded_file is not None:
+            data = pd.read_csv(uploaded_file)
+    else:
+        bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
+        file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
+        if st.button("Load Data"):
+            data = load_data_from_s3(bucket_name, file_key)
+    if 'data' in locals():
+        # Display the first few rows of the dataset
         st.write("## Dataset Preview")
         st.write(data.head())
 elif options == 'Predict Booking Cancellations':
     st.header('Predict Booking Cancellations')
+    # Option to choose data source
+    data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
+    if data_source == "Upload CSV":
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+        if uploaded_file is not None:
+            data = pd.read_csv(uploaded_file)
+    else:
+        bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
+        file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
+        if st.button("Load Data"):
+            data = load_data_from_s3(bucket_name, file_key)
+    if 'data' in locals():
+        # Display the first few rows of the dataset
         st.write("## Dataset Preview")
         st.write(data.head())
         features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number',
                     'arrival_date_day_of_month', 'stays_in_weekend_nights',
                     'stays_in_week_nights', 'adults', 'children', 'babies',
+                    'previous_cancellations', 'previous_bookings_not_canceled',
+                    'booking_changes', 'days_in_waiting_list', 'adr',
+                    'required_car_parking_spaces', 'total_of_special_requests']
+        data = data.dropna(subset=features + ['is_canceled'])
         X = data[features]
         y = data['is_canceled']
+        # Split the data into training and testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        # Train the Random Forest Classifier
+        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
+        rf_model.fit(X_train, y_train)
+        # Make predictions
+        y_pred = rf_model.predict(X_test)
+        # Display model performance metrics
+        st.write("## Model Performance Metrics")
+        st.write("### Confusion Matrix")
+        cm = confusion_matrix(y_test, y_pred)
+        st.write(cm)
+        st.write("### Classification Report")
+        cr = classification_report(y_test, y_pred, output_dict=True)
+        st.write(pd.DataFrame(cr).transpose())
+        st.write("### Accuracy Score")
+        acc = accuracy_score(y_test, y_pred)
+        st.write(acc)
+        # Save the model to a file
+        joblib.dump(rf_model, 'rf_model.pkl')
+        st.write("Model saved as rf_model.pkl")
+        st.write("## Predict Booking Cancellation")
+        st.write("Enter the details to predict if a booking will be canceled:")
+        # Collect user input for prediction
         input_data = {}
+        for feature in features:
+            input_data[feature] = st.number_input(f"Enter {feature}:", min_value=0.0)
+        if st.button("Predict"):
+            input_df = pd.DataFrame([input_data])
+            # Ensure the input data has the correct data types
+            for feature in features:
+                input_df[feature] = input_df[feature].astype(X[feature].dtype)
+            prediction = rf_model.predict(input_df)
+            prediction_proba = rf_model.predict_proba(input_df)
+            st.write(f"Input Data: {input_df}")
+            st.write(f"Prediction: {prediction}")
+            st.write(f"Prediction Probability: {prediction_proba}")
+            if prediction[0] == 1:
+                st.write("Prediction: The booking is likely to be canceled.")
+            else:
+                st.write("Prediction: The booking is not likely to be canceled.")
 elif options == 'Market Segmentation':
     st.header('Market Segmentation')
+    # Option to choose data source
+    data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
+    if data_source == "Upload CSV":
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+        if uploaded_file is not None:
+            data = pd.read_csv(uploaded_file)
+    else:
+        bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
+        file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
+        if st.button("Load Data"):
+            data = load_data_from_s3(bucket_name, file_key)
+    if 'data' in locals():
+        # Display the first few rows of the dataset
         st.write("## Dataset Preview")
         st.write(data.head())
         data['total_guests'] = data['adults'] + data['children'] + data['babies']
         segmentation_features = data[['total_guests', 'total_of_special_requests', 'lead_time', 'is_repeated_guest']]
         scaler = StandardScaler()
 elif options == 'Customer Lifetime Value':
     st.header('Customer Lifetime Value')
+    # Option to choose data source
+    data_source = st.radio("Choose data source:", ["Upload CSV", "Load from AWS S3"])
+    if data_source == "Upload CSV":
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+        if uploaded_file is not None:
+            data = pd.read_csv(uploaded_file)
+    else:
+        bucket_name = st.text_input("Enter S3 bucket name:", "iitj-ap-south-1-mayank")
+        file_key = st.text_input("Enter S3 file key:", "clean/hotel_booking/hotel_booking.csv")
+        if st.button("Load Data"):
+            data = load_data_from_s3(bucket_name, file_key)
+    if 'data' in locals():
+        # Display the first few rows of the dataset
         st.write("## Dataset Preview")
         st.write(data.head())
         plt.xlabel('Lifetime Value')
         plt.ylabel('Frequency')
         st.pyplot(plt)

requirements.txt CHANGED Viewed

@@ -4,4 +4,6 @@ numpy
 matplotlib
 seaborn
 scikit-learn
-statsmodels

 matplotlib
 seaborn
 scikit-learn
+statsmodels
+boto3
+joblib