# Rahul-Crudcook's picture
# Upload 3 files
# fc8f2da verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
# Load the dataset (wrapped in st.cache_data so repeated script runs reuse the result)
@st.cache_data
def load_data(path: str = "insurance.csv") -> pd.DataFrame:
    """Read the insurance dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to "insurance.csv", which
            must then sit in the directory the app is launched from.

    Returns:
        A DataFrame holding the contents of the CSV file.
    """
    return pd.read_csv(path)
# Load the dataset and show a preview
data = load_data()
st.title("Medical Insurance Cost Prediction with Hybrid Model")
st.write("Dataset preview:")
st.write(data.head())

# Preprocessing and Feature Engineering
st.subheader("Data Preprocessing and Feature Engineering")
# Interaction terms: age and BMI contribute only on rows whose smoker value is
# 'yes'. They are built here, before `smoker` is label-encoded, while the
# column still holds the raw strings.
is_smoker = (data['smoker'] == 'yes').astype(int)
data['age_smoker'] = data['age'] * is_smoker
data['bmi_smoker'] = data['bmi'] * is_smoker

# Encode categorical variables. One encoder instance is refit per column, so
# after this loop `label_encoder` only remembers the classes of 'region'.
label_encoder = LabelEncoder()
for categorical_column in ('sex', 'smoker', 'region'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# Select features (X is left unscaled; only the train/test frames are scaled)
X = data[['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'age_smoker', 'bmi_smoker']]
y = data['charges']

# Split data first so that the scaler below never sees the test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Work on explicit copies: assigning into frames sliced from `data` would
# otherwise trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features: fit on the training rows only, then apply the
# same transformation to the test rows (avoids leaking test statistics).
scaled_columns = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']
scaler = StandardScaler()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])
# Define the neural network model
def create_neural_network(input_dim=None):
    """Build and compile the feed-forward regression network.

    Layers: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) -> Dense(1).
    The model is compiled with Adam (learning rate 0.001) and MSE loss.

    Args:
        input_dim: Number of input features. When omitted, the column count of
            the module-level ``X_train`` is used, which is what this function
            always did before the parameter existed.

    Returns:
        The compiled, not yet trained, ``Sequential`` model.
    """
    if input_dim is None:
        # Fall back to the global training frame defined earlier in the script.
        input_dim = X_train.shape[1]
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
@st.cache_resource
def _train_hybrid_models(train_features, train_target):
    """Fit the neural network and the Random Forest stacked on top of it.

    Streamlit re-executes the whole script on every widget interaction. Caching
    the fitted models means they are trained once per distinct training set
    instead of on every rerun, so the reported metrics and the model used for
    the interactive prediction stay the same between interactions.

    Args:
        train_features: Training feature frame passed to both models.
        train_target: Training target values.

    Returns:
        Tuple ``(network, forest, stacked_features)``: the fitted Keras model,
        the fitted RandomForestRegressor, and the training frame extended with
        the network's in-sample predictions in an ``nn_pred`` column.
    """
    network = create_neural_network()
    network.fit(train_features, train_target, epochs=50, batch_size=32, validation_split=0.2, verbose=1)
    # Add NN predictions as a new feature for Random Forest.
    # NOTE(review): these are in-sample predictions, so the forest may lean on
    # `nn_pred` more than it should; out-of-fold predictions would be stricter.
    stacked_features = train_features.copy()
    stacked_features['nn_pred'] = network.predict(train_features).flatten()
    forest = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42)
    forest.fit(stacked_features, train_target)
    return network, forest, stacked_features


st.subheader("Training the Neural Network")
nn_model, rf_model, X_train_rf = _train_hybrid_models(X_train, y_train)
nn_train_pred = X_train_rf['nn_pred'].to_numpy()

# Train a Random Forest on this new feature set (done inside the cached helper)
st.subheader("Training the Random Forest with Neural Network Predictions")

# Generate the stacked feature set for the test rows and predict with the forest
nn_test_pred = nn_model.predict(X_test).flatten()
X_test_rf = X_test.copy()
X_test_rf['nn_pred'] = nn_test_pred
final_predictions = rf_model.predict(X_test_rf)

# Model evaluation (R² is reported as a percentage)
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
r2 = r2_score(y_test, final_predictions) * 100
st.write(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
st.write(f"R² (Accuracy): {r2:.2f}%")

# Plot actual vs predicted values
st.subheader("Actual vs Predicted Values")
# Draw on an explicit Figure and close it afterwards: passing the pyplot module
# to st.pyplot leaves one more open figure behind on every script rerun.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values, label="Actual Values", color='blue')
ax.plot(final_predictions, label="Predicted Values", color='orange')
ax.set_xlabel("Sample Index")
ax.set_ylabel("Insurance Charges")
ax.legend()
st.pyplot(fig)
plt.close(fig)

# Prediction on new data
st.subheader("Predict on New Data")
# Defaults are taken from `data`, which holds the unscaled values: the inputs
# are passed through `scaler.transform` below, so they must be entered in the
# original units rather than in standardized units.
# NOTE(review): 'age_smoker' and 'bmi_smoker' are entered by hand; they should
# equal age * smoker and bmi * smoker to be consistent with the training data.
input_data = {col: st.number_input(f"Enter {col}:", value=float(data[col].mean())) for col in X.columns}
if st.button("Predict Insurance Charge"):
    input_df = pd.DataFrame([input_data])
    numeric_columns = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']
    input_df[numeric_columns] = scaler.transform(input_df[numeric_columns])
    # Use neural network to predict intermediate feature
    nn_feature = nn_model.predict(input_df).flatten()
    input_df['nn_pred'] = nn_feature
    # Predict final charge using Random Forest
    final_prediction = rf_model.predict(input_df)
    st.write(f"Predicted Insurance Charge: ${final_prediction[0]:.2f}")