|
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
|
|
|
|
|
|
@st.cache_data
def load_data():
    """Read ``insurance.csv`` (relative path) into a pandas DataFrame.

    The function is wrapped in ``st.cache_data``.
    """
    return pd.read_csv("insurance.csv")
|
|
|
|
data = load_data()

st.title("Medical Insurance Cost Prediction with Hybrid Model")
st.write("Dataset preview:")
st.write(data.head())

st.subheader("Data Preprocessing and Feature Engineering")

# Interaction terms: age and BMI count only for rows whose `smoker` value is
# the string 'yes'. The indicator is computed once, vectorised, and must be
# derived BEFORE `smoker` is label-encoded below.
is_smoker = (data['smoker'] == 'yes').astype(int)
data['age_smoker'] = data['age'] * is_smoker
data['bmi_smoker'] = data['bmi'] * is_smoker

# Replace the categorical columns with integer codes. The single encoder is
# refitted per column, so after the loop it only remembers the 'region' fit.
label_encoder = LabelEncoder()
for column in ('sex', 'smoker', 'region'):
    data[column] = label_encoder.fit_transform(data[column])

feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'age_smoker', 'bmi_smoker']
scaled_columns = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']

# Explicit copy: X is assigned into below, and writing into a column
# selection of `data` triggers pandas' chained-assignment warning.
X = data[feature_columns].copy()
y = data['charges']

# Split BEFORE fitting the scaler so that the held-out rows do not contribute
# to the mean/variance used for standardisation (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
scaler.fit(X_train[scaled_columns])

# Apply the training-set statistics everywhere. X itself is kept in scaled
# form as well, matching what later code reading X has always seen.
for frame in (X, X_train, X_test):
    frame[scaled_columns] = scaler.transform(frame[scaled_columns])
|
|
|
|
|
|
def create_neural_network(input_dim=None):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(1), compiled with Adam (learning rate 0.001) and MSE loss.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            module-level ``X_train`` is used, which is what the function
            always did before this parameter existed.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which current Keras versions warn about.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1),
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
|
|
|
|
# Streamlit re-executes this script on every widget interaction. Without
# caching, each change to an input box or click on the predict button would
# retrain both models from scratch, so the two fits are wrapped in
# st.cache_resource and only rerun when their inputs change.
@st.cache_resource
def _fit_neural_network(features, targets):
    """Train the network from create_neural_network() on the given data."""
    model = create_neural_network()
    model.fit(features, targets, epochs=50, batch_size=32, validation_split=0.2, verbose=1)
    return model


@st.cache_resource
def _fit_random_forest(features, targets):
    """Train the random forest on features that include the `nn_pred` column."""
    model = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42)
    model.fit(features, targets)
    return model


st.subheader("Training the Neural Network")
nn_model = _fit_neural_network(X_train, y_train)

# The network's own predictions become one extra input feature of the forest.
nn_train_pred = nn_model.predict(X_train).flatten()
nn_test_pred = nn_model.predict(X_test).flatten()

X_train_rf = X_train.copy()
X_test_rf = X_test.copy()
X_train_rf['nn_pred'] = nn_train_pred
X_test_rf['nn_pred'] = nn_test_pred

st.subheader("Training the Random Forest with Neural Network Predictions")
rf_model = _fit_random_forest(X_train_rf, y_train)
final_predictions = rf_model.predict(X_test_rf)

# Hold-out metrics; R² is reported as a percentage.
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
r2 = r2_score(y_test, final_predictions) * 100
st.write(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
st.write(f"R² (Accuracy): {r2:.2f}%")

st.subheader("Actual vs Predicted Values")
# Draw on an explicit Figure and close it once rendered, so figures do not
# pile up in pyplot's global state across reruns.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values, label="Actual Values", color='blue')
ax.plot(final_predictions, label="Predicted Values", color='orange')
ax.set_xlabel("Sample Index")
ax.set_ylabel("Insurance Charges")
ax.legend()
st.pyplot(fig)
plt.close(fig)

st.subheader("Predict on New Data")
# Inputs are entered in original units and standardised below, so the
# defaults must come from the unscaled `data` frame. X is already
# standardised; using its means would pre-fill values near 0 that are then
# scaled a second time.
# NOTE(review): age_smoker / bmi_smoker are typed in by hand here; they should
# equal age * smoker and bmi * smoker for the input to be self-consistent.
input_data = {
    col: st.number_input(f"Enter {col}:", value=float(data[col].mean()))
    for col in X.columns
}

if st.button("Predict Insurance Charge"):
    # Same columns, in the same order, that the scaler was fitted on.
    columns_to_scale = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']

    input_df = pd.DataFrame([input_data])
    input_df[columns_to_scale] = scaler.transform(input_df[columns_to_scale])

    # Stage 1: network prediction, appended as the `nn_pred` feature.
    nn_feature = nn_model.predict(input_df).flatten()
    input_df['nn_pred'] = nn_feature

    # Stage 2: the forest produces the final estimate.
    final_prediction = rf_model.predict(input_df)
    st.write(f"Predicted Insurance Charge: ${final_prediction[0]:.2f}")
|
|
|