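# Residue from the hosting site's file-viewer page (not part of the script),
# kept here as comments for provenance:
#   kothariyashhh's picture
#   Upload 7 files
#   4386418 verified
#   raw
#   history blame contribute delete
#   No virus
#   3.02 kB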
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Suppress every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")

# Load and preprocess data: read the claims CSV, discard the "_c39" column and
# convert the '?' placeholder into a real missing value (NaN).
data = (
    pd.read_csv("dataset/insurance_claims.csv")
    .drop(columns="_c39")
    .replace('?', np.nan)
)
# Per-column data-quality summary
def check_data(data):
    """Summarise every column of a DataFrame for a quick quality check.

    Args:
        data: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``data`` with the columns
        ``type`` (dtype), ``amount_unique`` (number of distinct non-null
        values), ``unique_values`` (the distinct values as returned by
        ``Series.unique()``, so NaN is included when present),
        ``null_values`` (count of missing entries) and
        ``percentage_null_values(%)`` (missing share in percent, rounded to
        two decimals).
    """
    # Count the missing entries once and reuse the result for both the
    # absolute and the relative column.
    null_counts = data.isna().sum()
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': [data[x].unique() for x in data.columns],
        'null_values': null_counts,
        'percentage_null_values(%)': ((null_counts / data.shape[0]) * 100).round(2),
    })
# Print the per-column summary, columns with the most missing values first.
print(check_data(data).sort_values("null_values", ascending=False))

# Fill missing values with each column's most frequent value (mode).
for column in data.columns:
    modes = data[column].mode()
    if modes.empty:
        # mode() ignores NaN, so a column that is entirely missing has no
        # mode and `.iloc[0]` would raise IndexError. There is nothing
        # sensible to impute, so the column is left untouched.
        continue
    # When several values tie, mode() returns them sorted; take the first.
    data[column] = data[column].fillna(modes.iloc[0])
# Encode categorical variables: every object-dtype column is replaced by
# integer codes.
#
# A separate fitted encoder is kept per column in `label_encoders`. Refitting
# one shared encoder for every column would overwrite its classes each time,
# so only the mapping of the last encoded column would survive and the codes
# of all other columns could neither be decoded nor re-applied to new data.
label_encoders = {}
le = LabelEncoder()  # stays bound even when no column needs encoding
for col in data.columns:
    if data[col].dtype == 'O':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
# Discard the columns the author treats as less important for the model.
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_location',
    'auto_year',
    'auto_make',
    'auto_model',
]
data = data.drop(columns=to_drop)
# Correlation heatmap: pairwise correlations of all remaining columns, with
# the upper triangle (diagonal included) masked out so each pair is drawn once.
corr_matrix = data.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(23, 23))
sns.heatmap(
    corr_matrix.round(2),
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='magma',
    ax=ax,
)
ax.set_title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()
# Discard the features the author judged as less correlated (presumably based
# on the heatmap above - the selection itself is hard-coded).
to_drop = [
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'incident_type',
    'age',
    'incident_hour_of_the_day',
    'insured_occupation',
]
data = data.drop(columns=to_drop)
# Feature importance: rank the remaining features with a large random forest
# and keep the ten most important ones plus the target column.
#
# The target is separated by name. Positional slicing (`iloc[:, :-1]`) only
# works while 'fraud_reported' happens to be the last column; otherwise the
# target would silently end up among the features.
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']

# Fixed seed so the same features are selected on every run, in line with the
# seeded train/test split and final model further down.
# NOTE(review): the ranking is fitted on all rows, including those that later
# form the test split - confirm this is acceptable for the reported accuracy.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, Y)

feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
# Prepare data for modeling: work on a copy of the selected columns, separate
# the target from the features, and hold out 30% of the rows for testing.
df_model = data_new.copy()
y = df_model['fraud_reported']
X = df_model.drop(columns='fraud_reported')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=43,
)
# Train the final model: a seeded random forest with balanced class weights,
# fitted on the training split only.
rf_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 5,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'class_weight': 'balanced',
    'random_state': 42,
}
final_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Evaluate the model: plain accuracy on the held-out test split.
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")
# Save the model
model_path = Path('model/only_model.joblib')
# joblib.dump does not create missing directories; without this the script
# would fail with FileNotFoundError only after all training has finished.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, model_path)
print("Model saved successfully.")