|
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
|
|
|
# Suppress every warning for the remainder of the script.
warnings.filterwarnings("ignore")

# Read the claims CSV, discard its "_c39" column, and convert the '?'
# placeholder into NaN so the pandas missing-value helpers recognise it.
data = pd.read_csv("dataset/insurance_claims.csv")
data = data.drop(columns="_c39")
data = data.replace('?', np.nan)
|
|
|
|
|
def check_data(data):
    """Build a one-row-per-column overview of ``data``.

    Args:
        data: The DataFrame to describe.

    Returns:
        A DataFrame with these columns, one row for each column of ``data``:
        ``type`` (dtype), ``amount_unique`` (result of ``nunique()``),
        ``unique_values`` (array from ``unique()``), ``null_values`` (count
        of missing entries) and ``percentage_null_values(%)`` (missing
        entries as a percentage of the row count, rounded to 2 decimals).
    """
    row_count = data.shape[0]
    missing_counts = data.isna().sum()
    missing_percent = round((data.isnull().sum() / row_count) * 100, 2)
    distinct_per_column = [data[name].unique() for name in data.columns]

    overview = {
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': distinct_per_column,
        'null_values': missing_counts,
        'percentage_null_values(%)': missing_percent,
    }
    return pd.DataFrame(overview)
|
|
|
# Print the per-column overview, columns with the most missing values first.
print(check_data(data).sort_values("null_values", ascending=False))

# Fill missing values with each column's most frequent value.
# Only columns that actually contain NaN are visited, so no mode is computed
# where there is nothing to fill.
for column in data.columns[data.isna().any()]:
    modes = data[column].mode()
    if modes.empty:
        # A column with no non-null values has no mode; indexing it with
        # .iloc[0] would raise IndexError, so leave the column untouched.
        continue
    data[column] = data[column].fillna(modes.iloc[0])
|
|
|
|
|
# Integer-encode every object-dtype column in place.
# Each column gets its own fitted encoder, stored in `label_encoders` under
# the column name. Refitting one shared encoder would discard every mapping
# except the last, leaving no way to invert the codes or to apply the same
# encoding to new data.
label_encoders = {}
le = LabelEncoder()  # kept bound even when no column needs encoding
for col in data.columns:
    if data[col].dtype == 'O':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
|
|
|
|
|
# Columns removed before the correlation analysis
# (presumably identifiers / high-cardinality fields; confirm with the author).
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_location',
    'auto_year',
    'auto_make',
    'auto_model',
]
data.drop(to_drop, axis=1, inplace=True)
|
|
|
|
|
# Pairwise correlations of the remaining columns, plus a boolean mask that
# hides the upper triangle (diagonal included) so each pair is drawn once.
corr_matrix = data.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(23, 23))
sns.heatmap(
    corr_matrix.round(2),
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='magma',
)
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()
|
|
|
|
|
# Second round of column removal
# (presumably chosen from the correlation heatmap; confirm with the author).
to_drop = [
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'incident_type',
    'age',
    'incident_hour_of_the_day',
    'insured_occupation',
]
data.drop(to_drop, axis=1, inplace=True)
|
|
|
|
|
# Rank features with a large random forest and keep the ten most important.
# The target is removed by name rather than by position, so the predictors
# never include 'fraud_reported' even if the column order changes.
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']

# A fixed seed keeps the importance ranking, and therefore the selected
# feature set the final model is trained on, identical from run to run.
# NOTE(review): the ranking is fitted on all rows, including those later
# held out for testing; consider fitting it on the training split only.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, Y)

feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')  # keep the target alongside the features
data_new = data[final_feat]
|
|
|
|
|
# Work on an independent copy of the reduced frame.
df_model = data_new.copy()

# Separate the 'fraud_reported' target from the predictor columns.
y = df_model['fraud_reported']
X = df_model.drop('fraud_reported', axis=1)

# Hold out 30% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=43,
    test_size=0.3,
)
|
|
|
|
|
# Build the final random forest with fixed hyperparameters and fit it on the
# training split; fit() returns the estimator itself, so the fitted model is
# bound directly to `final_model`.
final_model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)
|
|
|
|
|
# Score the fitted model on the held-out rows and report its accuracy.
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")
|
|
|
|
|
# Persist the fitted model. joblib.dump writes straight to the given path and
# fails if the parent directory is missing, so create 'model/' first
# (exist_ok makes this a no-op when it is already there).
os.makedirs('model', exist_ok=True)
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")
|
|