kothariyashhh committed
Commit • 4386418
Parent(s): 5ef15eb

Upload 7 files

Browse files:
- app.py +181 -0
- dataset/insurance_claims.csv +0 -0
- model/only_model.joblib +3 -0
- prediction.py +50 -0
- readme.md +110 -0
- requirements.txt +7 -0
- train.py +95 -0
app.py
ADDED
@@ -0,0 +1,181 @@
import streamlit as st
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

class FraudDetectionApp:
    def __init__(self):
        # Load the trained model and prepare one LabelEncoder per categorical feature
        self.model = joblib.load('model/only_model.joblib')
        self.categorical_columns = ['incident_severity', 'insured_hobbies', 'insured_education_level', 'incident_city']
        self.encoders = {col: LabelEncoder() for col in self.categorical_columns}
        self.fit_encoders()

    def fit_encoders(self):
        # Fit each encoder on the full set of category values so transform()
        # reproduces the integer codes used during training
        example_data = {
            'incident_severity': ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'],
            'insured_hobbies': ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'],
            'insured_education_level': ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'],
            'incident_city': ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook']
        }
        for col in self.categorical_columns:
            self.encoders[col].fit(example_data[col])

    def preprocess_single_data(self, data):
        # Convert a dict of raw inputs into a one-row DataFrame and
        # label-encode the categorical columns
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data, index=[0])
        for col in self.categorical_columns:
            if col in data.columns:
                data[col] = self.encoders[col].transform(data[col])
        return data

    def predict_single_fraud(self, data):
        data_processed = self.preprocess_single_data(data)
        prediction = self.model.predict(data_processed)[0]
        return prediction

    def run(self):
        st.title('Insurance Fraud Prediction')

        # Input fields
        incident_severity = st.selectbox('Incident Severity', ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'])
        insured_hobbies = st.selectbox('Insured Hobbies', ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'])
        total_claim_amount = st.number_input('Total Claim Amount')
        months_as_customer = st.number_input('Months as Customer')
        policy_annual_premium = st.number_input('Policy Annual Premium')
        incident_date = st.number_input('Incident Date (day of month)', min_value=1, max_value=31, step=1)
        capital_loss = st.number_input('Capital Loss')
        capital_gains = st.number_input('Capital Gains')
        insured_education_level = st.selectbox('Insured Education Level', ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'])
        incident_city = st.selectbox('Incident City', ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook'])

        # Collect user input; key order must match the feature order the model was trained on
        new_data_point = {
            'incident_severity': incident_severity,
            'insured_hobbies': insured_hobbies,
            'total_claim_amount': total_claim_amount,
            'months_as_customer': months_as_customer,
            'policy_annual_premium': policy_annual_premium,
            'incident_date': incident_date,
            'capital-loss': capital_loss,
            'capital-gains': capital_gains,
            'insured_education_level': insured_education_level,
            'incident_city': incident_city,
        }

        # Prediction button
        if st.button('Predict'):
            prediction = self.predict_single_fraud(new_data_point)
            if prediction == 0:
                st.write('This application is predicted to be non-fraudulent.')
            else:
                st.write('This application is predicted to be fraudulent.')

        # Generate illustrative sample inputs
        if st.button('Generate Sample Data'):
            sample_non_fraud = self.generate_sample_data(fraud=False)
            sample_fraud = self.generate_sample_data(fraud=True)
            st.write("Non-Fraud Sample Data:")
            st.write(sample_non_fraud)
            st.write("Fraud Sample Data:")
            st.write(sample_fraud)

    def generate_sample_data(self, fraud=False):
        # Hand-crafted example rows illustrating typical fraud / non-fraud inputs
        sample_data = {
            'incident_severity': ['Major Damage' if fraud else 'Minor Damage'],
            'insured_hobbies': ['skydiving' if fraud else 'reading'],
            'total_claim_amount': [50000 if fraud else 1000],
            'months_as_customer': [1 if fraud else 60],
            'policy_annual_premium': [10000 if fraud else 200],
            'incident_date': [15],
            'capital-loss': [1000 if fraud else 0],
            'capital-gains': [5000 if fraud else 0],
            'insured_education_level': ['PhD' if fraud else 'College'],
            'incident_city': ['Riverwood' if fraud else 'Northbrook']
        }
        return pd.DataFrame(sample_data)

if __name__ == '__main__':
    app = FraudDetectionApp()
    app.run()
dataset/insurance_claims.csv
ADDED
The diff for this file is too large to render.
See raw diff
model/only_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f857138a5f8938cf50f8eae9335418974d524632dde4b95e211455919f4f8670
size 334537
prediction.py
ADDED
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
import joblib

# Load the trained model
model = joblib.load('model/only_model.joblib')

def preprocess_single_data(data):
    # Convert the data into a DataFrame if it's not already
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data, index=[0])

    # Handle missing values by replacing them with the column mode
    for column in data.columns:
        mode_value = data[column].mode().iloc[0]
        data[column] = data[column].replace(np.nan, mode_value)

    return data

def predict_single_fraud(data):
    # Preprocess the single data point
    data_processed = preprocess_single_data(data)

    # Make predictions
    prediction = model.predict(data_processed)[0]
    probability = model.predict_proba(data_processed)[0, 1]

    return prediction, probability

# Example usage: a new single data point. Categorical features must already
# be label-encoded with the same integer codes used during training.
new_data_point = {
    'incident_severity': 0,  # label-encoded (0 corresponds to 'Major Damage')
    'insured_hobbies': 9,
    'total_claim_amount': 59670,
    'months_as_customer': 116,
    'policy_annual_premium': 951.46,
    'incident_date': 30,
    'capital-loss': -35500,
    'capital-gains': 0,
    'insured_education_level': 3,
    'incident_city': 5,
}

# Make and display predictions
prediction, probability = predict_single_fraud(new_data_point)
print(f'Fraud Prediction: {prediction}')
print(f'Probability of Fraud: {probability:.4f}')
readme.md
ADDED
@@ -0,0 +1,110 @@
# Insurance Fraud Prediction Model

This project focuses on building and evaluating a machine learning model to detect fraudulent insurance claims. It involves data preprocessing, model training using a RandomForestClassifier, model evaluation with various metrics and visualizations, and a Streamlit UI for interacting with the model.

Create and activate a virtual environment:

```bash
python -m venv env
source env/bin/activate  # On Windows use `env\Scripts\activate`
```

Install the required packages:

```bash
pip install -r requirements.txt
```

### Project Structure
```bash
insurance-fraud-detection/
│
├── dataset/
│   └── insurance_claims.csv
│
├── model/
│   └── only_model.joblib
│
├── train.py
├── prediction.py
├── app.py
├── requirements.txt
└── README.md
```
### Data Preprocessing
#### Data Loading
The data is loaded from a CSV file located at `dataset/insurance_claims.csv`. During loading, the following steps are performed (see the sketch after this list):

- Drop the `_c39` column.
- Replace `'?'` with `NaN`.
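A minimal loading sketch, mirroring `train.py`:

```python
import pandas as pd
import numpy as np

# Drop the empty trailing column and normalize '?' placeholders to NaN
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
data.replace('?', np.nan, inplace=True)
```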
#### Data Cleaning
Fill missing values for the `property_damage`, `police_report_available`, and `collision_type` columns with their mode (the script applies the mode fill to every column, as sketched below), and drop duplicate records.
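The mode fill as implemented in `train.py`:

```python
# Fill missing values with each column's mode
for column in data.columns:
    mode_value = data[column].mode().iloc[0]
    data[column] = data[column].replace(np.nan, mode_value)
```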
#### Encoding and Feature Selection
- Encode categorical variables using label encoding (see the sketch after this list).
- Drop unnecessary columns that are not relevant for the model.
- Select the final set of features for the model.
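The encoding step from `train.py`:

```python
from sklearn.preprocessing import LabelEncoder

# Label-encode every object (string) column in place
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = le.fit_transform(data[col])
```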
#### Preprocessed Features
The final set of features used for model training, selected by feature importance (see the sketch below):

- incident_severity
- insured_hobbies
- total_claim_amount
- months_as_customer
- policy_annual_premium
- incident_date
- capital-loss
- capital-gains
- insured_education_level
- incident_city
- fraud_reported (target variable)
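How the top features are chosen in `train.py` (`X` holds the candidate features, `Y` the target):

```python
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Rank candidate features by RandomForest importance and keep the top 10
model = RandomForestClassifier(n_estimators=1000)
model.fit(X, Y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
```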
#### Model Training
The model is trained in `train.py` using a RandomForestClassifier with fixed, hand-tuned hyperparameters (the script does not use a pipeline or GridSearchCV).

#### Training Steps
- Train-test split: the data is split into training and testing sets with a 70-30 split.
- Model setup: a RandomForestClassifier is configured with `class_weight='balanced'` for the imbalanced target.
- Model training: the model is fit on the training data.
- Model saving: the trained model is saved as `model/only_model.joblib` (see the sketch after this list).
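The core training step, condensed from `train.py`:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

final_model = RandomForestClassifier(
    criterion='gini', max_depth=5, min_samples_leaf=4, min_samples_split=10,
    n_estimators=100, random_state=42, class_weight='balanced'
)
final_model.fit(X_train, y_train)
joblib.dump(final_model, 'model/only_model.joblib')
```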
#### Model Evaluation
The trained model is evaluated on the held-out test set; `train.py` currently prints accuracy. Other useful metrics include (see the sketch after this list):

- Classification Report: precision, recall, F1-score.
- AUC Score: area under the ROC curve.
- Confusion Matrix: visual comparison of true vs. predicted labels.
- ROC Curve: receiver operating characteristic curve.
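A sketch of these metrics with scikit-learn, assuming `final_model`, `X_test`, and `y_test` from `train.py`:

```python
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:, 1]  # probability of the fraud class

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))
print(confusion_matrix(y_test, y_pred))
```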
### Usage

#### Training the Model
To train the model, run:

```bash
python train.py
```

#### Making a Prediction
To run a sample prediction with the saved model:

```bash
python prediction.py
```

#### Running the Streamlit App
To run the Streamlit app, use:

```bash
streamlit run app.py
```
requirements.txt
ADDED
@@ -0,0 +1,7 @@
pandas==2.0.3
numpy==1.24.3
seaborn==0.12.2
matplotlib==3.7.2
scikit-learn==1.3.0
joblib==1.3.2
streamlit
train.py
ADDED
@@ -0,0 +1,95 @@
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

warnings.filterwarnings("ignore")

# Load and preprocess data
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
data.replace('?', np.nan, inplace=True)

# Summarize each column: dtype, cardinality, and missing values
def check_data(data):
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': [data[x].unique() for x in data.columns],
        'null_values': data.isna().sum(),
        'percentage_null_values(%)': round((data.isnull().sum() / data.shape[0]) * 100, 2)
    })

print(check_data(data).sort_values("null_values", ascending=False))

# Fill missing values with each column's mode
for column in data.columns:
    mode_value = data[column].mode().iloc[0]
    data[column] = data[column].replace(np.nan, mode_value)

# Encode categorical (object-dtype) variables
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = le.fit_transform(data[col])

# Drop identifier-like and less important columns
to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location',
           'auto_year', 'auto_make', 'auto_model']
data.drop(columns=to_drop, inplace=True)

# Correlation heatmap (upper triangle masked)
plt.figure(figsize=(23, 23))
corr_matrix = data.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(round(corr_matrix, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()

# Drop redundant or less useful features (based on the correlation heatmap)
to_drop = ['injury_claim', 'property_claim', 'vehicle_claim', 'incident_type', 'age',
           'incident_hour_of_the_day', 'insured_occupation']
data.drop(columns=to_drop, inplace=True)

# Feature importance: rank candidates and keep the top 10
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']
model = RandomForestClassifier(n_estimators=1000)
model.fit(X, Y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]

# Prepare data for modeling
df_model = data_new.copy()
X = df_model.drop(columns='fraud_reported')
y = df_model['fraud_reported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

# Train the final model
final_model = RandomForestClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train, y_train)

# Evaluate the model
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# Save the model
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")