kothariyashhh committed on
Commit
4386418
1 Parent(s): 5ef15eb

Upload 7 files

Files changed (7)
  1. app.py +181 -0
  2. dataset/insurance_claims.csv +0 -0
  3. model/only_model.joblib +3 -0
  4. prediction.py +50 -0
  5. readme.md +110 -0
  6. requirements.txt +7 -0
  7. train.py +95 -0
app.py ADDED
@@ -0,0 +1,181 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder
+
+ class FraudDetectionApp:
+     def __init__(self):
+         self.model = joblib.load('model/only_model.joblib')
+         self.categorical_columns = ['incident_severity', 'insured_hobbies', 'insured_education_level', 'incident_city']
+         self.encoders = {col: LabelEncoder() for col in self.categorical_columns}
+         self.fit_encoders()
+
+     def fit_encoders(self):
+         # Example unique values for fitting the encoders
+         example_data = {
+             'incident_severity': ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'],
+             'insured_hobbies': ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'],
+             'insured_education_level': ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'],
+             'incident_city': ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook']
+         }
+         for col in self.categorical_columns:
+             self.encoders[col].fit(example_data[col])
+
+     def preprocess_single_data(self, data):
+         if not isinstance(data, pd.DataFrame):
+             data = pd.DataFrame(data, index=[0])
+         for col in self.categorical_columns:
+             if col in data.columns:
+                 data[col] = self.encoders[col].transform(data[col])
+         # Align column order with the order the model was trained on, if available
+         if hasattr(self.model, 'feature_names_in_'):
+             data = data[list(self.model.feature_names_in_)]
+         return data
+
+     def predict_single_fraud(self, data):
+         data_processed = self.preprocess_single_data(data)
+         prediction = self.model.predict(data_processed)[0]
+         return prediction
+
+     def run(self):
+         st.title('Insurance Fraud Prediction')
+
+         # Input fields
+         incident_severity = st.selectbox('Incident Severity', ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'])
+         insured_hobbies = st.selectbox('Insured Hobbies', ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'])
+         total_claim_amount = st.number_input('Total Claim Amount')
+         months_as_customer = st.number_input('Months as Customer')
+         policy_annual_premium = st.number_input('Policy Annual Premium')
+         incident_date = st.number_input('Incident Date', min_value=1, max_value=31, step=1)
+         capital_loss = st.number_input('Capital Loss')
+         capital_gains = st.number_input('Capital Gains')
+         insured_education_level = st.selectbox('Insured Education Level', ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'])
+         incident_city = st.selectbox('Incident City', ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook'])
+
+         # Collect the user input into a single record
+         new_data_point = {
+             'incident_severity': incident_severity,
+             'insured_hobbies': insured_hobbies,
+             'total_claim_amount': total_claim_amount,
+             'months_as_customer': months_as_customer,
+             'policy_annual_premium': policy_annual_premium,
+             'incident_date': incident_date,
+             'capital-loss': capital_loss,
+             'capital-gains': capital_gains,
+             'insured_education_level': insured_education_level,
+             'incident_city': incident_city,
+         }
+
+         # Prediction button
+         if st.button('Predict'):
+             prediction = self.predict_single_fraud(new_data_point)
+             if prediction == 0:
+                 st.write('The submitted claim is predicted to be legitimate (not fraud).')
+             else:
+                 st.write('The submitted claim is predicted to be fraudulent.')
+
+         # Generate sample data
+         if st.button('Generate Sample Data'):
+             sample_non_fraud = self.generate_sample_data(fraud=False)
+             sample_fraud = self.generate_sample_data(fraud=True)
+             st.write("Non-Fraud Sample Data:")
+             st.write(sample_non_fraud)
+             st.write("Fraud Sample Data:")
+             st.write(sample_fraud)
+
+     def generate_sample_data(self, fraud=False):
+         sample_data = {
+             'incident_severity': ['Major Damage' if fraud else 'Minor Damage'],
+             'insured_hobbies': ['skydiving' if fraud else 'reading'],
+             'total_claim_amount': [50000 if fraud else 1000],
+             'months_as_customer': [1 if fraud else 60],
+             'policy_annual_premium': [10000 if fraud else 200],
+             'incident_date': [15],
+             'capital-loss': [1000 if fraud else 0],
+             'capital-gains': [5000 if fraud else 0],
+             'insured_education_level': ['PhD' if fraud else 'College'],
+             'incident_city': ['Riverwood' if fraud else 'Northbrook']
+         }
+         return pd.DataFrame(sample_data)
+
+ if __name__ == '__main__':
+     app = FraudDetectionApp()
+     app.run()
dataset/insurance_claims.csv ADDED
The diff for this file is too large to render. See raw diff
 
model/only_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f857138a5f8938cf50f8eae9335418974d524632dde4b95e211455919f4f8670
+ size 334537
prediction.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ import numpy as np
+ import joblib
+
+ # Load the trained model
+ model = joblib.load('model/only_model.joblib')
+
+ def preprocess_single_data(data):
+     # Convert the data into a DataFrame if it's not already
+     if not isinstance(data, pd.DataFrame):
+         data = pd.DataFrame(data, index=[0])
+
+     # Handle missing values by replacing them with the column mode
+     for column in data.columns:
+         mode_value = data[column].mode().iloc[0]
+         data[column] = data[column].replace(np.nan, mode_value)
+
+     # Align column order with the order the model was trained on, if available
+     if hasattr(model, 'feature_names_in_'):
+         data = data[list(model.feature_names_in_)]
+
+     return data
+
+ def predict_single_fraud(data):
+     # Preprocess the single data point
+     data_processed = preprocess_single_data(data)
+
+     # Make predictions
+     prediction = model.predict(data_processed)[0]
+     probability = model.predict_proba(data_processed)[0, 1]
+
+     return prediction, probability
+
+ # Example usage: a new single data point.
+ # Categorical fields are given as label-encoded integers, matching the encoding used in train.py
+ # (0 is assumed to be the code for 'Major Damage' under alphabetical label encoding).
+ new_data_point = {
+     'incident_severity': 0,
+     'insured_hobbies': 9,
+     'total_claim_amount': 59670,
+     'months_as_customer': 116,
+     'policy_annual_premium': 951.46,
+     'incident_date': 30,
+     'capital-loss': -35500,
+     'capital-gains': 0,
+     'insured_education_level': 3,
+     'incident_city': 5,
+ }
+
+ # Make and display the prediction
+ prediction, probability = predict_single_fraud(new_data_point)
+ print(f'Fraud Prediction: {prediction}')
+ print(f'Probability of Fraud: {probability:.4f}')
readme.md ADDED
@@ -0,0 +1,110 @@
+ # Insurance Fraud Prediction Model
+
+ This project focuses on building and evaluating a machine learning model to detect fraudulent insurance claims.
+ It involves data preprocessing, model training using a RandomForestClassifier, model evaluation with
+ various metrics and visualizations, and a Streamlit UI for interacting with the model.
+
+ ### Installation
+
+ Create and activate a virtual environment:
+
+ ```bash
+ python -m venv env
+ source env/bin/activate  # On Windows use `env\Scripts\activate`
+ ```
+
+ Install the required packages:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ### Project Structure
+ ```bash
+ insurance-fraud-detection/
+ ├── dataset/
+ │   └── insurance_claims.csv
+ ├── model/
+ │   └── only_model.joblib
+ ├── train.py
+ ├── prediction.py
+ ├── app.py
+ ├── requirements.txt
+ └── README.md
+ ```
+
+ ### Data Preprocessing
+ #### Data Loading
+ The data is loaded from a CSV file located at dataset/insurance_claims.csv. During loading, the following steps are
+ performed:
+
+ - Drop the _c39 column.
+ - Replace '?' with NaN.
+
+ #### Data Cleaning
+ Fill missing values for the 'property_damage', 'police_report_available', and 'collision_type' columns with their mode.
+ Drop duplicate records.
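+
+ A minimal sketch of the loading and cleaning steps above, mirroring what train.py does (the three columns listed are assumed to be the ones containing missing values):
+
+ ```python
+ import numpy as np
+ import pandas as pd
+
+ # Load the raw claims data and drop the empty helper column
+ data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
+
+ # The dataset marks missing values with '?', so normalize them to NaN
+ data.replace('?', np.nan, inplace=True)
+
+ # Fill the columns that contain missing values with their mode, then drop duplicates
+ for column in ['property_damage', 'police_report_available', 'collision_type']:
+     data[column] = data[column].fillna(data[column].mode().iloc[0])
+ data = data.drop_duplicates()
+ ```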
+
+ #### Encoding and Feature Selection
+ Encode categorical variables using Label Encoding.
+ Drop unnecessary columns that are not relevant for the model.
+ Select the final set of features for the model.
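+
+ A condensed sketch of this step, following the approach in train.py (continuing from the loading sketch above; train.py additionally drops identifier and low-correlation columns before ranking, and the importance-based selection keeps the ten highest-ranked features):
+
+ ```python
+ import pandas as pd
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.preprocessing import LabelEncoder
+
+ # Label-encode every object (string) column, including the target
+ for col in data.columns:
+     if data[col].dtype == 'O':
+         data[col] = LabelEncoder().fit_transform(data[col])
+
+ # Rank features with a RandomForest and keep the ten most important ones
+ X, y = data.drop(columns='fraud_reported'), data['fraud_reported']
+ importances = pd.Series(
+     RandomForestClassifier(n_estimators=1000).fit(X, y).feature_importances_,
+     index=X.columns,
+ )
+ selected = importances.nlargest(10).index.tolist()
+ data_model = data[selected + ['fraud_reported']]
+ ```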
+
+ #### Preprocessed Features
+ The final set of features used for model training:
+
+ - incident_severity
+ - insured_hobbies
+ - total_claim_amount
+ - months_as_customer
+ - policy_annual_premium
+ - incident_date
+ - capital-loss
+ - capital-gains
+ - insured_education_level
+ - incident_city
+ - fraud_reported (target variable)
+
+ #### Model Training
+ The model is a RandomForestClassifier trained in train.py on the preprocessed, feature-selected data described above.
+
+ #### Training Steps
+ - Train-test split: the data is split into training and testing sets with a 70-30 split.
+ - Model setup: a RandomForestClassifier is configured with tuned hyperparameters (max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=100, class_weight='balanced').
+ - Model training: the model is trained on the training data.
+ - Model saving: the trained model is saved as model/only_model.joblib.
+
+ #### Model Evaluation
+ The trained model is evaluated on the test set. The evaluation metrics include:
+
+ - Classification Report: precision, recall, F1-score.
+ - AUC Score: area under the ROC curve.
+ - Confusion Matrix: visual comparison of true vs. predicted labels.
+ - ROC Curve: Receiver Operating Characteristic curve.
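+
+ The committed train.py prints only accuracy; a sketch of how the metrics listed above could be computed with scikit-learn, assuming the fitted final_model and the X_test / y_test split from train.py:
+
+ ```python
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import (classification_report, confusion_matrix,
+                              roc_auc_score, RocCurveDisplay)
+
+ y_pred = final_model.predict(X_test)
+ y_prob = final_model.predict_proba(X_test)[:, 1]
+
+ print(classification_report(y_test, y_pred))   # precision, recall, F1-score
+ print("AUC:", roc_auc_score(y_test, y_prob))   # area under the ROC curve
+ print(confusion_matrix(y_test, y_pred))        # true vs. predicted labels
+
+ RocCurveDisplay.from_predictions(y_test, y_prob)  # ROC curve
+ plt.show()
+ ```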
+
+ ### Usage
+
+ #### Training the Model
+ To train the model, run:
+
+ ```bash
+ python train.py
+ ```
+
+ #### Making Predictions
+ To run a sample prediction with the trained model, run:
+
+ ```bash
+ python prediction.py
+ ```
+
+ #### Running the Streamlit App
+ To launch the Streamlit app, run:
+
+ ```bash
+ streamlit run app.py
+ ```
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pandas==2.0.3
+ numpy==1.24.3
+ seaborn==0.12.2
+ matplotlib==3.7.2
+ scikit-learn==1.3.0
+ joblib==1.3.2
+ streamlit
train.py ADDED
@@ -0,0 +1,95 @@
+ import pandas as pd
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import warnings
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ import joblib
+
+ warnings.filterwarnings("ignore")
+
+ # Load and preprocess data
+ data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
+ data.replace('?', np.nan, inplace=True)
+
+ # Function to check data
+ def check_data(data):
+     return pd.DataFrame({
+         'type': data.dtypes,
+         'amount_unique': data.nunique(),
+         'unique_values': [data[x].unique() for x in data.columns],
+         'null_values': data.isna().sum(),
+         'percentage_null_values(%)': round((data.isnull().sum() / data.shape[0]) * 100, 2)
+     })
+
+ print(check_data(data).sort_values("null_values", ascending=False))
+
+ # Fill missing values with mode
+ for column in data.columns:
+     mode_value = data[column].mode().iloc[0]
+     data[column] = data[column].replace(np.nan, mode_value)
+
+ # Encode categorical variables
+ le = LabelEncoder()
+ for col in data.columns:
+     if data[col].dtype == 'O':
+         data[col] = le.fit_transform(data[col])
+
+ # Drop less important columns
+ to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location',
+            'auto_year', 'auto_make', 'auto_model']
+ data.drop(columns=to_drop, inplace=True)
+
+ # Correlation heatmap
+ plt.figure(figsize=(23, 23))
+ corr_matrix = data.corr()
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+ sns.heatmap(round(corr_matrix, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
+ plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
+ plt.show()
+
+ # Drop less correlated features
+ to_drop = ['injury_claim', 'property_claim', 'vehicle_claim', 'incident_type', 'age',
+            'incident_hour_of_the_day', 'insured_occupation']
+ data.drop(columns=to_drop, inplace=True)
+
+ # Feature importance
+ X = data.iloc[:, :-1]
+ Y = data['fraud_reported']
+ model = RandomForestClassifier(n_estimators=1000)
+ model.fit(X, Y)
+ feat_importances = pd.Series(model.feature_importances_, index=X.columns)
+ final_feat = feat_importances.nlargest(10).index.tolist()
+ final_feat.append('fraud_reported')
+ data_new = data[final_feat]
+
+ # Prepare data for modeling
+ df_model = data_new.copy()
+ X = df_model.drop(columns='fraud_reported')
+ y = df_model['fraud_reported']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)
+
+ # Train the final model
+ final_model = RandomForestClassifier(
+     criterion='gini',
+     max_depth=5,
+     min_samples_leaf=4,
+     min_samples_split=10,
+     n_estimators=100,
+     random_state=42,
+     class_weight='balanced'
+ )
+ final_model.fit(X_train, y_train)
+
+ # Evaluate the model
+ y_pred = final_model.predict(X_test)
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"Model Accuracy: {accuracy}")
+
+ # Save the model
+ joblib.dump(final_model, 'model/only_model.joblib')
+ print("Model saved successfully.")