kothariyashhh committed on
Commit
4386418
1 Parent(s): 5ef15eb

Upload 7 files

Files changed (7)
  1. app.py +181 -0
  2. dataset/insurance_claims.csv +0 -0
  3. model/only_model.joblib +3 -0
  4. prediction.py +50 -0
  5. readme.md +110 -0
  6. requirements.txt +7 -0
  7. train.py +95 -0
app.py ADDED
@@ -0,0 +1,181 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder
+
+ class FraudDetectionApp:
+     def __init__(self):
+         self.model = joblib.load('model/only_model.joblib')
+         self.categorical_columns = ['incident_severity', 'insured_hobbies', 'insured_education_level', 'incident_city']
+         self.encoders = {col: LabelEncoder() for col in self.categorical_columns}
+         self.fit_encoders()
+
+     def fit_encoders(self):
+         # Example unique values for fitting the encoders
+         example_data = {
+             'incident_severity': ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'],
+             'insured_hobbies': ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'],
+             'insured_education_level': ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'],
+             'incident_city': ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook']
+         }
+         for col in self.categorical_columns:
+             self.encoders[col].fit(example_data[col])
+
+     def preprocess_single_data(self, data):
+         if not isinstance(data, pd.DataFrame):
+             data = pd.DataFrame(data, index=[0])
+         for col in self.categorical_columns:
+             if col in data.columns:
+                 data[col] = self.encoders[col].transform(data[col])
+         # Align column order with the order the model was trained on, if available
+         if hasattr(self.model, 'feature_names_in_'):
+             data = data[list(self.model.feature_names_in_)]
+         return data
+
+     def predict_single_fraud(self, data):
+         data_processed = self.preprocess_single_data(data)
+         prediction = self.model.predict(data_processed)[0]
+         return prediction
+
+     def run(self):
+         st.title('Insurance Fraud Prediction')
+
+         # Input fields
+         incident_severity = st.selectbox('Incident Severity', ['Minor Damage', 'Major Damage', 'Total Loss', 'Trivial Damage'])
+         insured_hobbies = st.selectbox('Insured Hobbies', ['sleeping', 'reading', 'board-games', 'bungie-jumping', 'base-jumping', 'golf', 'camping', 'dancing', 'skydiving', 'movies', 'hiking', 'yachting', 'paintball', 'chess', 'kayaking', 'polo', 'basketball', 'video-games', 'cross-fit', 'exercise'])
+         total_claim_amount = st.number_input('Total Claim Amount')
+         months_as_customer = st.number_input('Months as Customer')
+         policy_annual_premium = st.number_input('Policy Annual Premium')
+         incident_date = st.number_input('Incident Date', min_value=1, max_value=31, step=1)
+         capital_loss = st.number_input('Capital Loss')
+         capital_gains = st.number_input('Capital Gains')
+         insured_education_level = st.selectbox('Insured Education Level', ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD'])
+         incident_city = st.selectbox('Incident City', ['Columbus', 'Riverwood', 'Arlington', 'Springfield', 'Hillsdale', 'Northbend', 'Northbrook'])
+
+         # Collect the user input into a single record
+         new_data_point = {
+             'incident_severity': incident_severity,
+             'insured_hobbies': insured_hobbies,
+             'total_claim_amount': total_claim_amount,
+             'months_as_customer': months_as_customer,
+             'policy_annual_premium': policy_annual_premium,
+             'incident_date': incident_date,
+             'capital-loss': capital_loss,
+             'capital-gains': capital_gains,
+             'insured_education_level': insured_education_level,
+             'incident_city': incident_city,
+         }
+
+         # Prediction button
+         if st.button('Predict'):
+             prediction = self.predict_single_fraud(new_data_point)
+             if prediction == 0:
+                 st.write('The submitted claim is predicted to be legitimate (not fraud).')
+             else:
+                 st.write('The submitted claim is predicted to be fraudulent.')
+
+         # Generate sample data
+         if st.button('Generate Sample Data'):
+             sample_non_fraud = self.generate_sample_data(fraud=False)
+             sample_fraud = self.generate_sample_data(fraud=True)
+             st.write("Non-Fraud Sample Data:")
+             st.write(sample_non_fraud)
+             st.write("Fraud Sample Data:")
+             st.write(sample_fraud)
+
+     def generate_sample_data(self, fraud=False):
+         sample_data = {
+             'incident_severity': ['Major Damage' if fraud else 'Minor Damage'],
+             'insured_hobbies': ['skydiving' if fraud else 'reading'],
+             'total_claim_amount': [50000 if fraud else 1000],
+             'months_as_customer': [1 if fraud else 60],
+             'policy_annual_premium': [10000 if fraud else 200],
+             'incident_date': [15],
+             'capital-loss': [1000 if fraud else 0],
+             'capital-gains': [5000 if fraud else 0],
+             'insured_education_level': ['PhD' if fraud else 'College'],
+             'incident_city': ['Riverwood' if fraud else 'Northbrook']
+         }
+         return pd.DataFrame(sample_data)
+
+ if __name__ == '__main__':
+     app = FraudDetectionApp()
+     app.run()
dataset/insurance_claims.csv ADDED
The diff for this file is too large to render. See raw diff
 
model/only_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f857138a5f8938cf50f8eae9335418974d524632dde4b95e211455919f4f8670
+ size 334537
prediction.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ import numpy as np
+ import joblib
+
+ # Load the trained model
+ model = joblib.load('model/only_model.joblib')
+
+ def preprocess_single_data(data):
+     # Convert the data into a DataFrame if it's not already
+     if not isinstance(data, pd.DataFrame):
+         data = pd.DataFrame(data, index=[0])
+
+     # Handle missing values by replacing them with the column mode
+     for column in data.columns:
+         mode_value = data[column].mode().iloc[0]
+         data[column] = data[column].replace(np.nan, mode_value)
+
+     # Align column order with the order the model was trained on, if available
+     if hasattr(model, 'feature_names_in_'):
+         data = data[list(model.feature_names_in_)]
+
+     return data
+
+ def predict_single_fraud(data):
+     # Preprocess the single data point
+     data_processed = preprocess_single_data(data)
+
+     # Make predictions
+     prediction = model.predict(data_processed)[0]
+     probability = model.predict_proba(data_processed)[0, 1]
+
+     return prediction, probability
+
+ # Example usage: a new single data point.
+ # Categorical fields are given as label-encoded integers, matching the encoding used in train.py
+ # (0 is assumed to be the code for 'Major Damage' under alphabetical label encoding).
+ new_data_point = {
+     'incident_severity': 0,
+     'insured_hobbies': 9,
+     'total_claim_amount': 59670,
+     'months_as_customer': 116,
+     'policy_annual_premium': 951.46,
+     'incident_date': 30,
+     'capital-loss': -35500,
+     'capital-gains': 0,
+     'insured_education_level': 3,
+     'incident_city': 5,
+ }
+
+ # Make and display the prediction
+ prediction, probability = predict_single_fraud(new_data_point)
+ print(f'Fraud Prediction: {prediction}')
+ print(f'Probability of Fraud: {probability:.4f}')
readme.md ADDED
@@ -0,0 +1,110 @@
+ # Insurance Fraud Prediction Model
+
+ This project focuses on building and evaluating a machine learning model to detect fraudulent insurance claims.
+ It involves data preprocessing, model training using a RandomForestClassifier, model evaluation with
+ various metrics and visualizations, and a Streamlit UI for interacting with the model.
+
+ ### Installation
+
+ Create and activate a virtual environment:
+
+ ```bash
+ python -m venv env
+ source env/bin/activate  # On Windows use `env\Scripts\activate`
+ ```
+
+ Install the required packages:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ### Project Structure
+ ```bash
+ insurance-fraud-detection/
+ ├── dataset/
+ │   └── insurance_claims.csv
+ ├── model/
+ │   └── only_model.joblib
+ ├── train.py
+ ├── prediction.py
+ ├── app.py
+ ├── requirements.txt
+ └── README.md
+ ```
+
+ ### Data Preprocessing
+ #### Data Loading
+ The data is loaded from a CSV file located at dataset/insurance_claims.csv. During loading, the following steps are
+ performed:
+
+ - Drop the _c39 column.
+ - Replace '?' with NaN.
+
+ #### Data Cleaning
+ Fill missing values for the 'property_damage', 'police_report_available', and 'collision_type' columns with their mode.
+ Drop duplicate records.
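+
+ A minimal sketch of the loading and cleaning steps above, mirroring what train.py does (the three columns listed are assumed to be the ones containing missing values):
+
+ ```python
+ import numpy as np
+ import pandas as pd
+
+ # Load the raw claims data and drop the empty helper column
+ data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
+
+ # The dataset marks missing values with '?', so normalize them to NaN
+ data.replace('?', np.nan, inplace=True)
+
+ # Fill the columns that contain missing values with their mode, then drop duplicates
+ for column in ['property_damage', 'police_report_available', 'collision_type']:
+     data[column] = data[column].fillna(data[column].mode().iloc[0])
+ data = data.drop_duplicates()
+ ```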
+
+ #### Encoding and Feature Selection
+ Encode categorical variables using Label Encoding.
+ Drop unnecessary columns that are not relevant for the model.
+ Select the final set of features for the model.
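+
+ A condensed sketch of this step, following the approach in train.py (continuing from the loading sketch above; train.py additionally drops identifier and low-correlation columns before ranking, and the importance-based selection keeps the ten highest-ranked features):
+
+ ```python
+ import pandas as pd
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.preprocessing import LabelEncoder
+
+ # Label-encode every object (string) column, including the target
+ for col in data.columns:
+     if data[col].dtype == 'O':
+         data[col] = LabelEncoder().fit_transform(data[col])
+
+ # Rank features with a RandomForest and keep the ten most important ones
+ X, y = data.drop(columns='fraud_reported'), data['fraud_reported']
+ importances = pd.Series(
+     RandomForestClassifier(n_estimators=1000).fit(X, y).feature_importances_,
+     index=X.columns,
+ )
+ selected = importances.nlargest(10).index.tolist()
+ data_model = data[selected + ['fraud_reported']]
+ ```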
+
+ #### Preprocessed Features
+ The final set of features used for model training:
+
+ - incident_severity
+ - insured_hobbies
+ - total_claim_amount
+ - months_as_customer
+ - policy_annual_premium
+ - incident_date
+ - capital-loss
+ - capital-gains
+ - insured_education_level
+ - incident_city
+ - fraud_reported (target variable)
+
+ #### Model Training
+ The model is a RandomForestClassifier trained in train.py on the preprocessed, feature-selected data described above.
+
+ #### Training Steps
+ - Train-test split: the data is split into training and testing sets with a 70-30 split.
+ - Model setup: a RandomForestClassifier is configured with tuned hyperparameters (max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=100, class_weight='balanced').
+ - Model training: the model is trained on the training data.
+ - Model saving: the trained model is saved as model/only_model.joblib.
+
+ #### Model Evaluation
+ The trained model is evaluated on the test set. The evaluation metrics include:
+
+ - Classification Report: precision, recall, F1-score.
+ - AUC Score: area under the ROC curve.
+ - Confusion Matrix: visual comparison of true vs. predicted labels.
+ - ROC Curve: Receiver Operating Characteristic curve.
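+
+ The committed train.py prints only accuracy; a sketch of how the metrics listed above could be computed with scikit-learn, assuming the fitted final_model and the X_test / y_test split from train.py:
+
+ ```python
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import (classification_report, confusion_matrix,
+                              roc_auc_score, RocCurveDisplay)
+
+ y_pred = final_model.predict(X_test)
+ y_prob = final_model.predict_proba(X_test)[:, 1]
+
+ print(classification_report(y_test, y_pred))   # precision, recall, F1-score
+ print("AUC:", roc_auc_score(y_test, y_prob))   # area under the ROC curve
+ print(confusion_matrix(y_test, y_pred))        # true vs. predicted labels
+
+ RocCurveDisplay.from_predictions(y_test, y_prob)  # ROC curve
+ plt.show()
+ ```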
+
+ ### Usage
+
+ #### Training the Model
+ To train the model, run:
+
+ ```bash
+ python train.py
+ ```
+
+ #### Making Predictions
+ To run a sample prediction with the trained model, run:
+
+ ```bash
+ python prediction.py
+ ```
+
+ #### Running the Streamlit App
+ To launch the Streamlit app, run:
+
+ ```bash
+ streamlit run app.py
+ ```
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pandas==2.0.3
+ numpy==1.24.3
+ seaborn==0.12.2
+ matplotlib==3.7.2
+ scikit-learn==1.3.0
+ joblib==1.3.2
+ streamlit
train.py ADDED
@@ -0,0 +1,95 @@
+ import pandas as pd
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import warnings
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ import joblib
+
+ warnings.filterwarnings("ignore")
+
+ # Load and preprocess data
+ data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
+ data.replace('?', np.nan, inplace=True)
+
+ # Function to check data
+ def check_data(data):
+     return pd.DataFrame({
+         'type': data.dtypes,
+         'amount_unique': data.nunique(),
+         'unique_values': [data[x].unique() for x in data.columns],
+         'null_values': data.isna().sum(),
+         'percentage_null_values(%)': round((data.isnull().sum() / data.shape[0]) * 100, 2)
+     })
+
+ print(check_data(data).sort_values("null_values", ascending=False))
+
+ # Fill missing values with mode
+ for column in data.columns:
+     mode_value = data[column].mode().iloc[0]
+     data[column] = data[column].replace(np.nan, mode_value)
+
+ # Encode categorical variables
+ le = LabelEncoder()
+ for col in data.columns:
+     if data[col].dtype == 'O':
+         data[col] = le.fit_transform(data[col])
+
+ # Drop less important columns
+ to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location',
+            'auto_year', 'auto_make', 'auto_model']
+ data.drop(columns=to_drop, inplace=True)
+
+ # Correlation heatmap
+ plt.figure(figsize=(23, 23))
+ corr_matrix = data.corr()
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+ sns.heatmap(round(corr_matrix, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
+ plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
+ plt.show()
+
+ # Drop less correlated features
+ to_drop = ['injury_claim', 'property_claim', 'vehicle_claim', 'incident_type', 'age',
+            'incident_hour_of_the_day', 'insured_occupation']
+ data.drop(columns=to_drop, inplace=True)
+
+ # Feature importance
+ X = data.iloc[:, :-1]
+ Y = data['fraud_reported']
+ model = RandomForestClassifier(n_estimators=1000)
+ model.fit(X, Y)
+ feat_importances = pd.Series(model.feature_importances_, index=X.columns)
+ final_feat = feat_importances.nlargest(10).index.tolist()
+ final_feat.append('fraud_reported')
+ data_new = data[final_feat]
+
+ # Prepare data for modeling
+ df_model = data_new.copy()
+ X = df_model.drop(columns='fraud_reported')
+ y = df_model['fraud_reported']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)
+
+ # Train the final model
+ final_model = RandomForestClassifier(
+     criterion='gini',
+     max_depth=5,
+     min_samples_leaf=4,
+     min_samples_split=10,
+     n_estimators=100,
+     random_state=42,
+     class_weight='balanced'
+ )
+ final_model.fit(X_train, y_train)
+
+ # Evaluate the model
+ y_pred = final_model.predict(X_test)
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"Model Accuracy: {accuracy}")
+
+ # Save the model
+ joblib.dump(final_model, 'model/only_model.joblib')
+ print("Model saved successfully.")