|
import pandas as pd |
|
import numpy as np |
|
def data_imp():
    """Return the feature descriptions and default values for three datasets.

    Returns:
        A 6-tuple of dicts, in this order:
        insurance descriptions, banking descriptions, retail descriptions,
        insurance defaults, banking defaults, retail defaults.
        Description dicts map a column name to a human-readable sentence;
        default dicts map a column name to a numeric default value.
    """
    # --- Insurance dataset -------------------------------------------------
    insurance_descriptions = {
        "CustID": "Unique identifier for each customer.",
        "FirstPolYear": "Year when the customer first bought an insurance policy.",
        "BirthYear": "Birth year of the customer, used to calculate age.",
        "EducDeg": "Highest educational degree obtained by the customer.",
        "MonthSal": "Monthly salary of the customer. (Numerical, float64)",
        "GeoLivArea": "Geographical area where the customer lives.",
        "Children": "Number of children the customer has.",
        "CustMonVal": "Total monetary value of the customer to the company.",
        "ClaimsRate": "Rate at which the customer files insurance claims.",
        "PremMotor": "Premium amount for motor insurance.",
        "PremHousehold": "Premium amount for household insurance.",
        "PremHealth": "Premium amount for health insurance.",
        "PremLife": "Premium amount for life insurance.",
        "PremWork": "Premium amount for work insurance.",
    }
    # Defaults cover every described column except CustID and EducDeg.
    insurance_default_values = {
        "FirstPolYear": 1999,
        "BirthYear": 1980,
        "MonthSal": 1000,
        "GeoLivArea": 0,
        "Children": 0,
        "CustMonVal": 100,
        "ClaimsRate": 2.33,
        "PremMotor": 200,
        "PremHousehold": 200,
        "PremHealth": 200,
        "PremLife": 200,
        "PremWork": 200,
    }

    # --- Banking dataset ---------------------------------------------------
    banking_descriptions = {
        "CUST_ID": "Unique identifier for each customer.",
        "BALANCE": "The average balance left in the customer's account.",
        "BALANCE_FREQUENCY": "Frequency with which the balance is updated.",
        "PURCHASES": "The total amount of purchases made by the customer.",
        "ONEOFF_PURCHASES": "The total amount of one-time purchases made by the customer.",
        "INSTALLMENTS_PURCHASES": "The total amount of purchases made in installments.",
        "CASH_ADVANCE": "The total amount of cash advances taken by the customer.",
        "PURCHASES_FREQUENCY": "The frequency of purchases made by the customer.",
        "ONEOFF_PURCHASES_FREQUENCY": "The frequency of one-time purchases made by the customer.",
        "PURCHASES_INSTALLMENTS_FREQUENCY": "The frequency of purchases made in installments.",
        "CASH_ADVANCE_FREQUENCY": "The frequency of cash advances taken by the customer.",
        "CASH_ADVANCE_TRX": "The number of cash advance transactions made by the customer.",
        "PURCHASES_TRX": "The number of purchase transactions made by the customer.",
        "CREDIT_LIMIT": "The credit limit assigned to the customer's account.",
        "PAYMENTS": "The total amount of payments made by the customer.",
        "MINIMUM_PAYMENTS": "The minimum amount of payments made by the customer.",
        "PRC_FULL_PAYMENT": "The percentage of full payments made by the customer.",
        "TENURE": "The tenure of the customer in months.",
    }
    # Defaults cover every described column except CUST_ID.
    banking_default_values = {
        "BALANCE": 2000,
        "BALANCE_FREQUENCY": 0.5,
        "PURCHASES": 500,
        "ONEOFF_PURCHASES": 0,
        "INSTALLMENTS_PURCHASES": 0,
        "CASH_ADVANCE": 200,
        "PURCHASES_FREQUENCY": 0.1,
        "ONEOFF_PURCHASES_FREQUENCY": 0.1,
        "PURCHASES_INSTALLMENTS_FREQUENCY": 0.5,
        "CASH_ADVANCE_FREQUENCY": 5,
        "CASH_ADVANCE_TRX": 5,
        "PURCHASES_TRX": 5,
        "CREDIT_LIMIT": 10000,
        "PAYMENTS": 500,
        "MINIMUM_PAYMENTS": 130,
        "PRC_FULL_PAYMENT": 0.22,
        "TENURE": 10,
    }

    # --- Retail dataset ----------------------------------------------------
    retail_descriptions = {
        "Channel": "Indicates the sales channel through which the customer made purchases.",
        "Region": "The geographical region where the customer is located.",
        "Fresh": "Annual spending (in monetary units) on fresh products.",
        "Milk": "Annual spending (in monetary units) on milk products.",
        "Grocery": "Annual spending (in monetary units) on grocery items.",
        "Frozen": "Annual spending (in monetary units) on frozen products.",
        "Detergents_Paper": "Annual spending (in monetary units) on detergents and paper products.",
        "Delicassen": "Annual spending (in monetary units) on delicatessen products.",
    }
    # Defaults cover the spending columns only (no Channel / Region).
    retail_default_values = {
        "Fresh": 6000,
        "Milk": 9000,
        "Grocery": 9000,
        "Frozen": 4000,
        "Detergents_Paper": 4000,
        "Delicassen": 2000,
    }

    # Order matters to callers that unpack positionally: all descriptions
    # first (insurance, banking, retail), then all defaults in the same order.
    return (
        insurance_descriptions,
        banking_descriptions,
        retail_descriptions,
        insurance_default_values,
        banking_default_values,
        retail_default_values,
    )
|
|
|
def preprocess_data(data):
    """Drop customer-identifier columns and remove outlier rows.

    Args:
        data: A pandas DataFrame. It is not modified; a new frame is returned.

    Returns:
        The frame without any identifier column and without the rows that
        ``remove_outliers`` filters out.
    """
    # The identifier is spelled differently per dataset: "CustID" for
    # insurance and "CUST_ID" for banking (see the description dicts in this
    # file). "Cust_ID" is kept for backward compatibility with the previous
    # check. errors="ignore" skips whichever spellings are absent.
    id_columns = ["CustID", "Cust_ID", "CUST_ID"]
    data = data.drop(columns=id_columns, errors="ignore")
    return remove_outliers(data)
|
|
|
def remove_outliers(df, threshold=3):
    """Return the rows of *df* whose numeric values all lie within *threshold*
    standard deviations of their column mean.

    Args:
        df: A pandas DataFrame. Non-numeric columns are ignored when scoring
            but are kept in the result.
        threshold: Maximum allowed absolute z-score (exclusive).

    Returns:
        A DataFrame containing only the rows that pass the filter for every
        scored column.

    Notes:
        Columns whose standard deviation is zero or undefined (a constant
        column, or a frame with a single row) cannot produce a meaningful
        z-score and are skipped; previously they yielded NaN scores that
        caused every row to be discarded.
        A row with a missing value in a scored column is still dropped,
        because a NaN z-score never compares as below the threshold.
    """
    numeric = df.select_dtypes(include=[float, int])
    spread = numeric.std()

    # NaN > 0 is False, so undefined spreads are excluded along with zeros.
    has_spread = (spread > 0).values
    scored = numeric.loc[:, has_spread]

    z_scores = ((scored - scored.mean()) / spread[has_spread]).abs()
    within_limits = (z_scores < threshold).all(axis=1)
    return df[within_limits]
|
|