Amaysood committed on
Commit
fcd8c94
1 Parent(s): 1296889

Upload 3 files

Browse files
Files changed (3) hide show
  1. P2Pdeliquency.py +123 -0
  2. loans_clean_schema.csv +0 -0
  3. model.pickle +3 -0
P2Pdeliquency.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import necessary libraries and utilities
2
+ import pickle
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.preprocessing import StandardScaler,PolynomialFeatures
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.linear_model import Ridge
9
+ from sklearn.model_selection import GridSearchCV
10
+ from sklearn.decomposition import PCA
11
+ from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
12
+
13
+
14
# Function that fetches the loans DataFrame from the required CSV.
# A hosted copy of the same file is available at
# https://github.com/amaysood/Cybersprint/raw/main/loans_clean_schema.csv
def read_data(path='loans_clean_schema.csv'):
    """Load the loans dataset into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the copy that ships
            next to this script; anything ``pandas.read_csv`` accepts
            (local path, URL, file-like object) may be given instead.

    Returns:
        pandas.DataFrame with the raw, uncleaned contents of the CSV.
    """
    return pd.read_csv(path)
22
+
23
# Removing rows with missing values from the dataset
def data_clean(df):
    """Return the rows of ``df`` that contain no missing values.

    The input frame is left untouched: a new DataFrame is returned instead
    of dropping rows in place, so callers keep their original data.

    Args:
        df: pandas.DataFrame to clean.

    Returns:
        A new pandas.DataFrame without any row that has a missing value.
        The surviving rows keep their original index labels.
    """
    return df.dropna(axis=0)
27
+
28
# Helper that one-hot encodes a single column; used by the encoding step
def onehot_encode(df, column, prefix):
    """Replace ``column`` with one indicator column per distinct value.

    Args:
        df: pandas.DataFrame holding the column to encode. Not modified.
        column: Name of the column to one-hot encode.
        prefix: Prefix for the generated columns, which are named
            ``<prefix>_<value>``.

    Returns:
        A new pandas.DataFrame: the remaining columns of ``df`` in their
        original order, followed by the indicator columns.
    """
    dummies = pd.get_dummies(df[column], prefix=prefix)
    # drop() and concat() both build new objects, so the caller's frame is
    # never touched and no defensive copy of the whole frame is required.
    return pd.concat([df.drop(columns=column), dummies], axis=1)
35
+
36
# Encoding the categorical data in the dataset to numerical
def data_encoding(data):
    """Convert the categorical columns of the loans data to numeric ones.

    ``emp_title`` is replaced by its integer category code, while
    ``homeownership`` and ``loan_purpose`` are one-hot encoded with the
    prefixes ``ho`` and ``lp``.

    Args:
        data: pandas.DataFrame containing the ``emp_title``,
            ``homeownership`` and ``loan_purpose`` columns. Not modified.

    Returns:
        A new pandas.DataFrame with the three columns encoded.
    """
    # Work on a copy so the caller's frame keeps its original emp_title text.
    df = data.copy()

    # Overwrite emp_title with one integer code per distinct title.
    # NOTE(review): the codes depend on the titles present in `data`, so they
    # are only comparable between frames holding the same set of titles.
    df['emp_title'] = df['emp_title'].astype('category').cat.codes

    # One-hot encoding
    df = onehot_encode(df, 'homeownership', prefix="ho")
    df = onehot_encode(df, 'loan_purpose', 'lp')

    return df
51
+
52
# Scaling the data and reducing its dimensionality
def data_normalization(data, n_components=26):
    """Split off the target, standardize the features and apply PCA.

    Args:
        data: Fully numeric pandas.DataFrame that includes the target
            column ``account_never_delinq_percent``.
        n_components: Value forwarded to ``PCA(n_components=...)``.
            Defaults to 26, the value this pipeline was written with.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the array returned by
        ``PCA.fit_transform`` and ``y`` is a copy of the target column.
    """
    target = 'account_never_delinq_percent'

    # Splitting the data into dependent and independent variables
    y = data[target].copy()
    features = data.drop(target, axis=1)

    # NOTE(review): the scaler and PCA are fit on every row handed in, and
    # train() only splits into train/test afterwards, so the test rows
    # influence the transformation. The fitted objects are also discarded
    # here, so new data cannot be transformed the same way later.
    scaled = StandardScaler().fit_transform(features)

    # Carrying out PCA to reduce dimensionality
    X = PCA(n_components=n_components).fit_transform(scaled)
    return X, y
64
+
65
+
66
# Preprocessing inputs to train model
def preprocessing_inputs(data):
    """Run cleaning, encoding and normalization on the loans data.

    Args:
        data: Raw loans pandas.DataFrame. If ``None``, the dataset is
            loaded with ``read_data()``.

    Returns:
        Tuple ``(X, y)`` as produced by ``data_normalization``.
    """
    # Use the frame the caller supplied; previously the argument was ignored
    # and the CSV was read from disk a second time.
    if data is None:
        data = read_data()

    # Clean a copy so this pipeline never alters the caller's frame.
    cleaned = data_clean(data.copy())
    encoded = data_encoding(cleaned)
    return data_normalization(encoded)
73
+
74
# Training the model
def train(data, model_path='model.pickle'):
    """Fit a Ridge regression on degree-2 polynomial features and save it.

    Args:
        data: Raw loans pandas.DataFrame, passed to ``preprocessing_inputs``.
        model_path: File the fitted model is pickled to. Defaults to
            ``'model.pickle'`` in the current working directory.

    Returns:
        Tuple ``(X_test_poly, best_ridge, y_test)``: the polynomial features
        of the held-out 10% split, the fitted Ridge model and the matching
        targets.
    """
    # Preprocess inputs
    X, y = preprocessing_inputs(data)

    # Split the given dataset into train and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.9, random_state=42
    )

    # Adding polynomial terms to the inputs to capture non-linear effects;
    # the expansion is fit on the training split and reused for the test split.
    poly = PolynomialFeatures(degree=2)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    # Cross-validated search over the Ridge regularization strength
    param_grid = {'alpha': np.logspace(-3, 3, 10)}
    grid_search = GridSearchCV(Ridge(), param_grid, cv=5)
    grid_search.fit(X_train_poly, y_train)

    # Print the best alpha and score
    print('Best alpha:', grid_search.best_params_)
    print('Best score:', grid_search.best_score_)

    # GridSearchCV refits the best configuration on the whole training split
    # by default (refit=True), so best_estimator_ is already trained and a
    # second fit() on the same data is unnecessary.
    best_ridge = grid_search.best_estimator_

    # Save the trained model as a pickle file.
    # NOTE(review): only the Ridge estimator is stored; the polynomial
    # expansion fitted above is not part of the pickle.
    with open(model_path, 'wb') as f:
        pickle.dump(best_ridge, f)

    return X_test_poly, best_ridge, y_test
100
+
101
+
102
# Carrying out predictions
def predict(X_test_poly, model, y_test):
    """Predict the target for ``X_test_poly`` and bound it to [0, 100].

    Args:
        X_test_poly: Feature matrix in the form the model was trained on.
        model: Fitted estimator exposing ``predict``.
        y_test: True targets. Not used here; kept so existing callers keep
            working and can score the returned predictions themselves.

    Returns:
        The model's predictions clipped to the range 0 to 100.
    """
    y_pred = model.predict(X_test_poly)
    # The target is a percentage, so values below 0 are as invalid as values
    # above 100; clip both ends rather than only the upper one.
    return y_pred.clip(0, 100)
111
+
112
if __name__ == '__main__':
    # Script entry point: load the CSV, train and pickle the model, then run
    # it on the held-out split.
    held_out_features, fitted_model, held_out_targets = train(read_data())
    predict(held_out_features, fitted_model, held_out_targets)
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
loans_clean_schema.csv ADDED
The diff for this file is too large to render. See raw diff
 
model.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5a14b25dbcc00fba54fa5bd000fe8ceed2078fafb9b21c3650da8907900546a
3
+ size 3496