# import necessary libraries and utilities
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Alternative loader kept for reference: fetch the CSV directly from GitHub
# def read_data(url='https://github.com/amaysood/Cybersprint/raw/main/loans_clean_schema.csv'):
#     data = pd.read_csv(url)
#     return data

# Function that reads the required csv into a DataFrame
def read_data():
    data = pd.read_csv('loans_clean_schema.csv')
    return data

# Remove rows with missing values from the dataset
def data_clean(df):
    df = df.dropna(axis=0)
    return df

# Helper for one-hot encoding a single column (used by data_encoding below)
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

# Encode the categorical columns in the dataset as numbers
def data_encoding(data):
    # Convert emp_title to the category dtype...
    data['emp_title'] = data['emp_title'].astype('category')
    # ...and replace each category with its integer code, stored back in the same column
    data['emp_title'] = data['emp_title'].cat.codes
    # One-hot encode the remaining categorical columns
    df = onehot_encode(data, 'homeownership', prefix='ho')
    df = onehot_encode(df, 'loan_purpose', 'lp')
    return df

# Scale the features and reduce dimensionality
def data_normalization(data):
    # Split the data into the dependent and independent variables
    y = data['account_never_delinq_percent'].copy()
    X = data.drop('account_never_delinq_percent', axis=1).copy()
    # Standardize the features to zero mean and unit variance
    scaling = StandardScaler()
    X = pd.DataFrame(scaling.fit_transform(X), columns=X.columns)
    # Carry out PCA to reduce dimensionality to 26 components
    pca = PCA(n_components=26)
    X = pca.fit_transform(X)
    return X, y

# Preprocess the inputs used to train the model
def preprocessing_inputs(data):
    df = data_clean(data)
    df = data_encoding(df)
    X, y = data_normalization(df)
    return X, y

# Train the model
def train(data):
    # preprocess inputs
    X, y = preprocessing_inputs(data)
    # split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)
    # use Ridge regression, tuned by cross-validation below
    model = Ridge()
    # add polynomial features (degree 2) so the model can capture non-linear relationships
    poly = PolynomialFeatures(degree=2)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # cross-validated grid search over alpha for Ridge regression
    param_grid = {'alpha': np.logspace(-3, 3, 10)}
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train_poly, y_train)
    # print the best alpha and score
    print('Best alpha:', grid_search.best_params_)
    print('Best score:', grid_search.best_score_)
    # GridSearchCV refits the best estimator on the full training set by default (refit=True)
    best_ridge = grid_search.best_estimator_
    # save the trained model as a pickle file
    with open('model.pickle', 'wb') as f:
        pickle.dump(best_ridge, f)
    return X_test_poly, best_ridge, y_test

# Carry out predictions and report scoring metrics
def predict(X_test_poly, model, y_test):
    y_pred = model.predict(X_test_poly)
    # the target is a percentage, so cap predictions at 100
    y_pred = y_pred.clip(None, 100)
    # scoring metrics
    print('R2:', r2_score(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    return y_pred

if __name__ == '__main__':
    data = read_data()
    X_test_poly, model, y_test = train(data)
    predict(X_test_poly, model, y_test)
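
# A minimal usage sketch (an addition, not part of the original pipeline)
# showing how the pickled model could be reloaded for later inference.
# Caveat: train() does not persist the fitted StandardScaler, PCA, or
# PolynomialFeatures objects, so raw rows cannot be preprocessed at load
# time; the sketch therefore assumes inputs that were already transformed,
# such as the X_test_poly returned by train().
#
# with open('model.pickle', 'rb') as f:
#     loaded_model = pickle.load(f)
# y_pred = loaded_model.predict(X_test_poly).clip(None, 100)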