# -*- coding: utf-8 -*-
"""LoanEligibilityPrediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15wGr9tHgIq7Ua4af83Z0UqfAsH8dyOEZ

# IMPORT LIBRARIES
"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import seaborn as sns
import gradio as gr
import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

"""# DATA COLLECTION"""

url = "https://raw.githubusercontent.com/livio-24/LoanEligibilityPrediction/main/dataset.csv"
# load the dataset into a pandas DataFrame
dataset = pd.read_csv(url)

"""# EXPLORATORY DATA ANALYSIS"""

# first 5 rows
dataset.head()

# number of rows and columns
dataset.shape

# summary statistics
dataset.describe()

# column info
# 5 numerical variables and 8 categorical variables
dataset.info()

# distribution of the target variable
dataset['Loan_Status'].value_counts()

# number of missing values in each column
# they will be handled later, in the data cleaning phase
dataset.isnull().sum()

# drop the Loan_ID column, since it carries no predictive information
dataset.drop(columns='Loan_ID', inplace=True)
dataset.head()

"""**DATA VISUALIZATION - UNIVARIATE ANALYSIS**

CATEGORICAL VARIABLES
"""

# plot the relative frequencies of the categorical variables
dataset['Gender'].value_counts(normalize=True).plot.bar(title='Gender')
plt.show()
dataset['Married'].value_counts(normalize=True).plot.bar(title='Married')
plt.show()
dataset['Self_Employed'].value_counts(normalize=True).plot.bar(title='Self_Employed')
plt.show()
dataset['Credit_History'].value_counts(normalize=True).plot.bar(title='Credit_History')
plt.show()

"""Results:

- 80% of the applicants in the dataset are male
- About 65% of the applicants in the dataset are married
- About 15% are self-employed
- About 85% have repaid their debts

ORDINAL VARIABLES
"""

# plot the relative frequencies of the ordinal variables
dataset['Dependents'].value_counts(normalize=True).plot.bar(title='Dependents')
plt.show()
dataset['Education'].value_counts(normalize=True).plot.bar(title='Education')
plt.show()
dataset['Property_Area'].value_counts(normalize=True).plot.bar(title='Property_Area')
plt.show()

"""Results:

- Most applicants have no dependents
- About 80% of the applicants are graduates
- Most applicants live in a semiurban area

NUMERICAL VARIABLES
"""

# distribution of the 'ApplicantIncome' variable
# (histplot with kde=True replaces the deprecated sns.distplot)
sns.histplot(dataset['ApplicantIncome'], kde=True)
plt.show()
# boxplot to spot outliers
dataset.boxplot(['ApplicantIncome'])
plt.show()

# distribution of the 'CoapplicantIncome' variable
sns.histplot(dataset['CoapplicantIncome'], kde=True)
plt.show()
# boxplot to spot outliers
dataset.boxplot(['CoapplicantIncome'])
plt.show()

# distribution of the 'LoanAmount' variable
sns.histplot(dataset['LoanAmount'], kde=True)
plt.show()
dataset.boxplot(['LoanAmount'])
plt.show()
#dataset['LoanAmount'].hist(bins=20)

# distribution of the 'Loan_Amount_Term' variable
sns.histplot(dataset['Loan_Amount_Term'], kde=True)
plt.show()
dataset.boxplot(['Loan_Amount_Term'])
plt.show()

"""Most of the numerical features have outliers.

**Correlation matrix**
"""

# numeric_only avoids errors on the categorical columns, which are still strings at this point
correlation_matrix = dataset.corr(numeric_only=True)
# heat map to visualize the correlation matrix
sns.heatmap(correlation_matrix, cbar=True, fmt='.1f', annot=True, cmap='coolwarm')
plt.show()
#plt.savefig('Correlation Heat map', bbox_inches='tight')

"""There are not many correlated variables; the only notable pair is ApplicantIncome - LoanAmount."""
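"""The boxplots above flag outliers only visually; the sketch below quantifies them with the same 1.5 x IQR whisker rule the boxplots use. This cell is an illustrative addition, not part of the original pipeline, and `count_outliers` is a hypothetical helper."""

# Sketch: count the values falling outside the 1.5*IQR whiskers of each boxplot
def count_outliers(series):
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return int(((series < lower) | (series > upper)).sum())

for col in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']:
    print(col, count_outliers(dataset[col].dropna()))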
# convert categorical variables into numerical ones
dataset.replace({'Gender': {'Male': 0, 'Female': 1},
                 'Married': {'No': 0, 'Yes': 1},
                 'Education': {'Not Graduate': 0, 'Graduate': 1},
                 'Self_Employed': {'No': 0, 'Yes': 1},
                 'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
                 'Loan_Status': {'N': 0, 'Y': 1}}, inplace=True)

# replace the value '3+' with 4
dataset['Dependents'].replace(to_replace='3+', value=4, inplace=True)

"""# DATA CLEANING

**MISSING VALUES CHECK**
"""

dataset.isnull().sum()

# replace missing values with the mode for the categorical variables
dataset['Gender'].fillna(dataset['Gender'].mode()[0], inplace=True)
dataset['Married'].fillna(dataset['Married'].mode()[0], inplace=True)
dataset['Dependents'].fillna(dataset['Dependents'].mode()[0], inplace=True)
dataset['Self_Employed'].fillna(dataset['Self_Employed'].mode()[0], inplace=True)
dataset['Credit_History'].fillna(dataset['Credit_History'].mode()[0], inplace=True)

# use the median because the variable has outliers, so the mean is not a good choice
dataset['LoanAmount'].fillna(dataset['LoanAmount'].median(), inplace=True)
#dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean(), inplace=True)

dataset['Loan_Amount_Term'].value_counts()

# 360 is by far the most frequent value of Loan_Amount_Term, so we use the mode
dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mode()[0], inplace=True)

dataset.isnull().sum()

# cast the Dtype of Dependents to int
dataset['Dependents'] = dataset['Dependents'].astype(str).astype(int)
dataset.info()

"""**OUTLIER HANDLING**"""

fig, axs = plt.subplots(2, 2, figsize=(10, 8))
# distributions before applying the log transform
sns.histplot(data=dataset, x="ApplicantIncome", kde=True, ax=axs[0, 0], color='green')
sns.histplot(data=dataset, x="CoapplicantIncome", kde=True, ax=axs[0, 1], color='skyblue')
sns.histplot(data=dataset, x="LoanAmount", kde=True, ax=axs[1, 0], color='orange')

# log transformation to make the distributions closer to normal
# (log(x + 1) for CoapplicantIncome, which contains zeros)
dataset.ApplicantIncome = np.log(dataset.ApplicantIncome)
dataset.CoapplicantIncome = np.log(dataset.CoapplicantIncome + 1)
dataset.LoanAmount = np.log(dataset.LoanAmount)

fig, axs = plt.subplots(2, 2, figsize=(10, 8))
# distributions after applying the log transform
sns.histplot(data=dataset, x="ApplicantIncome", kde=True, ax=axs[0, 0], color='green')
sns.histplot(data=dataset, x="CoapplicantIncome", kde=True, ax=axs[0, 1], color='skyblue')
sns.histplot(data=dataset, x="LoanAmount", kde=True, ax=axs[1, 0], color='orange')

"""The distributions are noticeably closer to normal after applying the logarithm.

# SPLIT DATASET
"""

# define the independent and dependent variables
x = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
print("X_train dataset: ", X_train.shape)
print("y_train dataset: ", y_train.shape)
print("X_test dataset: ", X_test.shape)
print("y_test dataset: ", y_test.shape)

y_test.value_counts()

# distribution of the dependent variable
plt.figure(figsize=(5, 5))
dataset['Loan_Status'].value_counts().plot.bar()
plt.xlabel('Loan_Status')
plt.ylabel('Frequency')
plt.savefig('target_distr', bbox_inches='tight')
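"""The bar plot above shows that the target is imbalanced, so the accuracy figures reported below should be read against the majority-class baseline. A minimal sketch, added here for context rather than taken from the original notebook:"""

# Sketch: the accuracy a trivial "always predict the majority class" classifier
# would achieve on the test set; any model below should beat this number to be useful
baseline_acc = y_test.value_counts(normalize=True).max()
print("Majority-class baseline accuracy:", round(baseline_acc, 3))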
"""# DATA SCALING"""

# normalization (min-max scaling into [0, 1])
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
# transform (not fit_transform) on the test set, so no test-set statistics leak into the scaler
X_test = scaler.transform(X_test)

#z-score standardization
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

df = pd.DataFrame(X_train, columns=x.columns)
df

"""# FEATURE SELECTION"""

# supervised feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

fs = SelectKBest(score_func=chi2, k=5)
fs.fit(X_train, y_train)
X_new_train = fs.transform(X_train)
X_new_test = fs.transform(X_test)
print(X_new_train.shape)
print("selected features: ", x.columns[fs.get_support(indices=True)].tolist())

"""# MODEL BUILDING"""

models = []
precision = []
accuracy = []
recall = []
f1 = []

"""**LOGISTIC REGRESSION**"""

from sklearn.linear_model import LogisticRegression
# ConfusionMatrixDisplay replaces plot_confusion_matrix, removed in scikit-learn 1.2
from sklearn.metrics import (classification_report, confusion_matrix, ConfusionMatrixDisplay,
                             accuracy_score, recall_score, precision_score, f1_score)

logisticRegr = LogisticRegression()
logisticRegr.fit(X_new_train, y_train)

y_train_pred = logisticRegr.predict(X_new_train)
y_test_pred = logisticRegr.predict(X_new_test)

fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(logisticRegr, X_new_test, y_test, ax=ax)
plt.show()
#print(confusion_matrix(y_test, y_test_pred))

# results
print(classification_report(y_test, y_test_pred))
print("Accuracy on training data:", accuracy_score(y_train, y_train_pred))
print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))

models.append('Logistic Regression')
accuracy.append(accuracy_score(y_test, y_test_pred))
recall.append(recall_score(y_test, y_test_pred))
precision.append(precision_score(y_test, y_test_pred))
f1.append(f1_score(y_test, y_test_pred))

"""**DECISION TREE**"""

from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_new_train, y_train)

y_train_pred = tree_model.predict(X_new_train)
y_test_pred = tree_model.predict(X_new_test)

fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(tree_model, X_new_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_test_pred))
print("Accuracy on training data:", accuracy_score(y_train, y_train_pred))
print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))

models.append('Decision Tree')
accuracy.append(accuracy_score(y_test, y_test_pred))
recall.append(recall_score(y_test, y_test_pred))
precision.append(precision_score(y_test, y_test_pred))
f1.append(f1_score(y_test, y_test_pred))

"""**NAIVE BAYES**"""

from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(X_new_train, y_train)

y_train_pred = NB.predict(X_new_train)
y_test_pred = NB.predict(X_new_test)

fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(NB, X_new_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_test_pred))
print("Accuracy on training data:", accuracy_score(y_train, y_train_pred))
print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))

models.append('Naive Bayes')
accuracy.append(accuracy_score(y_test, y_test_pred))
recall.append(recall_score(y_test, y_test_pred))
precision.append(precision_score(y_test, y_test_pred))
f1.append(f1_score(y_test, y_test_pred))
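"""Each model section repeats the same fit / confusion-matrix / report / append boilerplate. A hypothetical helper like `evaluate_model` below could replace those blocks; it is shown only as a sketch and is not called by the cells before or after it."""

# Sketch: a hypothetical refactoring of the repeated per-model evaluation code
def evaluate_model(name, model):
    model.fit(X_new_train, y_train)
    y_pred = model.predict(X_new_test)
    print(classification_report(y_test, y_pred))
    models.append(name)
    accuracy.append(accuracy_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))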
"""**RANDOM FOREST**"""

from sklearn.ensemble import RandomForestClassifier

RandomForest = RandomForestClassifier()
RandomForest.fit(X_new_train, y_train)

y_train_pred = RandomForest.predict(X_new_train)
y_test_pred = RandomForest.predict(X_new_test)

fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(RandomForest, X_new_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_test_pred))
print("Accuracy on training data:", accuracy_score(y_train, y_train_pred))
print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))

models.append('Random Forest')
accuracy.append(accuracy_score(y_test, y_test_pred))
recall.append(recall_score(y_test, y_test_pred))
precision.append(precision_score(y_test, y_test_pred))
f1.append(f1_score(y_test, y_test_pred))

"""**XGBOOST**"""

from xgboost import XGBClassifier

XGB = XGBClassifier()
XGB.fit(X_new_train, y_train)

y_train_pred = XGB.predict(X_new_train)
y_test_pred = XGB.predict(X_new_test)

fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_estimator(XGB, X_new_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_test_pred))
print("Accuracy on training data:", accuracy_score(y_train, y_train_pred))
print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))

models.append('XGBoost')
accuracy.append(accuracy_score(y_test, y_test_pred))
recall.append(recall_score(y_test, y_test_pred))
precision.append(precision_score(y_test, y_test_pred))
f1.append(f1_score(y_test, y_test_pred))

"""**METRICS COMPARISON**"""

compare = pd.DataFrame({'Model': models, 'Accuracy': accuracy, 'Precision': precision,
                        'Recall': recall, 'f1_score': f1})
compare.sort_values(by='Accuracy', ascending=False)
#print(compare.to_latex())

"""# GRADIO APP"""

def loan(Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome,
         CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area):
    # encode the inputs with the same mapping used on the training data
    Gen = 0 if Gender == 'Male' else 1
    Marr = 0 if Married == 'No' else 1
    Educ = 0 if Education == 'Not Graduate' else 1
    SelfEmp = 0 if Self_Employed == 'No' else 1
    CredHis = 0 if Credit_History == '0: bad credit history' else 1
    Dep = 4 if Dependents == '3+' else int(Dependents)
    if Property_Area == 'Rural':
        PA = 0
    elif Property_Area == 'Urban':
        PA = 1
    else:
        PA = 2
    # apply the same log transforms used in preprocessing
    inc = np.log(ApplicantIncome)
    coinc = np.log(CoapplicantIncome + 1)
    amount = np.log(LoanAmount)
    # build a 2D row in the training column order (x.columns), scale it with the
    # fitted scaler, then keep only the features chosen by SelectKBest
    row = np.array([[Gen, Marr, Dep, Educ, SelfEmp, inc, coinc,
                     amount, Loan_Amount_Term, CredHis, PA]])
    row = fs.transform(scaler.transform(row))
    prediction = logisticRegr.predict(row)
    return "Loan approved" if prediction[0] == 1 else "Loan not approved"

app = gr.Interface(fn=loan,
                   inputs=[gr.Radio(['Male', 'Female']),
                           gr.Radio(['Yes', 'No']),
                           gr.Radio(['0', '1', '2', '3+']),
                           gr.Radio(['Graduate', 'Not Graduate']),
                           gr.Radio(['Yes', 'No']),
                           "number", "number", "number", "number",
                           gr.Radio(['0: bad credit history', '1: good credit history']),
                           gr.Radio(['Urban', 'Semiurban', 'Rural'])],
                   outputs="text",
                   title="Loan Eligibility Prediction")
app.launch(debug=True)
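"""With only a few hundred rows, the metrics from a single 80/20 split are noisy; stratified k-fold cross-validation would give a more stable comparison. A minimal sketch under the preprocessing assumptions above (reusing the fitted scaler and feature selector on the full feature matrix), not part of the original pipeline:"""

# Sketch: 5-fold stratified cross-validation of the logistic regression
# on the scaled, selected features, for a more stable accuracy estimate
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_all = fs.transform(scaler.transform(x))
scores = cross_val_score(LogisticRegression(), X_all, y, cv=cv, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))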