Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*-
"""LoanEligibilityPrediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15wGr9tHgIq7Ua4af83Z0UqfAsH8dyOEZ

# LIBRARY IMPORTS
"""
# Commented out IPython magic to ensure Python compatibility. | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import gradio as gr | |
import matplotlib.pyplot as plt | |
# %matplotlib inline | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.preprocessing import StandardScaler | |
"""# COLLEZIONE DATI""" | |
# Location of the raw loan dataset (CSV file hosted on GitHub).
url = "https://raw.githubusercontent.com/livio-24/LoanEligibilityPrediction/main/dataset.csv"
# Load the dataset into a pandas DataFrame.
dataset = pd.read_csv(url)

# ---------------------------------------------------------------------------
# EXPLORATORY DATA ANALYSIS
# ---------------------------------------------------------------------------
# The bare expressions below are notebook-cell leftovers: when this file runs
# as a plain script their values are discarded (only dataset.info() prints).
dataset.head()      # first 5 rows
dataset.shape       # number of rows and columns
dataset.describe()  # summary statistics
# Column information: 5 numeric variables and 8 categorical variables.
dataset.info()
# Distribution of the target variable.
dataset['Loan_Status'].value_counts()
# Number of missing values in each column; they are dealt with later, during
# the data cleaning phase.
dataset.isnull().sum()
# Drop the Loan_ID column because it is not useful.
dataset = dataset.drop(columns=['Loan_ID'])
dataset.head()

# ---------------------------------------------------------------------------
# DATA VISUALIZATION - UNIVARIATE ANALYSIS
# Categorical variables
# ---------------------------------------------------------------------------
# Share of each category (as a fraction of all rows) for the categorical
# variables, one bar chart per column.
for column in ('Gender', 'Married', 'Self_Employed', 'Credit_History'):
    dataset[column].value_counts(normalize=True).plot.bar(title=column)
    plt.show()
# Findings recorded by the original analysis:
# - 80% of the applicants in the dataset are male
# - About 65% of the applicants are married
# - About 15% are self-employed
# - About 85% have repaid their debts

# Ordinal variables: same chart, one per column.
for column in ('Dependents', 'Education', 'Property_Area'):
    dataset[column].value_counts(normalize=True).plot.bar(title=column)
    plt.show()
# Findings recorded by the original analysis:
# - Most applicants have no dependents
# - About 80% of the applicants are graduates
# - Most applicants live in a semiurban area

# Numeric variables: distribution plot followed by a box plot used to spot
# outliers. sns.distplot is deprecated, so use sns.histplot (already used
# elsewhere in this file); kde=True with stat='density' gives the same kind
# of density histogram with a KDE curve.
for column in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'):
    sns.histplot(data=dataset, x=column, kde=True, stat='density')
    plt.show()
    dataset.boxplot([column])
    plt.show()

# Most of the numeric features have outliers.
#
# Correlation matrix
# The categorical columns still hold strings here (they are encoded later in
# the file), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently skipping them. Restrict the computation
# to the numeric columns, which is what older pandas versions did implicitly.
correlation_matrix = dataset.select_dtypes(include='number').corr()
# Heat map to visualise the correlation matrix.
sns.heatmap(correlation_matrix, cbar=True, fmt='.1f', annot=True, cmap='coolwarm')
# Finding recorded by the original analysis: few variables are correlated
# with each other; the only notable pair is ApplicantIncome - LoanAmount.
# Encode the categorical variables as integers. infer_objects() re-derives
# numeric dtypes for the replaced columns explicitly rather than relying on
# replace() to downcast them, which newer pandas versions deprecate.
dataset = dataset.replace({
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
    'Loan_Status': {'N': 0, 'Y': 1},
}).infer_objects()
# The '3+' dependents category is encoded as 4. The result is assigned back:
# an inplace replace on a column selection is a chained in-place update,
# which pandas deprecates and which has no effect under Copy-on-Write.
dataset['Dependents'] = dataset['Dependents'].replace(to_replace='3+', value=4)

# ---------------------------------------------------------------------------
# DATA CLEANING
# Missing-value check
# ---------------------------------------------------------------------------
dataset.isnull().sum()
# Fill missing values of the categorical variables with the mode. As above,
# assign the filled column back instead of calling fillna(inplace=True) on a
# column selection.
for column in ('Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History'):
    dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
# LoanAmount has outliers, so the median is used rather than the mean.
dataset['LoanAmount'] = dataset['LoanAmount'].fillna(dataset['LoanAmount'].median())
dataset['Loan_Amount_Term'].value_counts()
# The original analysis notes that 360 is by far the most frequent
# Loan_Amount_Term value, so the mode is used for this column as well.
dataset['Loan_Amount_Term'] = dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mode()[0])
dataset.isnull().sum()
# Convert Dependents to int; it goes through str first because the column
# mixes string labels with the integer 4 introduced above.
dataset['Dependents'] = dataset['Dependents'].astype(str).astype(int)
dataset.info()

# ---------------------------------------------------------------------------
# OUTLIER HANDLING
# ---------------------------------------------------------------------------
# Column, subplot position and colour for each skewed monetary feature; the
# fourth panel of each 2x2 grid is left empty.
panels = (
    ("ApplicantIncome", (0, 0), 'green'),
    ("CoapplicantIncome", (0, 1), 'skyblue'),
    ("LoanAmount", (1, 0), 'orange'),
)

# Distributions before applying the logarithm.
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
for feature, position, colour in panels:
    sns.histplot(data=dataset, x=feature, kde=True, ax=axs[position], color=colour)

# Log transformation to bring the distributions closer to normal.
# CoapplicantIncome is shifted by 1 before taking the logarithm.
dataset['ApplicantIncome'] = np.log(dataset['ApplicantIncome'])
dataset['CoapplicantIncome'] = np.log(dataset['CoapplicantIncome'] + 1)
dataset['LoanAmount'] = np.log(dataset['LoanAmount'])

# Distributions after applying the logarithm.
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
for feature, position, colour in panels:
    sns.histplot(data=dataset, x=feature, kde=True, ax=axs[position], color=colour)

# Finding recorded by the original analysis: the distributions improved after
# applying the logarithm.
#
# ---------------------------------------------------------------------------
# DATASET SPLIT
# ---------------------------------------------------------------------------
# Independent variables (features) and dependent variable (target).
x = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']
# 80/20 split, stratified on the target so both splits keep its class ratio.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
print("X_train dataset: ", X_train.shape)
print("y_train dataset: ", y_train.shape)
print("X_test dataset: ", X_test.shape)
print("y_test dataset: ", y_test.shape)
y_test.value_counts()
# Distribution of the dependent variable.
plt.figure(figsize=(5, 5))
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases.
dataset['Loan_Status'].value_counts().plot.bar()
plt.xlabel('Loan_Status')
plt.ylabel('Frequency')
dataset['Loan_Status'].value_counts()
plt.savefig('target_distr', bbox_inches='tight')

# ---------------------------------------------------------------------------
# DATA SCALING
# ---------------------------------------------------------------------------
# Min-max normalisation to the [0, 1] range.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split. Refitting it on the test split would scale the two
# splits with different minima/maxima, and would leave `scaler` (reused later
# in this file to scale user input) fitted on test data rather than on the
# data the models are trained on.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Alternative considered by the original analysis: z-score standardisation
# with StandardScaler, fitted on the training split and applied to the test
# split in the same way.
df = pd.DataFrame(X_train, columns=x.columns)
df

# ---------------------------------------------------------------------------
# FEATURE SELECTION
# ---------------------------------------------------------------------------
# Supervised feature selection: keep the 5 features with the highest
# chi-squared score with respect to the target.
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from numpy import set_printoptions

fs = SelectKBest(score_func=chi2, k=5)
# Fit the selector on the training split and reduce both splits with it.
X_new_train = fs.fit_transform(X_train, y_train)
X_new_test = fs.transform(X_test)
print(X_new_train.shape)
# Names of the columns retained by the selector.
selected_features = x.columns[fs.get_support(indices=True)]
print("features selezionate: ", selected_features.tolist())

# ---------------------------------------------------------------------------
# MODEL BUILDING
# ---------------------------------------------------------------------------
# Per-model test metrics, appended in training order and combined into the
# comparison table at the end of this section.
models = []
precision = []
accuracy = []
recall = []
f1 = []

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed from scikit-learn (importing it fails on current releases).
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


def _fit_and_evaluate(name, estimator):
    """Train one classifier on the selected features and record its metrics.

    Fits ``estimator`` on ``X_new_train``/``y_train``, shows its confusion
    matrix on the test split, prints the classification report and the
    train/test accuracy, and appends the test metrics to the module-level
    ``models``, ``accuracy``, ``recall``, ``precision`` and ``f1`` lists.

    Args:
        name: Label stored in ``models`` for the comparison table.
        estimator: Unfitted scikit-learn compatible classifier; it is fitted
            in place.

    Returns:
        Tuple ``(train_predictions, test_predictions)``.
    """
    estimator.fit(X_new_train, y_train)
    train_pred = estimator.predict(X_new_train)
    test_pred = estimator.predict(X_new_test)

    # Always plot the estimator that was just fitted (the original Decision
    # Tree section plotted the logistic regression model by mistake).
    _, ax = plt.subplots(figsize=(8, 8))
    ConfusionMatrixDisplay.from_estimator(estimator, X_new_test, y_test, ax=ax)
    plt.show()

    print(classification_report(y_test, test_pred))
    print("Accuracy on training data:", accuracy_score(y_train, train_pred))
    print("Accuracy on test data:", accuracy_score(y_test, test_pred))

    models.append(name)
    accuracy.append(accuracy_score(y_test, test_pred))
    recall.append(recall_score(y_test, test_pred))
    precision.append(precision_score(y_test, test_pred))
    f1.append(f1_score(y_test, test_pred))
    return train_pred, test_pred


# LOGISTIC REGRESSION
logisticRegr = LogisticRegression()
y_train_pred, y_test_pred = _fit_and_evaluate('Logistic Regression', logisticRegr)

# DECISION TREE
tree_model = DecisionTreeClassifier(random_state=42)
y_train_pred, y_test_pred = _fit_and_evaluate('Decision Tree', tree_model)

# NAIVE BAYES
NB = GaussianNB()
y_train_pred, y_test_pred = _fit_and_evaluate('Naive Bayes', NB)

# RANDOM FOREST
RandomForest = RandomForestClassifier()
y_train_pred, y_test_pred = _fit_and_evaluate('Random Forest', RandomForest)

# XGBOOST
XGB = XGBClassifier()
y_train_pred, y_test_pred = _fit_and_evaluate('XGBoost', XGB)

# METRICS COMPARISON
compare = pd.DataFrame({'Model': models,
                        'Accuracy': accuracy,
                        'Precision': precision,
                        'Recall': recall,
                        'f1_score': f1})
# Notebook leftover: the sorted view is not stored, `compare` keeps the
# training order.
compare.sort_values(by='Accuracy', ascending=False)
def loan(Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area):
    """Predict loan eligibility for a single applicant.

    The raw form values are encoded and transformed the same way the training
    data was prepared in this file: categorical labels become integer codes,
    the income and loan amounts are log-transformed, the row is scaled with
    the module-level ``scaler``, reduced to the selected features with the
    module-level ``fs`` and classified with ``logisticRegr``.

    Args:
        Gender: 'Male' or 'Female'.
        Married: 'Yes' or 'No'.
        Dependents: '0', '1', '2' or '3+'.
        Education: 'Graduate' or 'Not Graduate'.
        Self_Employed: 'Yes' or 'No'.
        ApplicantIncome: Applicant income; must be positive.
        CoapplicantIncome: Co-applicant income; must not be negative.
        LoanAmount: Requested loan amount; must be positive.
        Loan_Amount_Term: Loan term.
        Credit_History: '0: bad credit history' or '1: good credit history'.
        Property_Area: 'Rural', 'Urban' or 'Semiurban'.

    Returns:
        "Loan approved" or "Loan not approved".

    Raises:
        ValueError: If a required value is missing or an amount is outside
            the range on which the logarithm is defined.
    """
    required = (Dependents, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term)
    if any(value is None for value in required):
        raise ValueError("Dependents and all numeric fields are required")
    # The training data was log-transformed, so non-positive amounts would
    # produce -inf/NaN features instead of a meaningful prediction.
    if ApplicantIncome <= 0 or LoanAmount <= 0 or CoapplicantIncome < 0:
        raise ValueError(
            "ApplicantIncome and LoanAmount must be positive and "
            "CoapplicantIncome must not be negative"
        )

    # Same integer codes and log transforms as applied to the training data.
    features = {
        'Gender': 0 if Gender == 'Male' else 1,
        'Married': 0 if Married == 'No' else 1,
        # The form delivers strings; '3+' is encoded as 4 like in training.
        'Dependents': 4 if Dependents == '3+' else int(Dependents),
        'Education': 0 if Education == 'Not Graduate' else 1,
        'Self_Employed': 0 if Self_Employed == 'No' else 1,
        'ApplicantIncome': np.log(ApplicantIncome),
        'CoapplicantIncome': np.log(CoapplicantIncome + 1),
        'LoanAmount': np.log(LoanAmount),
        'Loan_Amount_Term': Loan_Amount_Term,
        'Credit_History': 0 if Credit_History == '0: bad credit history' else 1,
        'Property_Area': {'Rural': 0, 'Urban': 1}.get(Property_Area, 2),
    }
    # Order the columns exactly like the training features so that each value
    # is scaled with the minimum/maximum of its own column.
    instance = pd.DataFrame([features], columns=x.columns)
    scaled = scaler.transform(instance)
    # Keep the columns chosen by the fitted feature selector rather than
    # hard-coding their positions.
    selected = fs.transform(scaled)
    prediction = logisticRegr.predict(selected)
    return "Loan approved" if prediction[0] == 1 else "Loan not approved"
# Input widgets, listed in the same order as the parameters of `loan`.
loan_inputs = [
    gr.Radio(['Male', 'Female']),                  # Gender
    gr.Radio(['Yes', 'No']),                       # Married
    gr.Radio(['0', '1', '2', '3+']),               # Dependents
    gr.Radio(['Graduate', 'Not Graduate']),        # Education
    gr.Radio(['Yes', 'No']),                       # Self_Employed
    "number",                                      # ApplicantIncome
    "number",                                      # CoapplicantIncome
    "number",                                      # LoanAmount
    "number",                                      # Loan_Amount_Term
    gr.Radio(['0: bad credit history', '1: good credit history']),  # Credit_History
    gr.Radio(['Urban', 'Semiurban', 'Rural']),     # Property_Area
]

# Gradio front end: one text output holding the prediction returned by `loan`.
app = gr.Interface(
    fn=loan,
    inputs=loan_inputs,
    outputs="text",
    title="Loan Eligibility Prediction",
)
app.launch(debug=True)