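# NOTE: the next six lines appear to be residue from the web page this file
# was copied from (not part of the program); kept as comments so the module parses.
# Liviox24's picture
# Create app.py
# d9ceac2
# raw
# history blame
# 13.9 kB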
# -*- coding: utf-8 -*-
"""LoanEligibilityPrediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15wGr9tHgIq7Ua4af83Z0UqfAsH8dyOEZ
# LIBRARY IMPORTS
"""
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import seaborn as sns
import gradio as gr
import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
"""# COLLEZIONE DATI"""
# Remote CSV holding the loan applications used throughout this script.
url = "https://raw.githubusercontent.com/livio-24/LoanEligibilityPrediction/main/dataset.csv"
# Read the CSV into a pandas DataFrame.
dataset = pd.read_csv(filepath_or_buffer=url)
# Notebook markdown heading, kept verbatim.
"""# EXPLORATORY DATA ANALYSIS"""
# The bare expressions in this section come from the notebook export: when the
# file runs as a plain script their values are computed and then discarded.
# First five rows.
dataset.head(n=5)
# Number of rows and columns.
dataset.shape
# Summary statistics.
dataset.describe()
# Per-column information.
# Original note: 5 numeric variables and 8 categorical variables.
dataset.info()
# Distribution of the target variable.
dataset.Loan_Status.value_counts()
# Number of missing values in each column; they are handled later,
# in the data cleaning section.
dataset.isna().sum()
# Drop the Loan_ID column (original note: it is useless).
dataset.drop(columns=['Loan_ID'], inplace=True)
dataset.head(n=5)
# Notebook markdown heading, kept verbatim
# (English: "Data visualization - univariate analysis / categorical variables").
"""**DATA VISUALIZATION - ANALISI UNIVARIATA**
VARIABILI CATEGORICHE
"""
# Categorical variables: relative frequency of each category as a bar chart,
# one figure per column.
for column in ('Gender', 'Married', 'Self_Employed', 'Credit_History'):
    dataset[column].value_counts(normalize=True).plot.bar(title=column)
    plt.show()
# Notebook markdown cell, kept verbatim. English summary of the original notes:
# about 80% of applicants are male, about 65% are married, about 15% are
# self-employed, about 85% have repaid their debts. Next heading: ordinal variables.
"""Risultati:
- 80% dei candidati nel dataset è maschio
- Circa il 65% dei candidati nel dataset è sposato/a
- Circa il 15% lavora in proprio
- Circa l'85% ha ripagato i propri debiti
VARIABILI ORDINALI
"""
# Ordinal variables: same relative-frequency bar charts.
for column in ('Dependents', 'Education', 'Property_Area'):
    dataset[column].value_counts(normalize=True).plot.bar(title=column)
    plt.show()
# Notebook markdown cell, kept verbatim. English summary of the original notes:
# most applicants have no dependents, about 80% are graduates, most live in a
# semi-urban area. Next heading: numeric variables.
"""Risultati:
- La maggior parte dei candidati non ha familiari dipendenti
- Circa l'80% dei candidati ha una laurea
- La maggior parte dei candidati vive in un'area semiurbana
VARIABILI NUMERICHE
"""
# Numeric variables: distribution (histogram + KDE) followed by a box plot
# used to spot outliers, one pair of figures per column.
for column in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'):
    # sns.distplot is deprecated; histplot with a density-normalised KDE
    # overlay is its replacement and matches the histplot calls used later
    # in this file. Missing values are dropped explicitly because the data
    # cleaning step has not run yet at this point.
    sns.histplot(dataset[column].dropna(), kde=True, stat='density')
    plt.show()
    dataset.boxplot([column])
    plt.show()
# Notebook markdown cell, kept verbatim (English: "Most numeric features have
# outliers" / heading "Correlation matrix").
"""La maggior parte delle features numeriche ha degli outliers
**Matrice di correlazione**
"""
# Pairwise correlation of the numeric columns only. At this point the
# categorical columns still hold strings (they are encoded further down), and
# DataFrame.corr() no longer skips non-numeric columns silently: pandas >= 2.0
# raises instead. Selecting the numeric dtypes first works on every version.
correlation_matrix = dataset.select_dtypes(include='number').corr()
# Heat map of the correlation matrix, annotated with one decimal.
sns.heatmap(correlation_matrix, cbar=True, fmt='.1f', annot=True, cmap='coolwarm')
# Notebook markdown cell, kept verbatim (English: "Few variables are correlated
# with each other; the only pair is ApplicantIncome - LoanAmount").
"""Non ci sono molte variabili correlate tra di loro, le uniche due sono ApplicantIncome - LoanAmount"""
# Integer codes for the categorical variables and for the target.
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
    'Loan_Status': {'N': 0, 'Y': 1},
}
dataset.replace(category_codes, inplace=True)
# The '3+' dependents category is encoded as 4.
# The result is assigned back to the column: calling replace(inplace=True) on
# dataset['Dependents'] modifies an intermediate Series, which does not update
# the DataFrame when pandas Copy-on-Write is active.
dataset['Dependents'] = dataset['Dependents'].replace(to_replace='3+', value=4)
# Notebook markdown heading, kept verbatim
# (English: "Data cleaning / missing values check").
"""# DATA CLEANING
**CONTROLLO VALORI MANCANTI**
"""
# Missing values per column before imputation (notebook leftover: the result
# is discarded when run as a script).
dataset.isnull().sum()
# Categorical variables: fill the gaps with the most frequent value (mode).
# Every fill is assigned back to its column; fillna(inplace=True) on
# dataset[column] acts on an intermediate Series and does not update the
# DataFrame when pandas Copy-on-Write is active.
for column in ('Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History'):
    dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
# LoanAmount: the median is used instead of the mean because, per the original
# analysis, the variable has outliers.
dataset['LoanAmount'] = dataset['LoanAmount'].fillna(dataset['LoanAmount'].median())
dataset['Loan_Amount_Term'].value_counts()
# Loan_Amount_Term: per the original analysis 360 is the value that occurs
# most often, so the mode is used here as well.
dataset['Loan_Amount_Term'] = dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mode()[0])
# Missing values per column after imputation (result discarded in a script).
dataset.isnull().sum()
# Convert the dtype of Dependents to int (via str, since the column mixes
# string digits with the integer code written for '3+').
dataset['Dependents'] = dataset['Dependents'].astype(str).astype(int)
dataset.info()
# Notebook markdown heading, kept verbatim (English: "Outlier handling").
"""**GESTIONE OUTLIERS**"""
# (column, subplot position, colour) for the three skewed monetary features.
# The fourth cell of each 2x2 grid is left empty.
skewed_panels = (
    ("ApplicantIncome", (0, 0), 'green'),
    ("CoapplicantIncome", (0, 1), 'skyblue'),
    ("LoanAmount", (1, 0), 'orange'),
)
# Distributions before the log transform.
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
for feature, position, shade in skewed_panels:
    sns.histplot(data=dataset, x=feature, kde=True, ax=axs[position], color=shade)
# Log transform to bring the distributions closer to normal.
# CoapplicantIncome is shifted by 1 before taking the logarithm.
dataset['ApplicantIncome'] = np.log(dataset['ApplicantIncome'])
dataset['CoapplicantIncome'] = np.log(dataset['CoapplicantIncome'] + 1)
dataset['LoanAmount'] = np.log(dataset['LoanAmount'])
# Distributions after the log transform, drawn on a fresh figure.
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
for feature, position, shade in skewed_panels:
    sns.histplot(data=dataset, x=feature, kde=True, ax=axs[position], color=shade)
# Notebook markdown cell, kept verbatim (English: "The distribution improved
# after applying the logarithm" / heading "Split dataset").
"""Possiamo notare che la distribuzione è migliorata dopo aver applicato il logaritmo
# SPLIT DATASET
"""
# Independent variables (every column except the target) and dependent variable.
x = dataset.drop(columns='Loan_Status')
y = dataset['Loan_Status']
# 80/20 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
for caption, part in (
    ("X_train dataset: ", X_train),
    ("y_train dataset: ", y_train),
    ("X_test dataset: ", X_test),
    ("y_test dataset: ", y_test),
):
    print(caption, part.shape)
y_test.value_counts()
# Distribution of the dependent variable, saved to 'target_distr'.
plt.figure(figsize=(5, 5))
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
dataset['Loan_Status'].value_counts().plot.bar()
plt.xlabel('Loan_Status')
plt.ylabel('Frequency')
dataset['Loan_Status'].value_counts()
plt.savefig('target_distr', bbox_inches='tight')
# Notebook markdown heading, kept verbatim.
"""# DATA SCALING"""
# Min-max normalisation of every feature to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
# The scaler learns the per-feature minimum and maximum from the training set only.
X_train = scaler.fit_transform(X_train)
# The test set is transformed with the statistics learned on the training set.
# Calling fit_transform here would refit the scaler on the test data, leaking
# test information into the evaluation and replacing the fit that `loan`
# relies on when it calls scaler.transform at prediction time.
X_test = scaler.transform(X_test)
# NOTE: z-score standardisation (StandardScaler, imported at the top of the
# file) is the alternative the notebook left commented out.
# Scaled training set as a DataFrame with the original column names
# (the trailing bare `df` is a notebook display leftover).
df = pd.DataFrame(X_train, columns=x.columns)
df
# Notebook markdown heading, kept verbatim.
"""# FEATURE SELECTION"""
# Supervised feature selection: keep the 5 features with the highest chi2 score.
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, chi2, f_classif
fs = SelectKBest(score_func=chi2, k=5)
# Fit the selector on the training data and reduce it in one step.
X_new_train = fs.fit_transform(X_train, y_train)
# Apply the same column selection to the test data.
X_new_test = fs.transform(X_test)
print(X_new_train.shape)
# Names of the columns retained by the selector.
selected_columns = x.columns[fs.get_support(indices=True)]
print("features selezionate: ", selected_columns.tolist())
# Notebook markdown heading, kept verbatim (English: "Model building").
"""# COSTRUZIONE MODELLI"""
# Per-model results; the five lists are filled in parallel, one entry per model.
models = []
precision = []
accuracy = []
recall = []
f1 = []
# confusion_matrix + ConfusionMatrixDisplay replace plot_confusion_matrix,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score


def _fit_and_report(label, estimator):
    """Train *estimator*, show its test results and record its metrics.

    The estimator is fitted on the selected training features, then its
    confusion matrix on the test set is plotted, the classification report
    and the train/test accuracies are printed, and one entry is appended to
    each of the module-level lists ``models``, ``accuracy``, ``recall``,
    ``precision`` and ``f1``.

    Args:
        label: Display name stored in ``models``.
        estimator: Unfitted classifier exposing ``fit``, ``predict`` and,
            once fitted, ``classes_``.

    Returns:
        Tuple ``(train_predictions, test_predictions)``.
    """
    estimator.fit(X_new_train, y_train)
    train_pred = estimator.predict(X_new_train)
    test_pred = estimator.predict(X_new_test)
    # The matrix is built from this estimator's own predictions, so every
    # model gets its own plot.
    fig, ax = plt.subplots(figsize=(8, 8))
    matrix = confusion_matrix(y_test, test_pred, labels=estimator.classes_)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=estimator.classes_).plot(ax=ax)
    plt.show()
    # Results obtained on the test set.
    print(classification_report(y_test, test_pred))
    print("Accuracy on training data:", accuracy_score(y_train, train_pred))
    print("Accuracy on test data:", accuracy_score(y_test, test_pred))
    models.append(label)
    accuracy.append(accuracy_score(y_test, test_pred))
    recall.append(recall_score(y_test, test_pred))
    precision.append(precision_score(y_test, test_pred))
    f1.append(f1_score(y_test, test_pred))
    return train_pred, test_pred


"""**LOGISTIC REGRESSION**"""
from sklearn.linear_model import LogisticRegression
# This is the model the `loan` prediction function uses.
logisticRegr = LogisticRegression()
y_train_pred, y_test_pred = _fit_and_report('Logistic Regression', logisticRegr)
"""**DECISION TREE**"""
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier(random_state=42)
y_train_pred, y_test_pred = _fit_and_report('Decision Tree', tree_model)
"""**NAIVE BAYES**"""
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
y_train_pred, y_test_pred = _fit_and_report('Naive Bayes', NB)
"""**RANDOM FOREST**"""
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier()
y_train_pred, y_test_pred = _fit_and_report('Random Forest', RandomForest)
"""**XGBOOST**"""
from xgboost import XGBClassifier
XGB = XGBClassifier()
y_train_pred, y_test_pred = _fit_and_report('XGBoost', XGB)
# Notebook markdown heading, kept verbatim (English: "Metrics comparison").
"""**CONFRONTO METRICHE**"""
# One row per trained model, one column per metric.
metric_table = {
    'Model': models,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'f1_score': f1,
}
compare = pd.DataFrame(data=metric_table)
# Ranking by accuracy, best first. sort_values returns a new frame that is not
# stored, so `compare` itself keeps the order in which the models were added.
compare.sort_values('Accuracy', ascending=False)
def loan(Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area):
    """Predict loan eligibility from the raw values of the input form.

    The form values go through the same steps as the training data: integer
    encoding of the categorical answers, log transform of the two incomes and
    of the loan amount, scaling with the fitted ``scaler``, column selection
    with the fitted ``fs`` selector, and finally classification with
    ``logisticRegr``.

    Args:
        Gender: 'Male' or 'Female'.
        Married: 'Yes' or 'No'.
        Dependents: '0', '1', '2' or '3+'.
        Education: 'Graduate' or 'Not Graduate'.
        Self_Employed: 'Yes' or 'No'.
        ApplicantIncome: Applicant income, must be greater than zero.
        CoapplicantIncome: Co-applicant income, zero or more.
        LoanAmount: Requested amount, must be greater than zero.
        Loan_Amount_Term: Term of the loan.
        Credit_History: '0: bad credit history' or '1: good credit history'.
        Property_Area: 'Rural', 'Urban' or 'Semiurban'.

    Returns:
        "Loan approved" or "Loan not approved", or a short message asking the
        user to correct the input when a field is missing or out of range.
    """
    form_values = (Gender, Married, Dependents, Education, Self_Employed,
                   ApplicantIncome, CoapplicantIncome, LoanAmount,
                   Loan_Amount_Term, Credit_History, Property_Area)
    if any(value is None for value in form_values):
        return "Please fill in every field"
    # The logarithms below are undefined outside these ranges.
    if ApplicantIncome <= 0 or LoanAmount <= 0 or CoapplicantIncome < 0:
        return ("Applicant income and loan amount must be greater than zero, "
                "and co-applicant income cannot be negative")

    # Same integer codes as the ones used to encode the training data.
    property_codes = {'Rural': 0, 'Urban': 1, 'Semiurban': 2}
    features = {
        'Gender': 0 if Gender == 'Male' else 1,
        'Married': 0 if Married == 'No' else 1,
        'Dependents': 4 if Dependents == '3+' else int(Dependents),
        'Education': 0 if Education == 'Not Graduate' else 1,
        'Self_Employed': 0 if Self_Employed == 'No' else 1,
        # Same log transforms as the ones applied to the training columns.
        'ApplicantIncome': np.log(ApplicantIncome),
        'CoapplicantIncome': np.log(CoapplicantIncome + 1),
        'LoanAmount': np.log(LoanAmount),
        'Loan_Amount_Term': Loan_Amount_Term,
        'Credit_History': 0 if Credit_History == '0: bad credit history' else 1,
        'Property_Area': property_codes.get(Property_Area, 2),
    }
    # Single-row frame with the columns in the order the scaler was fitted on
    # (`x` is the module-level feature DataFrame).
    row = pd.DataFrame([features])[x.columns]
    # The scaler expects the full feature set; the selector then keeps only
    # the columns the classifier was trained on.
    selected = fs.transform(scaler.transform(row))
    prediction = logisticRegr.predict(selected)
    return "Loan approved" if prediction[0] == 1 else "Loan not approved"
# Input widgets, listed in the same order as the parameters of `loan`:
# five categorical answers, four numeric fields, then two more categorical answers.
form_inputs = (
    [
        gr.Radio(['Male', 'Female']),
        gr.Radio(['Yes', 'No']),
        gr.Radio(['0', '1', '2', '3+']),
        gr.Radio(['Graduate', 'Not Graduate']),
        gr.Radio(['Yes', 'No']),
    ]
    + ["number"] * 4
    + [
        gr.Radio(['0: bad credit history', '1: good credit history']),
        gr.Radio(['Urban', 'Semiurban', 'Rural']),
    ]
)
# Web interface that sends the form values to `loan` and shows its text result.
app = gr.Interface(
    fn=loan,
    inputs=form_inputs,
    outputs="text",
    title="Loan Eligibility Prediction",
)
# Start the web server when the module is executed.
app.launch(debug=True)