Liviox24 committed on
Commit
d9ceac2
1 Parent(s): c557eae

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +433 -0
app.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """LoanEligibilityPrediction.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/15wGr9tHgIq7Ua4af83Z0UqfAsH8dyOEZ
8
+
9
+ # IMPORT LIBRERIE
10
+ """
11
+
12
+ # Commented out IPython magic to ensure Python compatibility.
13
+ import numpy as np
14
+ import pandas as pd
15
+ import seaborn as sns
16
+ import gradio as gr
17
+ import matplotlib.pyplot as plt
18
+ # %matplotlib inline
19
+
20
+ from sklearn.model_selection import train_test_split
21
+ from sklearn.preprocessing import MinMaxScaler
22
+ from sklearn.preprocessing import StandardScaler
23
+
# --- DATA COLLECTION ---

# Load the dataset from GitHub into a pandas DataFrame.
url = "https://raw.githubusercontent.com/livio-24/LoanEligibilityPrediction/main/dataset.csv"
dataset = pd.read_csv(url)

# --- EXPLORATORY DATA ANALYSIS ---
# The bare expressions below come from the notebook export: a notebook
# renders their value, a plain script evaluates and discards it.

dataset.head()      # first 5 rows
dataset.shape       # number of rows and columns
dataset.describe()  # summary statistics

# Column overview (printed to stdout).
# Per the original author: 5 numeric and 8 categorical variables.
dataset.info()

# Distribution of the target variable.
dataset['Loan_Status'].value_counts()

# Number of missing values per column; they are handled later, in the
# data-cleaning step.
dataset.isnull().sum()

# Loan_ID is dropped because it is of no use as a feature.
dataset.drop(columns=['Loan_ID'], inplace=True)

dataset.head()
57
+
# --- DATA VISUALIZATION: UNIVARIATE ANALYSIS ---


def _plot_value_shares(column):
    """Show a bar chart with the relative frequency of each value of *column*."""
    dataset[column].value_counts(normalize=True).plot.bar(title=column)
    plt.show()


# Categorical variables, shown as percentages.
for _categorical in ('Gender', 'Married', 'Self_Employed', 'Credit_History'):
    _plot_value_shares(_categorical)

# Findings reported by the author:
# - 80% of the applicants in the dataset are male
# - about 65% of the applicants are married
# - about 15% are self-employed
# - about 85% have repaid their debts

# Ordinal variables, shown as percentages.
for _ordinal in ('Dependents', 'Education', 'Property_Area'):
    _plot_value_shares(_ordinal)
89
+
# Findings reported by the author for the ordinal variables:
# - most applicants have no dependents
# - about 80% of the applicants are graduates
# - most applicants live in a semiurban area

# --- NUMERIC VARIABLES ---


def _plot_distribution_and_outliers(column):
    """Show the distribution of *column*, then a boxplot to spot outliers.

    ``sns.histplot`` replaces the deprecated ``sns.distplot``; the rest of
    this script already relies on ``histplot``. ``stat="density"`` keeps the
    y axis on the density scale that ``distplot`` used.
    """
    sns.histplot(data=dataset, x=column, kde=True, stat="density")
    plt.show()
    dataset.boxplot([column])
    plt.show()


for _numeric in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'):
    _plot_distribution_and_outliers(_numeric)
125
+
# Most of the numeric features contain outliers.

# --- CORRELATION MATRIX ---

# The categorical columns still hold strings at this point (they are encoded
# further down), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0. Restricting to numeric columns gives the same matrix that
# older pandas produced by silently skipping the string columns.
correlation_matrix = dataset.select_dtypes(include='number').corr()

# Heat map of the correlation matrix.
sns.heatmap(correlation_matrix, cbar=True, fmt='.1f', annot=True, cmap='coolwarm')

# Few variables are correlated with each other; the only pair is
# ApplicantIncome - LoanAmount.
138
+
# Encode the categorical variables (and the target) as integers.
dataset.replace(
    {
        'Gender': {'Male': 0, 'Female': 1},
        'Married': {'No': 0, 'Yes': 1},
        'Education': {'Not Graduate': 0, 'Graduate': 1},
        'Self_Employed': {'No': 0, 'Yes': 1},
        'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
        'Loan_Status': {'N': 0, 'Y': 1},
    },
    inplace=True,
)

# Encode '3+' dependents as 4.
# The result is assigned back to the column: an in-place replace on
# dataset['Dependents'] acts on a temporary selection and is not written
# back to the DataFrame under pandas Copy-on-Write.
dataset['Dependents'] = dataset['Dependents'].replace(to_replace='3+', value=4)
145
+
# --- DATA CLEANING ---

# Missing-value check.
dataset.isnull().sum()

# Every fill below assigns the result back to the column. An in-place
# fillna on dataset[col] acts on a temporary selection and is not written
# back to the DataFrame under pandas Copy-on-Write.

# Categorical variables: fill missing values with the mode.
for _column in ('Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History'):
    dataset[_column] = dataset[_column].fillna(dataset[_column].mode()[0])

# LoanAmount has outliers, so the median is used rather than the mean.
dataset['LoanAmount'] = dataset['LoanAmount'].fillna(dataset['LoanAmount'].median())

dataset['Loan_Amount_Term'].value_counts()

# Per the author, 360 is by far the most frequent Loan_Amount_Term value,
# so the mode is used here.
dataset['Loan_Amount_Term'] = dataset['Loan_Amount_Term'].fillna(
    dataset['Loan_Amount_Term'].mode()[0]
)

dataset.isnull().sum()

# Convert Dependents to int. It goes through str first because the column
# mixes strings with the integer 4 that replaced '3+'.
dataset['Dependents'] = dataset['Dependents'].astype(str).astype(int)
dataset.info()
174
+
# --- OUTLIER HANDLING ---

# (column, colour) for each histogram panel, in drawing order.
_SKEWED_PANELS = (
    ("ApplicantIncome", 'green'),
    ("CoapplicantIncome", 'skyblue'),
    ("LoanAmount", 'orange'),
)


def _draw_skewed_histograms():
    """Draw the three histograms on a new 2x2 figure (last panel left empty)."""
    _, grid = plt.subplots(2, 2, figsize=(10, 8))
    # grid.flat walks the axes row by row: [0, 0], [0, 1], [1, 0], [1, 1].
    for axis, (column, colour) in zip(grid.flat, _SKEWED_PANELS):
        sns.histplot(data=dataset, x=column, kde=True, ax=axis, color=colour)


# Distributions before the log transform.
_draw_skewed_histograms()

# Log transform to normalise the distributions. CoapplicantIncome is shifted
# by 1 before taking the logarithm.
dataset['ApplicantIncome'] = np.log(dataset['ApplicantIncome'])
dataset['CoapplicantIncome'] = np.log(dataset['CoapplicantIncome'] + 1)
dataset['LoanAmount'] = np.log(dataset['LoanAmount'])

# Distributions after the log transform.
_draw_skewed_histograms()
196
+
# The distributions improved after applying the logarithm.

# --- DATASET SPLIT ---

# Independent variables (features) and dependent variable (target).
x = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']

# 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train dataset: ", X_train.shape)
print("y_train dataset: ", y_train.shape)
print("X_test dataset: ", X_test.shape)
print("y_test dataset: ", y_test.shape)

y_test.value_counts()

# Distribution of the dependent variable, saved to 'target_distr'.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure(figsize=(5, 5))
dataset['Loan_Status'].value_counts().plot.bar()
plt.xlabel('Loan_Status')
plt.ylabel('Frequency')
plt.savefig('target_distr', bbox_inches='tight')
225
+
# --- DATA SCALING ---

# Min-max normalisation to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
# The test set is only transformed, never fitted. Refitting on X_test would
# scale train and test with different min/max values, and would leave
# `scaler` (reused by the prediction function) holding test-set statistics
# instead of the ones the models were trained with.
X_test = scaler.transform(X_test)

# Scaled training set as a DataFrame, for inspection.
df = pd.DataFrame(X_train, columns=x.columns)

df
241
+
# --- FEATURE SELECTION ---

# Supervised feature selection: keep the 5 features with the best chi2 score.

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from numpy import set_printoptions

fs = SelectKBest(score_func=chi2, k=5)

# Fit on the training data only, then apply the same column mask to both sets.
X_new_train = fs.fit_transform(X_train, y_train)
X_new_test = fs.transform(X_test)
print(X_new_train.shape)

_selected_names = x.columns[fs.get_support(indices=True)].tolist()
print("features selezionate: ", _selected_names)
259
+
# --- MODEL BUILDING ---

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Per-model test metrics, appended in the order the models are trained.
models = []
precision = []
accuracy = []
recall = []
f1 = []


def _fit_and_report(name, model):
    """Train *model* on the selected features, report and record its metrics.

    Fits on ``X_new_train``/``y_train``, shows the test-set confusion matrix,
    prints the classification report and the train/test accuracy, and appends
    the test metrics to the module-level comparison lists.

    Args:
        name: Label stored in ``models`` for the comparison table.
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``classes_``.

    Returns:
        The fitted estimator.
    """
    model.fit(X_new_train, y_train)
    train_pred = model.predict(X_new_train)
    test_pred = model.predict(X_new_test)

    # The matrix is built from this model's own predictions.
    # ConfusionMatrixDisplay is used because plot_confusion_matrix was
    # removed from scikit-learn in 1.2.
    _, axis = plt.subplots(figsize=(8, 8))
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, test_pred, labels=model.classes_),
        display_labels=model.classes_,
    ).plot(ax=axis)
    plt.show()

    print(classification_report(y_test, test_pred))
    print("Accuracy on training data:", accuracy_score(y_train, train_pred))
    print("Accuracy on test data:", accuracy_score(y_test, test_pred))

    models.append(name)
    accuracy.append(accuracy_score(y_test, test_pred))
    recall.append(recall_score(y_test, test_pred))
    precision.append(precision_score(y_test, test_pred))
    f1.append(f1_score(y_test, test_pred))
    return model


# The variable names are kept: logisticRegr is used by the prediction
# function further down.
logisticRegr = _fit_and_report('Logistic Regression', LogisticRegression())
tree_model = _fit_and_report('Decision Tree', DecisionTreeClassifier(random_state=42))
NB = _fit_and_report('Naive Bayes', GaussianNB())
RandomForest = _fit_and_report('Random Forest', RandomForestClassifier())
XGB = _fit_and_report('XGBoost', XGBClassifier())

# --- METRICS COMPARISON ---

compare = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'f1_score': f1,
})
compare.sort_values(by='Accuracy', ascending=False)
400
+
def loan(Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area):
    """Predict loan eligibility for one applicant from the form values.

    The inputs go through the same steps as the training data: categorical
    encoding, log transform of the income and loan-amount fields, scaling
    with the fitted ``scaler``, feature selection with the fitted ``fs``,
    and finally classification by ``logisticRegr``.

    Args:
        Gender: 'Male' or 'Female'.
        Married: 'Yes' or 'No'.
        Dependents: '0', '1', '2' or '3+'.
        Education: 'Graduate' or 'Not Graduate'.
        Self_Employed: 'Yes' or 'No'.
        ApplicantIncome: Applicant income; must be > 0 (it is log-transformed).
        CoapplicantIncome: Co-applicant income; must be >= 0.
        LoanAmount: Requested amount; must be > 0 (it is log-transformed).
        Loan_Amount_Term: Loan term, passed through unchanged.
        Credit_History: '0: bad credit history' or '1: good credit history'.
        Property_Area: 'Rural', 'Urban' or 'Semiurban'.

    Returns:
        "Loan approved" or "Loan not approved".

    Raises:
        ValueError: If a field is missing or a numeric field is outside the
            range in which the log transform is defined.
    """
    supplied = (Gender, Married, Dependents, Education, Self_Employed,
                ApplicantIncome, CoapplicantIncome, LoanAmount,
                Loan_Amount_Term, Credit_History, Property_Area)
    if any(value is None for value in supplied):
        raise ValueError("All fields are required")
    if ApplicantIncome <= 0 or LoanAmount <= 0 or CoapplicantIncome < 0:
        raise ValueError(
            "ApplicantIncome and LoanAmount must be positive and "
            "CoapplicantIncome must not be negative"
        )

    # Same encodings and log transforms that were applied to the training set.
    features = {
        'Gender': 0 if Gender == 'Male' else 1,
        'Married': 0 if Married == 'No' else 1,
        'Dependents': 4 if Dependents == '3+' else int(Dependents),
        'Education': 0 if Education == 'Not Graduate' else 1,
        'Self_Employed': 0 if Self_Employed == 'No' else 1,
        'ApplicantIncome': np.log(ApplicantIncome),
        'CoapplicantIncome': np.log(CoapplicantIncome + 1),
        'LoanAmount': np.log(LoanAmount),
        'Loan_Amount_Term': Loan_Amount_Term,
        'Credit_History': 0 if Credit_History == '0: bad credit history' else 1,
        'Property_Area': {'Rural': 0, 'Urban': 1}.get(Property_Area, 2),
    }

    # One-row frame in the training column order. The scaler was fitted on
    # every feature column, so it needs the full row, not only the selected
    # features. Indexing by the training columns raises KeyError if a
    # training column has no counterpart here.
    row = pd.DataFrame([features])[list(x.columns)]

    scaled = scaler.transform(row)
    # Apply the fitted selector so the model gets exactly the columns it
    # was trained on.
    selected = fs.transform(scaled)
    prediction = logisticRegr.predict(selected)

    return "Loan approved" if prediction[0] == 1 else "Loan not approved"
418
+
# Gradio form. The components are listed in the same order as the
# parameters of loan(), which receives their values positionally.
_loan_inputs = [
    gr.Radio(['Male', 'Female']),        # Gender
    gr.Radio(['Yes', 'No']),             # Married
    gr.Radio(['0', '1', '2', '3+']),     # Dependents
    gr.Radio(['Graduate', 'Not Graduate']),  # Education
    gr.Radio(['Yes', 'No']),             # Self_Employed
    "number",                            # ApplicantIncome
    "number",                            # CoapplicantIncome
    "number",                            # LoanAmount
    "number",                            # Loan_Amount_Term
    gr.Radio(['0: bad credit history', '1: good credit history']),  # Credit_History
    gr.Radio(['Urban', 'Semiurban', 'Rural']),  # Property_Area
]

app = gr.Interface(
    fn=loan,
    inputs=_loan_inputs,
    outputs="text",
    title="Loan Eligibility Prediction",
)
app.launch(debug=True)