Danjari committed on
Commit
d840b87
•
1 Parent(s): b05d3db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +986 -0
app.py CHANGED
@@ -0,0 +1,986 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit_shadcn_ui as ui
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import plotly.graph_objects as go
7
+ import graphviz
8
+ import streamlit.components.v1 as components
9
+ from streamlit_option_menu import option_menu
10
+ from sklearn.preprocessing import StandardScaler
11
+ from PIL import Image
12
+ from sklearn.neighbors import KNeighborsClassifier
13
+ from sklearn.linear_model import LinearRegression,LogisticRegression
14
+ from sklearn.metrics import f1_score, r2_score,accuracy_score, precision_score,recall_score
15
+ st.set_option('deprecation.showPyplotGlobalUse', False)
16
+ from sklearn.model_selection import train_test_split, GridSearchCV
17
+ from sklearn.tree import DecisionTreeClassifier,export_graphviz # Import Decision Tree Classifier
18
+ import mlflow
19
+ from sklearn import metrics
20
+ from codecarbon import EmissionsTracker
21
+ #%matplotlib inline
22
+
23
+ # Initialize the emissions tracker
24
+ tracker = EmissionsTracker()
25
+ tracker.start()
26
+ #st.set_page_config(layout='wide')
27
+ st.set_option('deprecation.showPyplotGlobalUse', False)
28
+ #####################################################################
29
+ # Load and cleaning the dataset
30
+ df = pd.read_csv("Students.csv")
31
+ df_VIZ= pd.read_csv("Student_modified.csv")
32
+
33
+ img_importance = Image.open('feature_importance.png')
34
+ img_importance_subset = Image.open('feature_subset.png')
35
+ img_contribution_subset = Image.open('contribution subset.png')
36
+ #Renaming the column 'Nacionality' to 'Nationality' and 'Output' to 'Student Status'
37
+
38
+ df.rename(columns = {'Nacionality':'Nationality', 'Output': 'Student Status'}, inplace = True)
39
+
40
+
41
+ # function that takes categorical values and turns them into corresponding strings.
42
def cat_to_string(df, column_name, mapping_dict):
    """Return a copy of ``df`` with the codes in one column replaced by labels.

    Parameters:
    - df: DataFrame holding the column to translate; it is not modified.
    - column_name: Name of the column whose values are looked up.
    - mapping_dict: Dictionary mapping each code to its label.

    Returns:
    - A new DataFrame in which every value of ``column_name`` that is a key of
      ``mapping_dict`` is replaced by the mapped label; all other values are
      kept as they are.
    """
    df_string = df.copy()
    # dict.get falls back to the value itself, so unmapped codes pass through unchanged
    df_string[column_name] = df_string[column_name].map(lambda x: mapping_dict.get(x, x))
    return df_string
47
+
48
+ # Dictionary to map the values in the 'Marital status' column to strings
49
+ marital_status_mapping = {
50
+ 1: "single",
51
+ 2: "married",
52
+ 3: "widower",
53
+ 4: "divorced",
54
+ 5: "facto union",
55
+ 6: "legally separated"
56
+ }
57
+
58
+ # Dictionary to map the values in the 'Application mode' column to strings
59
+ application_mode_mapping = {
60
+ 1: "1st phase - general contingent",
61
+ 2: "Ordinance No. 612/93",
62
+ 5: "1st phase - special contingent (Azores Island)",
63
+ 7: "Holders of other higher courses",
64
+ 10: "Ordinance No. 854-B/99",
65
+ 15: "International student (bachelor)",
66
+ 16: "1st phase - special contingent (Madeira Island)",
67
+ 17: "2nd phase - general contingent",
68
+ 18: "3rd phase - general contingent",
69
+ 26: "Ordinance No. 533-A/99, item b2 (Different Plan)",
70
+ 27: "Ordinance No. 533-A/99, item b3 (Other Institution)",
71
+ 39: "Over 23 years old",
72
+ 42: "Transfer",
73
+ 43: "Change of course",
74
+ 44: "Technological specialization diploma holders",
75
+ 51: "Change of institution/course",
76
+ 53: "Short cycle diploma holders",
77
+ 57: "Change of institution/course (International)"
78
+ }
79
+
80
+ # Dictionary to map the values in the 'Application order' column to strings
81
+
82
+ application_order_mapping = {
83
+ 0: "first choice",
84
+ 1: "second choice",
85
+ 2: "third choice",
86
+ 3: "fourth choice",
87
+ 4: "fifth choice",
88
+ 5: "sixth choice",
89
+ 6: "seventh choice",
90
+ 7: "eighth choice",
91
+ 8: "ninth choice",
92
+ 9: "last choice"
93
+ }
94
+
95
+
96
+ # course mapping
97
+
98
+ course_mapping = {
99
+ 33: "Biofuel Production Technologies",
100
+ 171: "Animation and Multimedia Design",
101
+ 8014: "Social Service (evening attendance)",
102
+ 9003: "Agronomy",
103
+ 9070: "Communication Design",
104
+ 9085: "Veterinary Nursing",
105
+ 9119: "Informatics Engineering",
106
+ 9130: "Equinculture",
107
+ 9147: "Management",
108
+ 9238: "Social Service",
109
+ 9254: "Tourism",
110
+ 9500: "Nursing",
111
+ 9556: "Oral Hygiene",
112
+ 9670: "Advertising and Marketing Management",
113
+ 9773: "Journalism and Communication",
114
+ 9853: "Basic Education",
115
+ 9991: "Management (evening attendance)"
116
+ }
117
+
118
+
119
+ # previous qualifications
120
+
121
+ previous_qualification_mapping = {
122
+ 1: "Secondary education",
123
+ 2: "Higher education - bachelor's degree",
124
+ 3: "Higher education - degree",
125
+ 4: "Higher education - master's",
126
+ 5: "Higher education - doctorate",
127
+ 6: "Frequency of higher education",
128
+ 9: "12th year of schooling - not completed",
129
+ 10: "11th year of schooling - not completed",
130
+ 12: "Other - 11th year of schooling",
131
+ 14: "10th year of schooling",
132
+ 15: "10th year of schooling - not completed",
133
+ 19: "Basic education 3rd cycle (9th/10th/11th year) or equivalent",
134
+ 38: "Basic education 2nd cycle (6th/7th/8th year) or equivalent",
135
+ 39: "Technological specialization course",
136
+ 40: "Higher education - degree (1st cycle)",
137
+ 42: "Professional higher technical course",
138
+ 43: "Higher education - master (2nd cycle)"
139
+ }
140
+
141
+ nationality_mapping = {
142
+ 1: "Portuguese",
143
+ 2: "German",
144
+ 6: "Spanish",
145
+ 11: "Italian",
146
+ 13: "Dutch",
147
+ 14: "English",
148
+ 17: "Lithuanian",
149
+ 21: "Angolan",
150
+ 22: "Cape Verdean",
151
+ 24: "Guinean",
152
+ 25: "Mozambican",
153
+ 26: "Santomean",
154
+ 32: "Turkish",
155
+ 41: "Brazilian",
156
+ 62: "Romanian",
157
+ 100: "Moldova (Republic of)",
158
+ 101: "Mexican",
159
+ 103: "Ukrainian",
160
+ 105: "Russian",
161
+ 108: "Cuban",
162
+ 109: "Colombian"
163
+ }
164
+
165
+ mothers_qualification_mapping = {
166
+ 1: "Secondary Education - 12th Year of Schooling or Equivalent",
167
+ 2: "Higher Education - Bachelor's Degree",
168
+ 3: "Higher Education - Degree",
169
+ 4: "Higher Education - Master's",
170
+ 5: "Higher Education - Doctorate",
171
+ 6: "Frequency of Higher Education",
172
+ 9: "12th Year of Schooling - Not Completed",
173
+ 10: "11th Year of Schooling - Not Completed",
174
+ 11: "7th Year (Old)",
175
+ 12: "Other - 11th Year of Schooling",
176
+ 14: "10th Year of Schooling",
177
+ 18: "General commerce course",
178
+ 19: "Basic Education 3rd Cycle (9th/10th/11th Year) or Equivalent",
179
+ 22: "Technical-professional course",
180
+ 26: "7th year of schooling",
181
+ 27: "2nd cycle of the general high school course",
182
+ 29: "9th Year of Schooling - Not Completed",
183
+ 30: "8th year of schooling",
184
+ 34: "Unknown",
185
+ 35: "Can't read or write",
186
+ 36: "Can read without having a 4th year of schooling",
187
+ 37: "Basic education 1st cycle (4th/5th year) or equivalent",
188
+ 38: "Basic Education 2nd Cycle (6th/7th/8th Year) or Equivalent",
189
+ 39: "Technological specialization course",
190
+ 40: "Higher education - degree (1st cycle)",
191
+ 41: "Specialized higher studies course",
192
+ 42: "Professional higher technical course",
193
+ 43: "Higher Education - Master (2nd cycle)",
194
+ 44: "Higher Education - Doctorate (3rd cycle)"
195
+ }
196
+
197
+ fathers_qualification_mapping = {
198
+ 1: "Secondary Education - 12th Year of Schooling or Equivalent",
199
+ 2: "Higher Education - Bachelor's Degree",
200
+ 3: "Higher Education - Degree",
201
+ 4: "Higher Education - Master's",
202
+ 5: "Higher Education - Doctorate",
203
+ 6: "Frequency of Higher Education",
204
+ 9: "12th Year of Schooling - Not Completed",
205
+ 10: "11th Year of Schooling - Not Completed",
206
+ 11: "7th Year (Old)",
207
+ 12: "Other - 11th Year of Schooling",
208
+ 13: "2nd year complementary high school course",
209
+ 14: "10th Year of Schooling",
210
+ 18: "General commerce course",
211
+ 19: "Basic Education 3rd Cycle (9th/10th/11th Year) or Equivalent",
212
+ 20: "Complementary High School Course",
213
+ 22: "Technical-professional course",
214
+ 25: "Complementary High School Course - not concluded",
215
+ 26: "7th year of schooling",
216
+ 27: "2nd cycle of the general high school course",
217
+ 29: "9th Year of Schooling - Not Completed",
218
+ 30: "8th year of schooling",
219
+ 31: "General Course of Administration and Commerce",
220
+ 33: "Supplementary Accounting and Administration",
221
+ 34: "Unknown",
222
+ 35: "Can't read or write",
223
+ 36: "Can read without having a 4th year of schooling",
224
+ 37: "Basic education 1st cycle (4th/5th year) or equivalent",
225
+ 38: "Basic Education 2nd Cycle (6th/7th/8th Year) or Equivalent",
226
+ 39: "Technological specialization course",
227
+ 40: "Higher education - degree (1st cycle)",
228
+ 41: "Specialized higher studies course",
229
+ 42: "Professional higher technical course",
230
+ 43: "Higher Education - Master (2nd cycle)",
231
+ 44: "Higher Education - Doctorate (3rd cycle)"
232
+ }
233
+
234
+ mothers_occupation_mapping = {
235
+ 0: "Student",
236
+ 1: "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
237
+ 2: "Specialists in Intellectual and Scientific Activities",
238
+ 3: "Intermediate Level Technicians and Professions",
239
+ 4: "Administrative staff",
240
+ 5: "Personal Services, Security and Safety Workers and Sellers",
241
+ 6: "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
242
+ 7: "Skilled Workers in Industry, Construction and Craftsmen",
243
+ 8: "Installation and Machine Operators and Assembly Workers",
244
+ 9: "Unskilled Workers",
245
+ 10: "Armed Forces Professions",
246
+ 90: "Other Situation",
247
+ 99: "Not Available",
248
+ 122: "Health professionals",
249
+ 123: "Teachers",
250
+ 125: "Specialists in Information and Communication Technologies (ICT)",
251
+ 131: "Intermediate level science and engineering technicians and professions",
252
+ 132: "Technicians and professionals, of intermediate level of health",
253
+ 134: "Intermediate level technicians from legal, social, sports, cultural and similar services",
254
+ 141: "Office workers, secretaries in general and data processing operators",
255
+ 143: "Data, accounting, statistical, financial services and registry-related operators",
256
+ 144: "Other administrative support staff",
257
+ 151: "Personal service workers",
258
+ 152: "Sellers",
259
+ 153: "Personal care workers and the like",
260
+ 171: "Skilled construction workers and the like, except electricians",
261
+ 173: "Skilled workers in printing, precision instrument manufacturing, jewelers, artisans and the like",
262
+ 175: "Workers in food processing, woodworking, clothing and other industries and crafts",
263
+ 191: "Cleaning workers",
264
+ 192: "Unskilled workers in agriculture, animal production, fisheries and forestry",
265
+ 193: "Unskilled workers in extractive industry, construction, manufacturing and transport",
266
+ 194: "Meal preparation assistants"
267
+ }
268
+
269
+ fathers_occupation_mapping = {
270
+ 0: "Student",
271
+ 1: "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
272
+ 2: "Specialists in Intellectual and Scientific Activities",
273
+ 3: "Intermediate Level Technicians and Professions",
274
+ 4: "Administrative staff",
275
+ 5: "Personal Services, Security and Safety Workers and Sellers",
276
+ 6: "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
277
+ 7: "Skilled Workers in Industry, Construction and Craftsmen",
278
+ 8: "Installation and Machine Operators and Assembly Workers",
279
+ 9: "Unskilled Workers",
280
+ 10: "Armed Forces Professions",
281
+ 90: "Other Situation",
282
+ 99: "Not Available",
283
+ 101: "Armed Forces Officers",
284
+ 102: "Armed Forces Sergeants",
285
+ 103: "Other Armed Forces personnel",
286
+ 112: "Directors of administrative and commercial services",
287
+ 114: "Hotel, catering, trade and other services directors",
288
+ 121: "Specialists in the physical sciences, mathematics, engineering and related techniques",
289
+ 122: "Health professionals",
290
+ 123: "Teachers",
291
+ 124: "Specialists in finance, accounting, administrative organization, public and commercial relations",
292
+ 131: "Intermediate level science and engineering technicians and professions",
293
+ 132: "Technicians and professionals, of intermediate level of health",
294
+ 134: "Intermediate level technicians from legal, social, sports, cultural and similar services",
295
+ 135: "Information and communication technology technicians",
296
+ 141: "Office workers, secretaries in general and data processing operators",
297
+ 143: "Data, accounting, statistical, financial services and registry-related operators",
298
+ 144: "Other administrative support staff",
299
+ 151: "Personal service workers",
300
+ 152: "Sellers",
301
+ 153: "Personal care workers and the like",
302
+ 154: "Protection and security services personnel",
303
+ 161: "Market-oriented farmers and skilled agricultural and animal production workers",
304
+ 163: "Farmers, livestock keepers, fishermen, hunters and gatherers, subsistence",
305
+ 171: "Skilled construction workers and the like, except electricians",
306
+ 172: "Skilled workers in metallurgy, metalworking and similar",
307
+ 174: "Skilled workers in electricity and electronics",
308
+ 175: "Workers in food processing, woodworking, clothing and other industries and crafts",
309
+ 181: "Fixed plant and machine operators",
310
+ 182: "Assembly workers",
311
+ 183: "Vehicle drivers and mobile equipment operators",
312
+ 192: "Unskilled workers in agriculture, animal production, fisheries and forestry",
313
+ 193: "Unskilled workers in extractive industry, construction, manufacturing and transport",
314
+ 194: "Meal preparation assistants",
315
+ 195: "Street vendors (except food) and street service providers"
316
+ }
317
+ gender_mapping= {
318
+ 0: "Female",
319
+ 1: "Male"
320
+ }
321
+ international_mapping= {
322
+ 0: "Not International",
323
+ 1: "International"
324
+ }
325
+
326
+ # Define a dictionary that relates column names to their respective mappings
327
+ mappings = {
328
+ "Marital status": marital_status_mapping,
329
+ "Application mode": application_mode_mapping,
330
+ "Application order": application_order_mapping,
331
+ "Course": course_mapping,
332
+ "Previous qualification": previous_qualification_mapping,
333
+ "Nacionality": nationality_mapping,
334
+ "Mother's qualification": mothers_qualification_mapping,
335
+ "Father's qualification": fathers_qualification_mapping,
336
+ "Mother's occupation": mothers_occupation_mapping,
337
+ "Father's occupation": fathers_occupation_mapping,
338
+ "Gender": gender_mapping,
339
+ "International": international_mapping,
340
+
341
+
342
+ }
343
+
344
+
345
+
346
# Apply the mappings one after another so that every categorical column ends up
# translated. Each pass must start from the previous result: restarting from
# df_VIZ on every iteration would discard the earlier mappings and leave only
# the last column in `mappings` translated.
df_string = df_VIZ
for column_name, mapping_dict in mappings.items():
    # cat_to_string returns a new DataFrame, so df_VIZ itself is never modified
    df_string = cat_to_string(df_string, column_name, mapping_dict)
350
+
351
+ # Transforming 'Student Status' values into numerical format, making them interpretable by machine learning algorithms
352
+ df['Student Status'] = df['Student Status'].map({'Dropout' : 0, 'Enrolled': 1, 'Graduate': 2})
353
+
354
+ # Removing unnecessary columns that won't contribute to the analysis. dropping values with a corr [-0.05,0.05]
355
+ df = df.drop(columns=['Nationality', 'International', 'Educational special needs', 'Course',
356
+ 'Mother\'s qualification','Father\'s qualification',
357
+ 'Mother\'s occupation', 'Father\'s occupation',
358
+ 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (evaluations)',
359
+ 'Unemployment rate', 'Inflation rate', 'GDP'], axis=1)
360
+ # Creating interaction features for academic performance
361
+ df['Yearly Credit Approved'] = df['Curricular units 1st sem (approved)'] * df['Curricular units 2nd sem (approved)']
362
+ df['Yearly Grade'] = df['Curricular units 1st sem (grade)'] * df['Curricular units 2nd sem (grade)']
363
+
364
+ # Creating aggregated features
365
+ df['Total Credit approved'] = df['Curricular units 1st sem (approved)'] + df['Curricular units 2nd sem (approved)']
366
+ df['Total Grade'] = (df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)']) / 2
367
+
368
+ # Dropping the original features to reduce multi-collinearity
369
+ columns_to_drop = ['Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)',
370
+ 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)',]
371
+ df.drop(columns_to_drop, axis=1, inplace=True)
372
+
373
+ #####################################################################
374
+ # TRAINING AND EVALUATION OF THE MODEL
375
+
376
+ y = df['Student Status']
377
+ X = df.drop(['Student Status'], axis = 1)
378
+
379
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
380
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, **kwargs):
    """
    Train a machine learning model and evaluate its performance.

    Parameters:
    - model: The machine learning model to train (e.g., DecisionTreeClassifier()).
    - X_train: Training data features.
    - X_test: Testing data features; its index is reused for the predictions.
    - y_train: Training data labels.
    - y_test: Testing data labels.
    - **kwargs: Additional keyword arguments to pass to the model's fit method.

    Returns:
    - model: The trained machine learning model.
    - y_pred: The predictions for X_test as a pandas Series indexed like X_test.
    - f1_score: The micro-averaged F1 score of the model on the test data.
    - accuracy: The accuracy of the model on the test data.
    - precision: The micro-averaged precision of the model on the test data.
    """
    # Train the model
    model.fit(X_train, y_train, **kwargs)

    # Make predictions
    predictions = model.predict(X_test)

    # NOTE(review): scikit-learn documents that with average='micro' on a
    # single-label multiclass target, precision, recall and F1 all equal
    # accuracy, so the three values below will coincide. Consider 'macro' or
    # 'weighted' if distinct metrics are wanted.
    # A local name that does not shadow the imported f1_score function is used.
    f1 = metrics.f1_score(y_test, predictions, average='micro')
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='micro')

    # Align the predictions with the test rows so callers can join them back
    y_pred = pd.Series(predictions, index=X_test.index)

    return model, y_pred, f1, accuracy, precision
417
+
418
+
419
+
420
+ #####################################################################
421
+
422
+ # Now both experiments are logged to MLflow
423
+
424
+ #############################
425
+
426
+ ### The st.title() function sets the title of the Streamlit application
427
+ st.title("Student Dropout Rate In Portugal")
428
+
429
+
430
+ ### menu bar
431
+
432
+ selected = option_menu(
433
+ menu_title = None,
434
+ options = ["Overview","Visualisation","Prediction","Conclusion"],
435
+ icons = ["menu-up", "pie-chart-fill", "graph-up-arrow","recycle"],
436
+ default_index = 0,
437
+ orientation = "horizontal",
438
+
439
+ )
440
+
441
+
442
+ # update the metrics based on the model
443
def _percent_change(current, previous):
    """Return the relative change from ``previous`` to ``current`` in percent (3 decimals)."""
    return round((current - previous) / previous * 100, 3)


def update_metrics(model_type, f1_score, accuracy, precision):
    """Render three metric cards and remember the values for the next run.

    Parameters:
    - model_type: Label of the model, shown in the title of the first card.
    - f1_score: F1 score as a fraction (e.g. 0.76).
    - accuracy: Accuracy as a fraction.
    - precision: Precision as a fraction.

    The previous values are kept in ``st.session_state`` (``previous_f1``,
    ``previous_accuracy``, ``previous_precision``). Each card shows the relative
    change against them; the change is reported as 0 on the first run or when
    any stored value is 0 (which would otherwise divide by zero).
    """
    cols = st.columns(3)
    # Check if 'first_run' exists in the session state, if not, initialize it
    if 'first_run' not in st.session_state:
        st.session_state.first_run = True
        st.session_state.previous_f1 = 0
        st.session_state.previous_accuracy = 0
        st.session_state.previous_precision = 0

    previous_f1 = st.session_state.previous_f1
    previous_accuracy = st.session_state.previous_accuracy
    previous_precision = st.session_state.previous_precision

    # A comparison only makes sense after the first run and with non-zero baselines
    has_baseline = (not st.session_state.first_run
                    and previous_f1 != 0
                    and previous_accuracy != 0
                    and previous_precision != 0)
    st.session_state.first_run = False

    if has_baseline:
        f1_change = _percent_change(f1_score, previous_f1)
        accuracy_change = _percent_change(accuracy, previous_accuracy)
        precision_change = _percent_change(precision, previous_precision)
    else:
        f1_change = accuracy_change = precision_change = 0

    # Update the previous metrics
    st.session_state.previous_f1 = f1_score
    st.session_state.previous_accuracy = accuracy
    st.session_state.previous_precision = precision

    # Format via the format spec instead of round(x, 3) * 100: multiplying after
    # rounding reintroduces float noise (e.g. 76.60000000000001). All three cards
    # use the same precision.
    with cols[0]:
        ui.metric_card(title=f"{model_type}' f1-Score",
                       content=f"{f1_score * 100:.1f}%",
                       description=f"{f1_change}% from last run",
                       key="card1")
    with cols[1]:
        ui.metric_card(title="Accuracy",
                       content=f"{accuracy * 100:.1f}%",
                       description=f"{accuracy_change}% from last run",
                       key="card2")
    with cols[2]:
        ui.metric_card(title="Precision",
                       content=f"{precision * 100:.1f}%",
                       description=f"{precision_change}% from last run",
                       key="card3")
484
+
485
# "Overview" page: static description of the dataset and the project goal,
# followed by a centred link to the Looker Studio report.
if selected == "Overview":
    st.title("Overview")
    st.markdown("""
    ### 🧐Dataset Overview

    Our dataset provides an overview of student demographics, educational paths, and outcomes within the Portuguese education system. It includes a variety of attributes including:
    - **Personal Information:** Age, gender, marital status.
    - **Academic Details:** Course enrollment, previous qualifications, and academic performance across semesters.
    - **Socio-economic Factors:** Parents' occupation and educational levels, scholarship status, and tuition payment statuses.

    Additionally, the dataset integrates broader economic indicators, such as the unemployment rate, inflation rate, and GDP, which may influence student success. However, we will mostly discard these indicators for this study.

    ### 🎯Project Goal

    The goal of the project is to analyze the factors that contribute to educational outcomes such as graduation, retention, and dropout rates among Portuguese students. We aim to identify patterns and correlations that can inform educational policies and intervention strategies to enhance student achievement and retention.
    """)

    looker_link = "https://lookerstudio.google.com/reporting/6141ce7c-954d-4801-bad7-b58131aa563d/page/J1lxD"
    # Three equal columns; the outer two only hold empty text so the button
    # lands in the middle one.
    column1, column2, column3 = st.columns([1, 1, 1])
    with column1:
        st.write("")
    with column2:
        # The label previously contained a mis-decoded emoji; restored to the intended 👉🏻
        ui.link_button(text="👉🏻 Go To Looker Studio", url=looker_link, key="link_btn")
    with column3:
        st.write("")
510
+ if selected == "Visualisation":
511
+
512
+ tab1, tab2, tab3,tab4 = st.tabs(["Barcharts", "Stacked", "Sankey","Explainable AI"])
513
+
514
+ with tab1:
515
+ st.subheader("Percentage of Output by Gender")
516
+ # Group the output based on the gender and count how many there is in each category
517
+ # create an extra column in our new dataframe called counts
518
+ df_counts = df_VIZ.groupby(['Gender', 'Output']).size().reset_index(name='Count')
519
+
520
+ # find the total number of students for each gender
521
+ total_counts = df_counts.groupby('Gender')['Count'].transform('sum')
522
+
523
+ # Calculate percentage
524
+ df_counts['Percentage'] = 100 * df_counts['Count'] / total_counts
525
+
526
+ # Plot configuration
527
+ plt.figure(figsize=(12, 8))
528
+ plt.title('Percentage of Output by Gender')
529
+
530
+ # Using a bar plot to show the percentages of 'Output' values for each 'Gender'
531
+ sns.barplot(data=df_counts, x='Gender', y='Percentage', hue='Output', palette='pastel', dodge=True)
532
+
533
+ # Adjust legend
534
+ plt.legend(title='Output')
535
+
536
+ # Show plot
537
+ st.pyplot()
538
+
539
+ paragraphs = [
540
+ "Graduation Rate:",
541
+ "A smaller proportion of female students graduate compared to their male counterparts,as indicated by the green bars. Females show approximately a 60% graduation rate, while males reach almost 40%.",
542
+ "Dropout Rate:",
543
+ "The dropout rate for female students is significantly lower than for males, with about 20% of females dropping out.",
544
+ "The dropout rate for males is lower than their graduation rate but still substantial, roughly around 30%."]
545
+ for paragraph in paragraphs:
546
+ st.write(paragraph)
547
+ # Filter the DataFrame to include rows with specified marital status values
548
+ filtered_df = df_VIZ[df_VIZ['Marital status'].isin(['divorced', 'married', 'single'])]
549
+
550
+ df_counts_marital_status = filtered_df.groupby(['Marital status', 'Output']).size().reset_index(name='count')
551
+
552
+ # Plot configuration
553
+ plt.figure(figsize=(12, 8))
554
+ plt.title('Count of Output by Marital Status')
555
+
556
+ # Using barplot to show the counts of 'Output' values for each 'Marital status'
557
+ sns.barplot(data=df_counts_marital_status, x='Marital status', y='count', hue='Output', palette='pastel')
558
+
559
+ # Adjust legend
560
+ plt.legend(title='Output')
561
+
562
+ # Show plot
563
+ st.pyplot()
564
+ with tab2:
565
+ st.subheader("Impact of Mother's Occupation on Student Outcomes")
566
+
567
+ # Filter rows where "Mother's occupation" is not numeric
568
+ filtered_df = df_VIZ[~df_VIZ["Mother's occupation"].astype(str).str.isnumeric()]
569
+
570
+ # Group the filtered data by "Mother's occupation" and "Output"
571
+ grouped_data = filtered_df.groupby(["Mother's occupation", 'Output']).size().unstack(fill_value=0)
572
+
573
+ # Reset index to make "Mother's occupation" a column again for easier plotting
574
+ grouped_data.reset_index(inplace=True)
575
+
576
+ # Plotting
577
+ plt.figure(figsize=(14, 8))
578
+
579
+ # Plotting each category as a separate bar with appropriate stacking
580
+ sns.barplot(x="Mother's occupation", y="Graduate", data=grouped_data, color="green", label="Graduate")
581
+ sns.barplot(x="Mother's occupation", y="Dropout", data=grouped_data, color="red", label="Dropout",
582
+ bottom=grouped_data["Graduate"])
583
+ sns.barplot(x="Mother's occupation", y="Enrolled", data=grouped_data, color="blue", label="Enrolled",
584
+ bottom=grouped_data["Graduate"] + grouped_data["Dropout"])
585
+
586
+ # Customize plot appearance
587
+ plt.xticks(rotation=90)
588
+ plt.xlabel("Mother's Occupation")
589
+ plt.ylabel("Number of Students")
590
+ plt.title("Impact of Mother's Occupation on Student Outcomes")
591
+ plt.legend(title="Output")
592
+ plt.tight_layout()
593
+ st.pyplot()
594
+
595
+ st.subheader("Impact of Father's Occupation on Student Outcomes")
596
+ # Grouping the data by "Father's occupation" and "Output"
597
+ # But since some of the occupations were numeric, I first drop them so that we can just look at the ones that are strings
598
+
599
+ # Filter rows where "Father's occupation" is not numeric
600
+ filtered_df = df_VIZ[~df_VIZ["Father's occupation"].astype(str).str.isnumeric()]
601
+
602
+ # Group the filtered data by "Father's occupation" and "Output"
603
+ grouped_data = filtered_df.groupby(["Father's occupation", 'Output']).size().unstack(fill_value=0)
604
+
605
+ # Reset index to make "Father's occupation" a column again for easier plotting
606
+ grouped_data.reset_index(inplace=True)
607
+
608
+ # Plotting
609
+ plt.figure(figsize=(14, 8))
610
+ sns.barplot(x="Father's occupation", y="Graduate", data=grouped_data, color="green", label="Graduate")
611
+ sns.barplot(x="Father's occupation", y="Dropout", data=grouped_data, color="red", label="Dropout",
612
+ bottom=grouped_data["Graduate"])
613
+ sns.barplot(x="Father's occupation", y="Enrolled", data=grouped_data, color="blue", label="Enrolled",
614
+ bottom=grouped_data["Graduate"] + grouped_data["Dropout"])
615
+
616
+ plt.xticks(rotation=90)
617
+ plt.xlabel("Father's Occupation")
618
+ plt.ylabel("Number of Students")
619
+ plt.title("Impact of Father's Occupation on Student Outcomes")
620
+ plt.legend(title="Output")
621
+ plt.tight_layout()
622
+ st.pyplot()
623
+
624
+ st.write(
625
+ "The graphs above illustrate the impact of parental occupation on student outcomes, categorized by 'Graduate', 'Dropout', and 'Enrolled' statuses.")
626
+ paragraphs = [
627
+ "Both graphs show that parents in more stable and intellectually-oriented professions (Administration, Armed forces) tend to have children who graduate at higher rates. This might be due to both economic stability and a cultural emphasis on the value of education in these families.",
628
+ "In both cases, occupations with lower socio-economic status correlate with higher dropout rates. This could indicate financial pressures or less available time with parents, which impacts educational support.",
629
+ "We can also observe that for some domains, the impact of fathers' occupations on dropout rates is more pronounced compared to mothers' occupations, possibly reflecting traditional gender roles where fathers' income and job stability might weigh more heavily on family decisions."]
630
+ for paragraph in paragraphs:
631
+ st.markdown(paragraph)
632
with tab3:
    # ------------------------------------------------------------------
    # Sankey diagram: class schedule (day / evening) -> student outcome.
    # ------------------------------------------------------------------
    st.subheader("Student Pathways - Sankey Plot")

    # Human-readable labels for the raw column values.
    evening_label = {0: 'Day Classes', 1: 'Evening Classes'}
    output_label = {'Graduate': 'Graduated', 'Dropout': 'Dropped Out', 'Enrolled': 'Enrolled in School'}

    # Node colours. The key order of this dict also defines the node order
    # (schedules first, then outcomes), so the diagram is laid out the same
    # way on every rerun instead of depending on set iteration order.
    sankey_colors = {'Day Classes': 'lightgreen', 'Evening Classes': 'mediumseagreen',
                     'Graduated': 'lightcoral', 'Dropped Out': 'indianred', 'Enrolled in School': 'goldenrod'}

    # Relabel only the two columns the diagram needs; df_VIZ is left untouched.
    df_updated = df_VIZ[['evening attendance', 'Output']].copy()
    df_updated['evening attendance'] = df_updated['evening attendance'].map(evening_label)
    df_updated['Output'] = df_updated['Output'].map(output_label)

    # One row per (outcome, schedule) pair with the number of students in it.
    summary_df = df_updated.groupby(['Output', 'evening attendance']).size().reset_index(name='Count')

    # Keep only the labels that actually occur in the data.
    present_labels = set(summary_df['evening attendance']) | set(summary_df['Output'])
    label_list = [label for label in sankey_colors if label in present_labels]
    node_index = {label: position for position, label in enumerate(label_list)}
    node_colors = [sankey_colors[label] for label in label_list]

    # Links run from the schedule node to the outcome node, weighted by count.
    source = summary_df['evening attendance'].map(node_index).tolist()
    target = summary_df['Output'].map(node_index).tolist()
    value = summary_df['Count'].tolist()

    # Each link takes the colour of its source (schedule) node.
    link_colors = [sankey_colors[label] for label in summary_df['evening attendance']]

    fig = go.Figure(data=[go.Sankey(
        node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=label_list, color=node_colors),
        link=dict(source=source, target=target, value=value, hoverinfo='all', color=link_colors)
    )])
    fig.update_layout(title_text="Student Pathways", font_size=10)
    st.plotly_chart(fig)

    paragraphs = [
        "Graduation Rates:",
        "Evening classes show a higher graduation rate than day classes. This could be because students who take evening classes are often working individuals who are more determined to finish their education quickly due to career commitments.",
        "Dropout Rates:",
        "Both class schedules show dropouts, but the rate is less pronounced for day classes. This might indicate that students in day classes have more flexible schedules or fewer outside commitments, reducing pressure and the likelihood of dropping out."]
    for paragraph in paragraphs:
        st.markdown(paragraph)

    # ------------------------------------------------------------------
    # Parallel categories: scholarship -> tuition status -> outcome.
    # ------------------------------------------------------------------
    st.subheader("Student Pathways -Parallel Plot")

    df_parallel = df_VIZ[['Scholarship holder', 'Tuition fees up to date', 'Output']].copy()
    df_parallel['Tuition fees up to date'] = df_parallel['Tuition fees up to date'].map(
        {0: 'Not up to date', 1: 'Up to date'})
    df_parallel['Output'] = df_parallel['Output'].map(output_label)
    df_parallel['Scholarship holder'] = df_parallel['Scholarship holder'].map({0: 'No', 1: 'Yes'})

    # Ribbons are coloured by scholarship status. These are explicit colour
    # names, so no colorscale is passed: a colorscale only applies when the
    # line colour is a numerical array.
    scholarship_colors = {'No': 'blue', 'Yes': 'orange'}
    ribbon_colors = df_parallel['Scholarship holder'].map(scholarship_colors)

    fig = go.Figure(data=go.Parcats(
        dimensions=[
            {'label': 'Scholarship', 'values': df_parallel['Scholarship holder']},
            {'label': 'Tuition Status', 'values': df_parallel['Tuition fees up to date']},
            {'label': 'Output', 'values': df_parallel['Output']},
        ],
        line={'color': ribbon_colors},
    ))
    fig.update_layout(title="Student Pathways", width=800)
    st.plotly_chart(fig)

    paragraphs = [
        "A significant flow from students with scholarships maintains tuition payments up to date, which likely supports their ability to continue education and possibly graduate.",
        "The transitions from having a scholarship and keeping tuition up to date towards graduation appear strong, suggesting that scholarships might help students successfully complete their courses.",
        "There is a smaller but notable flow towards students dropping out or staying enrolled, even with scholarships, indicating that while financial support helps, it may not be sufficient to guarantee graduation for all students."]
    for paragraph in paragraphs:
        st.markdown(paragraph)
718
# Explanatory copy for the Shapash section, kept flush-left so that no
# source indentation leaks into the rendered text.
_XAI_INTRO = """ Shapash is User-friendly Explainability and Interpretability app that helps Develop Reliable and Transparent Machine Learning Models
in this case it will help us see and understand which variables have the most impact and contribute more towards our model prediction. We chose a couple a graphs that seemed to be the most helpfull to our case"""

_XAI_IMPORTANCE = """The feature importance plot shows the most important features in the dataset. The importance of a feature is calculated based on the contribution of the feature to the model's predictions. The higher the importance, the more the feature contributes to the model's predictions.
Here we have 5 to 7 variables that are very important. with The Yearly Approved Credit contributing the most.
"""

_XAI_IMPORTANCE_SUBSET = "Same as the previous graph but with a subset of the most important features."

_XAI_CONTRIBUTION = """The feature contribution plot shows the contribution of each feature to the model's predictions for each individual prediction. The contribution of a feature is calculated based on the feature's impact on the model's prediction for a specific instance. The higher the contribution, the more the feature influences the model's prediction for that instance.
Here we can see that for the yearly credit approved, the more credits taken then higher the chances of the student not dropping out."""

with tab4:
    st.markdown('<center><h2>Explainable AI</h2></center>', unsafe_allow_html=True)
    st.write(_XAI_INTRO)

    # Each pre-loaded Shapash figure is followed directly by its caption.
    # The img_* objects are defined outside this section.
    explained_figures = (
        (img_importance, _XAI_IMPORTANCE),
        (img_importance_subset, _XAI_IMPORTANCE_SUBSET),
        (img_contribution_subset, _XAI_CONTRIBUTION),
    )
    for figure, caption in explained_figures:
        st.image(figure)
        st.write(caption)
735
+
736
# Walk-through of one root-to-leaf path of the fitted tree, shown under the
# tree visualisation. Kept flush-left so the markdown is not rendered as a
# code block because of source indentation.
_TREE_PATH_DESCRIPTION = """
### Path Description:

**Starting Point (Root Node):**
The root node is the most significant on the prediction tree. It is the first decision point where the tree splits into branches based on the student's yearly credit approval.
The question we can ask is: "Is the student's yearly credit more than 15.5 or less?" And depending on the answer, we move down the tree to the next question.

**First Decision - True (Yes, 15.5 or less):**
For Yes, we can move to the next question down the left branch of the tree.

**Second Question:**
The next question is: "Is the student's yearly credit approved 4.5 or less?"
This further refines our group of students, focusing on those who have very few credits for the year.

**Second Decision - True (Yes, 4.5 or less):**
We again answer yes and proceed to a final category in this path for this example depth 2.

**Outcome (Leaf Node):**
The leaf node we reach after these two "yes" answers shows:
- **Gini:** 0.327 (This is a measure of uncertainty or impurity. The lower the value the better for the uniformity of the groups.
A lower value, like 0.327, suggests that the node is more or less pure, suggesting that most students in this node fall into the same category.)
- **Samples:** 733 (This is the number of students who fit this profile.)
- **Values:** [593, 84, 56] (This tells us how many students are predicted to dropout, stay enrolled, or graduate. Here, 593 are predicted to dropout, 84 to stay enrolled, and 56 to graduate.)
- **Majority Class:** 0 (Most students in this group, those with very low credit approval, are predicted to dropout.)
"""

if selected == "Prediction":
    menu2 = option_menu(
        menu_title=None,
        options=["Models", "ML Flow"],
        icons=["bookmark", "activity"],
        default_index=0,
        orientation="horizontal",
    )

    if menu2 == "Models":
        prediction_type = st.sidebar.selectbox('Select Type of Prediction', ['Decision Tree (Default)', 'KNN'])

        if prediction_type == "Decision Tree (Default)":
            st.title("Decision Tree Prediction")
            # Named tree_depth rather than `max` so the builtin max(), which
            # the KNN branch relies on, is never shadowed.
            tree_depth = st.number_input("Enter the maximum depth of the decision tree (5 is the best)", 1, 10, value=1, placeholder="Enter a number")
            decision_tree_model, y_pred, dt_f1_score, dt_accuracy, dt_precision = train_and_evaluate_model(
                DecisionTreeClassifier(max_depth=tree_depth),
                X_train, X_test, y_train, y_test
            )
            # NOTE(review): this call passes (f1, precision, accuracy) while the
            # KNN branch passes (f1, accuracy, precision). update_metrics is
            # defined outside this section - confirm which order it expects.
            update_metrics("Decision Tree", dt_f1_score, dt_precision, dt_accuracy)

            # Export the fitted tree in Graphviz DOT format.
            feature_cols = X.columns
            dot_data = export_graphviz(decision_tree_model, out_file=None,
                                       feature_names=feature_cols,
                                       class_names=["0", "1", "2"],
                                       filled=True, rounded=True,
                                       special_characters=True)
            graph = graphviz.Source(dot_data)

            def st_graphviz(graph, width=None, height=None):
                """Render a Graphviz graph as SVG inside a scrollable HTML component."""
                graphviz_html = f"<body>{graph.pipe(format='svg').decode('utf-8', errors='replace')}</body>"
                st.components.v1.html(graphviz_html, width=width, height=height, scrolling=True)

            st.title('Decision Tree Visualization')
            st_graphviz(graph, 1200, 800)

            st.markdown(_TREE_PATH_DESCRIPTION)

        elif prediction_type == "KNN":
            st.title("KNN Prediction")
            k_neighbors = int(st.number_input("Enter the number of neighbors for the KNN model", 1, 100, value=10, placeholder="Enter a number"))
            # NOTE(review): this model is fitted on X_train as-is, while the
            # grid search below runs on standardised features. Confirm whether
            # X_train is already scaled upstream.
            knn_model, y_pred, knn_f1_score, knn_accuracy, knn_precision = train_and_evaluate_model(
                KNeighborsClassifier(n_neighbors=k_neighbors),
                X_train, X_test, y_train, y_test)
            # NOTE(review): argument order differs from the Decision Tree call
            # (accuracy and precision are swapped) - confirm against
            # update_metrics' signature.
            update_metrics("KNN", knn_f1_score, knn_accuracy, knn_precision)

            # Standardise features for the grid search. X_test_scaled is not
            # used further in this section.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Search k = 1 .. (chosen k + 20) so the chosen value sits inside the curve.
            max_k = k_neighbors + 20
            k_list = list(range(1, max_k + 1))
            k_values = dict(n_neighbors=k_list)

            grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=k_values, cv=5, scoring='accuracy')
            grid_search.fit(X_train_scaled, y_train)
            results_df = pd.DataFrame(grid_search.cv_results_)

            # Best five k values: highest mean accuracy first, ties broken by lower variance.
            top_results = results_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True]).head(5)
            st.write("Top 5 K Values by Mean Test Score and Stability:")
            st.dataframe(top_results[['params', 'mean_test_score', 'std_test_score']])

            # Accuracy-vs-k curve. An explicit figure is used (rather than
            # pyplot's global current figure) and closed after rendering so
            # figures do not accumulate across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.plot(k_list, results_df['mean_test_score'], color='navy', linestyle='dashed', marker='o')
            ax.set_xlabel('K Number of Neighbors', fontdict={'fontsize': 12})
            ax.set_ylabel('Accuracy', fontdict={'fontsize': 12})
            ax.set_title('K NUMBER X ACCURACY', fontdict={'fontsize': 24})
            ax.set_xticks(list(range(0, max_k, max(1, max_k // 10))))  # Adjust x-ticks dynamically
            st.pyplot(fig)
            plt.close(fig)

    if menu2 == "ML Flow":
        st.title("ML FLOW Visualization")
        mlflowlink = "https://dagshub.com/Danjari/Dropout.mlflow/#/compare-experiments/s?experiments=%5B%220%22%2C%221%22%5D&searchFilter=&orderByKey=attributes.start_time&orderByAsc=false&startTime=ALL&lifecycleFilter=Active&modelVersionFilter=All+Runs&datasetsFilter=W10%3D"
        # Three equal columns; the button sits in the middle one to centre it.
        column1, column2, column3 = st.columns([1, 1, 1])
        with column1:
            st.write("")
        with column2:
            ui.link_button(text="👉🏽 Go To ML Flow", url=mlflowlink, key="link_btnmlflow")
        with column3:
            st.write("")
862
+
863
+ #####################################################################
864
def main():
    """Render the "Model Experimentation with MLflow" page.

    Lets the user upload a fully numeric CSV/Excel dataset, choose a problem
    type, model, target and features, and then trains the model through
    train_and_evaluate(), optionally inside an MLflow run.

    The script is halted with st.stop() when no file has been uploaded, the
    file cannot be read, a column is not numeric, the dataset has fewer than
    two columns, or training is requested with no feature selected.
    """
    import contextlib  # local import: only needed for the no-tracking path

    st.markdown("## Model Experimentation with MLflow")

    # File upload
    uploaded_file = st.file_uploader("Choose a file (CSV or Excel)")
    if uploaded_file is None:
        st.stop()

    # Only the read itself is wrapped, so validation problems are not
    # reported as read errors.
    try:
        if uploaded_file.name.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:  # UI boundary: report any parser failure to the user
        st.error(f"Error reading file: {e}")
        st.stop()

    # Validate data: every column must be float or integer.
    is_numeric = df.dtypes.apply(
        lambda dtype: pd.api.types.is_float_dtype(dtype) or pd.api.types.is_integer_dtype(dtype))
    if not is_numeric.all():
        st.error("All columns must be numeric (float or int). Please upload a cleaned dataset.")
        st.stop()

    # Problem type selection
    problem_type = st.selectbox("Select the problem type", ["classification", "regression"])

    # Model classes offered for each problem type.
    MODELS = {
        "classification": {
            "KNN": KNeighborsClassifier,
            "Decision Tree": DecisionTreeClassifier,
            "Logistic Regression": LogisticRegression,
        },
        "regression": {
            "LR": LinearRegression,
        },
    }

    model_options = list(MODELS[problem_type].keys())
    model_choice = st.selectbox("Choose a model", model_options)

    # Feature and target selection
    if len(df.columns) <= 1:
        st.error("Dataset must contain more than one column.")
        st.stop()
    target = st.selectbox("Select the target variable", df.columns)
    feature_options = [col for col in df.columns if col != target]
    features = st.multiselect("Choose some features", feature_options, default=feature_options)

    # MLflow tracking
    track_with_mlflow = st.checkbox("Track with mlflow?")

    # Model training
    start_training = st.button("Start training")
    if not start_training:
        return

    if not features:
        # The multiselect can be emptied by the user; a model cannot be
        # fitted on zero columns.
        st.error("Please select at least one feature.")
        st.stop()

    # Open an MLflow run only when tracking was requested; otherwise train
    # inside a no-op context so the model is still fitted and evaluated.
    if track_with_mlflow:
        mlflow.set_experiment("User_Uploaded_Data")
        run_context = mlflow.start_run()
    else:
        run_context = contextlib.nullcontext()
    with run_context:
        train_and_evaluate(df, features, target, model_choice, problem_type, MODELS,
                           track_with_mlflow)
927
+
928
def train_and_evaluate(df, features, target, model_choice, problem_type, MODELS, track_with_mlflow,
                       test_size=0.2, random_state=None):
    """Fit the chosen model on a train/test split and report its score.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        features: Column names used as model inputs.
        target: Column name of the value to predict.
        model_choice: Key into ``MODELS[problem_type]`` selecting the estimator class.
        problem_type: ``"classification"`` (scored with micro-averaged F1);
            any other value is scored with R^2.
        MODELS: Mapping ``{problem_type: {model name: estimator class}}``.
        track_with_mlflow: When true, parameters and metrics are logged to
            MLflow. The caller is expected to have started the run.
        test_size: Fraction of rows held out for testing (default 0.2).
        random_state: Seed for the split. ``None`` (default) keeps the split
            random on every call; pass an int for reproducible scores.
    """
    X = df[features].copy()
    y = df[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Estimators are instantiated with their library defaults.
    model = MODELS[problem_type][model_choice]()
    model.fit(X_train, y_train)

    # Model evaluation on both splits, so over-fitting is visible at a glance.
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    if problem_type == "classification":
        metric_name = "f1_score"
        metric_train = f1_score(y_train, preds_train, average='micro')
        metric_test = f1_score(y_test, preds_test, average='micro')
    else:
        metric_name = "r2_score"
        metric_train = r2_score(y_train, preds_train)
        metric_test = r2_score(y_test, preds_test)

    st.write(f"{metric_name}_train", round(metric_train, 3))
    st.write(f"{metric_name}_test", round(metric_test, 3))

    if track_with_mlflow:
        mlflow.log_param('model', model_choice)
        mlflow.log_param('features', features)
        # Split settings are logged so a run can be reproduced later.
        mlflow.log_param('test_size', test_size)
        mlflow.log_param('random_state', random_state)
        mlflow.log_metric(metric_name + "_train", metric_train)
        mlflow.log_metric(metric_name + "_test", metric_test)
957
+
958
+
959
# Entry point for the MLflow experimentation UI: runs main() when this file
# is executed as the main script.
# NOTE(review): the original nesting of this guard cannot be recovered from
# this diff view - confirm whether it belongs inside the "ML Flow" page
# branch, since main() halts the script when no file has been uploaded.
if __name__ == '__main__':
    main()
961
+
962
+ #####################################################################
963
+
964
# Closing remarks shown on the "Conclusion" page, kept flush-left so that no
# source indentation leaks into the rendered markdown.
_CONCLUSION_MARKDOWN = """
**1. Data Quality and Preparation**
**Address Missing Values**: Given the socio-economic factors involved in our dataset, it is important to take note of how we handle missing values. It is crucial to use domain knowledge to remove missing values in a way that does not introduce bias.
To improve the accuracy of our model, we could also introduce new features that can help in making better predictive decisions.
For example, introducing new variables such as "parental job stability," "education policies," etc.

**2. Model-related improvements**
For a decision tree classifier, it is important to limit the growth of the tree to prevent overfitting but we also have to avoid underfitting.
Even though we have a way to calculate the most optimal K-value, we can't be certain that that is the best value for our model. It may be that the 1000th iteration of cross-validation will provide a different optimal value. It is crucial to test and validate different parameters to ensure the model's accuracy and reliability.

**3. Long-term:**
Since we are dealing with education data, it is important to continuously update the model with new data, such as changes in the economic landscape or educational policies in Portugal, to keep the model relevant and accurate.
Additionally, we could also merge our current dataset with other datasets that may provide additional insights. By incorporating external datasets, we can enhance the quality and accuracy of our model predictions.
"""

if selected == "Conclusion":
    st.title("Conclusion 🎀")
    st.markdown(_CONCLUSION_MARKDOWN)
980
+
981
# Stop the emissions tracker and report the total it measured.
emissions = tracker.stop()

# NOTE(review): `tracker` is created outside this section - presumably a
# CodeCarbon emissions tracker, whose stop() may return None when nothing
# was measured. Formatting None with ":.4f" raises TypeError, so that case
# is reported explicitly instead of crashing the page.
if emissions is None:
    st.write("Total CO2 emissions: not available")
else:
    st.write(f"Total CO2 emissions:{emissions:.4f}kg CO2")
985
+
986
+