Danjari committed on
Commit
2d1efdd
1 Parent(s): d840b87

Upload 7 files

Browse files
Student_modified.csv ADDED
The diff for this file is too large to render. See raw diff
 
Students.csv ADDED
The diff for this file is too large to render. See raw diff
 
contribution subset.png ADDED
feature_importance.png ADDED
feature_subset.png ADDED
ml_flow.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mlflow
2
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
3
+ from sklearn.model_selection import GridSearchCV
4
+ import pandas as pd
5
+ from sklearn.tree import DecisionTreeClassifier
6
+ from main import X_train, X_test, y_train, y_test
7
+ from sklearn.neighbors import KNeighborsClassifier
8
+ from urllib.parse import urlparse
9
def train_and_evaluate_with_mlflow(model, param_grid, X_train, X_test, y_train, y_test, model_name, **kwargs):
    """
    Tune a model with GridSearchCV, evaluate it on the test split, and log
    the parameters, metrics, top grid-search results and the fitted model
    to MLflow.

    Parameters:
    - model: The machine learning model (estimator) to train.
    - param_grid: Dictionary with parameter names as keys and lists of parameter settings to try as values.
    - X_train: Training data features.
    - X_test: Testing data features.
    - y_train: Training data labels.
    - y_test: Testing data labels.
    - model_name: Name under which the model is registered in the MLflow
      Model Registry (when the tracking backend supports a registry).
    - **kwargs: Additional keyword arguments passed to GridSearchCV (e.g. cv=5).

    Returns:
    - The best estimator from GridSearchCV.
    """
    # Local imports keep this function self-contained; they are only needed
    # for the temporary artifact file below.
    import os
    import tempfile

    # The tracking URI and the experiment must be configured BEFORE the run
    # starts. A run is bound to the tracking server and experiment that are
    # active when start_run() is called; changing either one mid-run leaves
    # earlier logs on the old backend and makes later calls reference a run
    # id the new backend does not know.
    # For remote server only (Dagshub)
    remote_server_uri = "https://dagshub.com/Danjari/Dropout.mlflow"
    mlflow.set_tracking_uri(remote_server_uri)
    mlflow.set_experiment("Student Status Prediction")

    with mlflow.start_run():
        # Perform grid search to find the best parameters
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, **kwargs)
        grid_search.fit(X_train, y_train)

        # Top 5 parameter combinations, ordered by rank_test_score
        cv_results_df = pd.DataFrame(grid_search.cv_results_)
        top5_results = cv_results_df.sort_values('rank_test_score').head(5)

        # Log the best parameters
        mlflow.log_params(grid_search.best_params_)

        # Evaluate the refitted best model on the held-out test split
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        # Log the performance metrics (weighted averages across classes)
        mlflow.log_metrics({
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='weighted'),
            "recall": recall_score(y_test, y_pred, average='weighted'),
            "f1": f1_score(y_test, y_pred, average='weighted'),
        })

        # Log the top 5 results as an artifact. The CSV is staged in a
        # temporary directory so the working directory is not littered with
        # a file that each call would overwrite.
        with tempfile.TemporaryDirectory() as tmp_dir:
            results_path = os.path.join(tmp_dir, "top5_results.csv")
            top5_results.to_csv(results_path, index=False)
            mlflow.log_artifact(results_path)

        # Log the best model exactly once (the original logged it twice,
        # under two artifact paths). Model registry does not work with a
        # file store, so only register when the backend is not file-based.
        # See https://mlflow.org/docs/latest/model-registry.html#api-workflow
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(best_model, "model", registered_model_name=model_name)
        else:
            mlflow.sklearn.log_model(best_model, "model")

    return best_model
80
+
81
+
82
+
83
# Search space for the decision tree: tree depth and minimum leaf size.
dt_param_grid = dict(
    max_depth=[3, 4, 5, 6, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for KNN: every neighbour count from 1 through 100.
k_list = [*range(1, 101)]
knn_param_grid = dict(n_neighbors=k_list)

# Run the experiments in order (decision tree first, then KNN), each with
# 5-fold cross-validation forwarded to the grid search.
for _estimator, _grid, _label in (
    (DecisionTreeClassifier(random_state=42), dt_param_grid, "DecisionTree"),
    (KNeighborsClassifier(), knn_param_grid, "KNN"),
):
    train_and_evaluate_with_mlflow(
        _estimator,
        _grid,
        X_train, X_test, y_train, y_test,
        model_name=_label,
        cv=5,
    )
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ matplotlib
4
+ seaborn
5
+ numpy
6
+ altair
7
+ graphviz
8
+ streamlit_option_menu
9
+ scikit-learn
10
+ Pillow
11
+ shapash
12
+ mlflow