| """Script to create the model artifact | |
| Trains a simple linear classifier (scikit-learn's SGDClassifier) with grid | |
| search on a synthetic dataset and stores the best model in a joblib file. | |
| """ | |
| import joblib | |
| from sklearn.datasets import make_classification | |
| from sklearn.linear_model import SGDClassifier | |
| from sklearn.model_selection import GridSearchCV | |
# Random seed passed to both the dataset generator and the classifier.
SEED: int = 0
# Path the trained model is written to by main().
FILENAME: str = "sklearn_model.joblib"
def get_data():
    """Generate the synthetic classification dataset.

    Returns:
        The ``(X, y)`` pair produced by ``make_classification`` for 1000
        samples, seeded with ``SEED``.
    """
    features, targets = make_classification(random_state=SEED, n_samples=1000)
    return features, targets
def get_model(**kwargs):
    """Build an unfitted ``SGDClassifier`` seeded with ``SEED``.

    Args:
        **kwargs: Estimator parameters applied through ``set_params`` after
            construction, so they take precedence over the constructor
            arguments used here.

    Returns:
        The configured (unfitted) classifier.
    """
    # set_params hands back the estimator itself, so the call can be chained.
    return SGDClassifier(random_state=SEED).set_params(**kwargs)
def get_hparams():
    """Return the hyper-parameter grid explored by the grid search.

    Returns:
        A dict mapping each parameter name to the list of candidate values:
        two penalties and three ``alpha`` values.
    """
    return dict(
        penalty=["l1", "l2"],
        alpha=[1e-05, 1e-04, 1e-03],
    )
def grid_search(model, X, y, hparams):
    """Fit a 5-fold, accuracy-scored grid search over ``hparams``.

    Args:
        model: Estimator to tune.
        X: Training features passed to ``fit``.
        y: Training targets passed to ``fit``.
        hparams: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``GridSearchCV`` object after ``fit`` has been called on it.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=hparams,
        scoring="accuracy",
        cv=5,
    )
    searcher.fit(X, y)
    return searcher
def train(model, X, y, hparams):
    """Tune ``model`` with ``grid_search`` and report the winning setup.

    Prints the best score (as a percentage) and the best parameter
    combination found by the search.

    Args:
        model: Estimator to tune.
        X: Training features.
        y: Training targets.
        hparams: Parameter grid forwarded to ``grid_search``.

    Returns:
        The search's ``best_estimator_``.
    """
    fitted_search = grid_search(model, X, y, hparams=hparams)

    best_accuracy_pct = 100 * fitted_search.best_score_
    print(f"Best accuracy: {best_accuracy_pct:.1f}%")
    print(f"Best parameters: {fitted_search.best_params_}")

    return fitted_search.best_estimator_
def save_model(model, filename):
    """Write ``model`` to ``filename`` with joblib and print the destination.

    Args:
        model: Object to persist.
        filename: Target path handed to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=filename)
    confirmation = f"Stored model in '{filename}'"
    print(confirmation)
def main():
    """Generate the data, tune the classifier, and store the best model.

    The tuned model is saved under ``FILENAME``.
    """
    features, targets = get_data()
    best_model = train(get_model(), features, targets, hparams=get_hparams())
    save_model(best_model, FILENAME)
# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()