{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from xgboost import XGBClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import GradientBoostingClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Relative paths to the red and white wine CSV files; both are parsed with ';' as the separator\n", "redwine_dir = r'./data/winequality-red.csv'\n", "whitewine_dir = r'./data/winequality-white.csv'\n", "df_redwine = pd.read_csv(redwine_dir, sep=';')\n", "df_whitewine= pd.read_csv(whitewine_dir, sep=';')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Binary label column: 0 = red wine, 1 = white wine\n", "df_redwine['wine_type'] = 0\n", "df_whitewine['wine_type']= 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Element-wise comparison of the two frames' column labels (each now includes 'wine_type')\n", "df_whitewine.columns == df_redwine.columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "13" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Length of that boolean array, i.e. how many column labels were compared (not how many matched)\n", "len(df_whitewine.columns == df_redwine.columns)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholwine_type
07.40.700.001.90.07611.034.00.99783.510.569.40
17.80.880.002.60.09825.067.00.99683.200.689.80
27.80.760.042.30.09215.054.00.99703.260.659.80
311.20.280.561.90.07517.060.00.99803.160.589.80
47.40.700.001.90.07611.034.00.99783.510.569.40
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol wine_type \n", "0 9.4 0 \n", "1 9.8 0 \n", "2 9.8 0 \n", "3 9.8 0 \n", "4 9.4 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Stack the red and white frames row-wise, then drop the 'quality' column.\n", "# Chained drop instead of inplace=True: the cell yields merge_df in a single expression.\n", "merge_df = pd.concat([df_redwine, df_whitewine], axis=0).drop(columns='quality')\n", "merge_df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6497, 12)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_df.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "wine_type\n", "1 4898\n", "0 1599\n", "Name: count, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_df['wine_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Apply the seaborn theme first: most style settings are picked up when the axes are created,\n", "# so calling set_theme after plt.subplots would not style this figure on a fresh kernel run.\n", "sns.set_theme(style='darkgrid', palette='pastel')\n", "\n", "label_count = merge_df['wine_type'].value_counts()\n", "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))\n", "\n", "color = sns.color_palette(palette='pastel')\n", "explode = [0.02] * len(label_count)  # same small offset for every pie wedge\n", "\n", "# Left panel: share of each wine_type label\n", "axes[0].pie(label_count.values, labels=label_count.index, autopct='%1.1f%%', colors=color, explode=explode)\n", "axes[0].set_title('Percentage Label', fontsize=16)\n", "\n", "# Right panel: row count per wine_type label\n", "sns.barplot(x=label_count.index, y=label_count.values, ax=axes[1])\n", "axes[1].set_title('Count Label', fontsize=16)\n", "axes[1].set_xlabel('Label')\n", "axes[1].set_ylabel('Count')\n", "\n", "fig.suptitle('Visual Label Count', fontsize=20)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholwine_type
07.40.700.001.90.07611.034.00.997803.510.569.40
17.80.880.002.60.09825.067.00.996803.200.689.80
27.80.760.042.30.09215.054.00.997003.260.659.80
311.20.280.561.90.07517.060.00.998003.160.589.80
47.40.700.001.90.07611.034.00.997803.510.569.40
.......................................
48936.20.210.291.60.03924.092.00.991143.270.5011.21
48946.60.320.368.00.04757.0168.00.994903.150.469.61
48956.50.240.191.20.04130.0111.00.992542.990.469.41
48965.50.290.301.10.02220.0110.00.988693.340.3812.81
48976.00.210.380.80.02022.098.00.989413.260.3211.81
\n", "

6497 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "... ... ... ... ... ... \n", "4893 6.2 0.21 0.29 1.6 0.039 \n", "4894 6.6 0.32 0.36 8.0 0.047 \n", "4895 6.5 0.24 0.19 1.2 0.041 \n", "4896 5.5 0.29 0.30 1.1 0.022 \n", "4897 6.0 0.21 0.38 0.8 0.020 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.99780 3.51 0.56 \n", "1 25.0 67.0 0.99680 3.20 0.68 \n", "2 15.0 54.0 0.99700 3.26 0.65 \n", "3 17.0 60.0 0.99800 3.16 0.58 \n", "4 11.0 34.0 0.99780 3.51 0.56 \n", "... ... ... ... ... ... \n", "4893 24.0 92.0 0.99114 3.27 0.50 \n", "4894 57.0 168.0 0.99490 3.15 0.46 \n", "4895 30.0 111.0 0.99254 2.99 0.46 \n", "4896 20.0 110.0 0.98869 3.34 0.38 \n", "4897 22.0 98.0 0.98941 3.26 0.32 \n", "\n", " alcohol wine_type \n", "0 9.4 0 \n", "1 9.8 0 \n", "2 9.8 0 \n", "3 9.8 0 \n", "4 9.4 0 \n", "... ... ... 
\n", "4893 11.2 1 \n", "4894 9.6 1 \n", "4895 9.4 1 \n", "4896 12.8 1 \n", "4897 11.8 1 \n", "\n", "[6497 rows x 12 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_df" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Features (x) are every column except the label; the target (y) is wine_type\n", "x = merge_df.drop('wine_type', axis = 1)\n", "y = merge_df['wine_type']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KNN 0.994 0.002\n", "LR 0.994 0.002\n", "NB 0.976 0.007\n", "GBC 0.994 0.002\n", "SVC 0.996 0.002\n", "RFC 0.996 0.002\n", "DT 0.987 0.003\n", "XGB 0.995 0.003\n" ] } ], "source": [ "# Standardise the training features once; x_sc is what every model below is scored on\n", "sc = StandardScaler()\n", "x_sc = sc.fit_transform(x_train)\n", "\n", "# Baseline candidates as (label, estimator) pairs.\n", "# The stochastic learners get a fixed seed so the printed scores are repeatable.\n", "models = [\n", "    ('KNN', KNeighborsClassifier()),\n", "    ('LR', LogisticRegression()),\n", "    ('NB', GaussianNB()),\n", "    ('GBC', GradientBoostingClassifier(random_state=20)),\n", "    ('SVC', SVC()),\n", "    ('RFC', RandomForestClassifier(random_state=20)),\n", "    ('DT', DecisionTreeClassifier(random_state=20)),\n", "    ('XGB', XGBClassifier()),\n", "]\n", "\n", "result = []  # one array of 10 fold scores per model\n", "names = []  # model labels, in the same order as result\n", "\n", "for name, model in models:\n", "    cvs = cross_val_score(model, x_sc, y_train, cv=10)\n", "    result.append(cvs)\n", "    names.append(name)\n", "    # label, mean fold score, standard deviation of the fold scores\n", "    print('{:4} {:.3f} {:.3f}'.format(name, cvs.mean(), cvs.std()))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One box per model, drawn through the explicit axes object rather than pyplot state\n", "fig, ax = plt.subplots()\n", "fig.suptitle('Algorithm Comparison')\n", "ax.boxplot(result)\n", "ax.set_xticklabels(names)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# GridSearchCV" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300}\n", "Best Score: 0.9969212260309469\n" ] } ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "from xgboost import XGBClassifier\n", "\n", "# Exhaustive 3 x 3 x 3 grid, scored with 5-fold cross-validation on the scaled training set\n", "param_grid = {\n", "    'n_estimators': [100, 200, 300],\n", "    'learning_rate': [0.01, 0.1, 0.2],\n", "    'max_depth': [3, 5, 7],\n", "}\n", "\n", "xgb = XGBClassifier()\n", "\n", "grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1)\n", "grid_search.fit(x_sc, y_train)\n", "\n", "print(\"Best Hyperparameters:\", grid_search.best_params_)\n", "print(\"Best Score:\", grid_search.best_score_)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cross val score:[1. 1. 1. 1. 0.99 1. 1. 1. 1. 
0.99]\n", "Average 99.67 %\n" ] } ], "source": [ "# Re-score the grid-search winner with 10-fold CV on the scaled training set\n", "model = grid_search.best_estimator_\n", "cvs = cross_val_score(model, x_sc, y_train, cv=10)\n", "print('Cross val score:{}'.format(cvs.round(2)))\n", "print('Average {:.2f} %'.format(cvs.mean()*100))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# RandomizedSearchCV" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Hyperparameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.16}\n", "Best Score: 0.9965364255571185\n" ] } ], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", "from xgboost import XGBClassifier\n", "import numpy as np\n", "\n", "# Candidate values to sample from: 10 tree counts, 20 learning rates, 4 depths\n", "param_dist = {\n", "    'n_estimators': np.arange(100, 1001, 100),\n", "    'learning_rate': np.linspace(0.01, 0.2, 20),\n", "    'max_depth': [3, 5, 7, 9],\n", "}\n", "\n", "xgb = XGBClassifier()\n", "\n", "# random_state fixes which 10 candidates are drawn, so the reported best parameters are repeatable\n", "random_search = RandomizedSearchCV(\n", "    estimator=xgb,\n", "    param_distributions=param_dist,\n", "    n_iter=10,\n", "    cv=5,\n", "    n_jobs=-1,\n", "    random_state=20,\n", ")\n", "random_search.fit(x_sc, y_train)\n", "\n", "print(\"Best Hyperparameters:\", random_search.best_params_)\n", "print(\"Best Score:\", random_search.best_score_)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cross val score:[1. 1. 1. 1. 0.99 1. 1. 1. 1. 0.99]\n", "Average 99.71 %\n" ] } ], "source": [ "# Re-score the randomized-search winner with the same 10-fold CV\n", "model_rs = random_search.best_estimator_\n", "cvs = cross_val_score(model_rs, x_sc, y_train, cv=10)\n", "print('Cross val score:{}'.format(cvs.round(2)))\n", "print('Average {:.2f} %'.format(cvs.mean()*100))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
       "              interaction_constraints=None, learning_rate=0.16, max_bin=None,\n",
       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
       "              max_delta_step=None, max_depth=3, max_leaves=None,\n",
       "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
       "              n_estimators=300, n_jobs=None, num_parallel_tree=None,\n",
       "              predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.16, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=3, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=300, n_jobs=None, num_parallel_tree=None,\n", " predictor=None, random_state=None, ...)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "random_search.best_estimator_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Classification" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy Train: 0.99\n", " precision recall f1-score support\n", "\n", " 0 1.00 0.97 0.98 321\n", " 1 0.99 1.00 0.99 979\n", "\n", " accuracy 0.99 1300\n", " macro avg 0.99 0.99 0.99 1300\n", "weighted avg 0.99 0.99 0.99 1300\n", "\n" ] } ], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.metrics import accuracy_score,classification_report\n", "\n", "# Scaler and classifier in one estimator, so the scaler is fitted on the training split only\n", "# NOTE(review): these hyperparameters are hard-coded and differ from the best_params_ printed by\n", "# both searches above - confirm they are intentional.\n", "pipe1 = Pipeline([\n", "    ('scaler', StandardScaler()),\n", "    ('xgb', XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.13)),\n", "])\n", "\n", "pipe1.fit(x_train, y_train)\n", "y_pred = pipe1.predict(x_test)\n", "\n", "# The predictions are for the held-out test split, so the metric is labelled as test accuracy\n", "print('Accuracy Test: {:.2f}'.format(accuracy_score(y_test, y_pred)))\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['model_wine_binary.pkl']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import 
joblib\n", "\n", "# Persist the fitted pipeline (scaler + classifier) so the saved artefact accepts raw features.\n", "# The grid-search estimator held in `model` was fitted on pre-scaled data and the scaler was never saved.\n", "joblib.dump(pipe1, 'model_wine_binary.pkl')" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }