{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from xgboost import XGBClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import GradientBoostingClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "redwine_dir = r'./data/winequality-red.csv'\n", "whitewine_dir = r'./data/winequality-white.csv'\n", "df_redwine = pd.read_csv(redwine_dir, sep=';')\n", "df_whitewine= pd.read_csv(whitewine_dir, sep=';')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# 0 = redwine; 1 = whitewine\n", "df_redwine['wine_type'] = 0\n", "df_whitewine['wine_type']= 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_whitewine.columns == df_redwine.columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "13" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df_whitewine.columns == df_redwine.columns)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | fixed acidity | \n", "volatile acidity | \n", "citric acid | \n", "residual sugar | \n", "chlorides | \n", "free sulfur dioxide | \n", "total sulfur dioxide | \n", "density | \n", "pH | \n", "sulphates | \n", "alcohol | \n", "wine_type | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7.4 | \n", "0.70 | \n", "0.00 | \n", "1.9 | \n", "0.076 | \n", "11.0 | \n", "34.0 | \n", "0.9978 | \n", "3.51 | \n", "0.56 | \n", "9.4 | \n", "0 | \n", "
| 1 | \n", "7.8 | \n", "0.88 | \n", "0.00 | \n", "2.6 | \n", "0.098 | \n", "25.0 | \n", "67.0 | \n", "0.9968 | \n", "3.20 | \n", "0.68 | \n", "9.8 | \n", "0 | \n", "
| 2 | \n", "7.8 | \n", "0.76 | \n", "0.04 | \n", "2.3 | \n", "0.092 | \n", "15.0 | \n", "54.0 | \n", "0.9970 | \n", "3.26 | \n", "0.65 | \n", "9.8 | \n", "0 | \n", "
| 3 | \n", "11.2 | \n", "0.28 | \n", "0.56 | \n", "1.9 | \n", "0.075 | \n", "17.0 | \n", "60.0 | \n", "0.9980 | \n", "3.16 | \n", "0.58 | \n", "9.8 | \n", "0 | \n", "
| 4 | \n", "7.4 | \n", "0.70 | \n", "0.00 | \n", "1.9 | \n", "0.076 | \n", "11.0 | \n", "34.0 | \n", "0.9978 | \n", "3.51 | \n", "0.56 | \n", "9.4 | \n", "0 | \n", "
| \n", " | fixed acidity | \n", "volatile acidity | \n", "citric acid | \n", "residual sugar | \n", "chlorides | \n", "free sulfur dioxide | \n", "total sulfur dioxide | \n", "density | \n", "pH | \n", "sulphates | \n", "alcohol | \n", "wine_type | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7.4 | \n", "0.70 | \n", "0.00 | \n", "1.9 | \n", "0.076 | \n", "11.0 | \n", "34.0 | \n", "0.99780 | \n", "3.51 | \n", "0.56 | \n", "9.4 | \n", "0 | \n", "
| 1 | \n", "7.8 | \n", "0.88 | \n", "0.00 | \n", "2.6 | \n", "0.098 | \n", "25.0 | \n", "67.0 | \n", "0.99680 | \n", "3.20 | \n", "0.68 | \n", "9.8 | \n", "0 | \n", "
| 2 | \n", "7.8 | \n", "0.76 | \n", "0.04 | \n", "2.3 | \n", "0.092 | \n", "15.0 | \n", "54.0 | \n", "0.99700 | \n", "3.26 | \n", "0.65 | \n", "9.8 | \n", "0 | \n", "
| 3 | \n", "11.2 | \n", "0.28 | \n", "0.56 | \n", "1.9 | \n", "0.075 | \n", "17.0 | \n", "60.0 | \n", "0.99800 | \n", "3.16 | \n", "0.58 | \n", "9.8 | \n", "0 | \n", "
| 4 | \n", "7.4 | \n", "0.70 | \n", "0.00 | \n", "1.9 | \n", "0.076 | \n", "11.0 | \n", "34.0 | \n", "0.99780 | \n", "3.51 | \n", "0.56 | \n", "9.4 | \n", "0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 4893 | \n", "6.2 | \n", "0.21 | \n", "0.29 | \n", "1.6 | \n", "0.039 | \n", "24.0 | \n", "92.0 | \n", "0.99114 | \n", "3.27 | \n", "0.50 | \n", "11.2 | \n", "1 | \n", "
| 4894 | \n", "6.6 | \n", "0.32 | \n", "0.36 | \n", "8.0 | \n", "0.047 | \n", "57.0 | \n", "168.0 | \n", "0.99490 | \n", "3.15 | \n", "0.46 | \n", "9.6 | \n", "1 | \n", "
| 4895 | \n", "6.5 | \n", "0.24 | \n", "0.19 | \n", "1.2 | \n", "0.041 | \n", "30.0 | \n", "111.0 | \n", "0.99254 | \n", "2.99 | \n", "0.46 | \n", "9.4 | \n", "1 | \n", "
| 4896 | \n", "5.5 | \n", "0.29 | \n", "0.30 | \n", "1.1 | \n", "0.022 | \n", "20.0 | \n", "110.0 | \n", "0.98869 | \n", "3.34 | \n", "0.38 | \n", "12.8 | \n", "1 | \n", "
| 4897 | \n", "6.0 | \n", "0.21 | \n", "0.38 | \n", "0.8 | \n", "0.020 | \n", "22.0 | \n", "98.0 | \n", "0.98941 | \n", "3.26 | \n", "0.32 | \n", "11.8 | \n", "1 | \n", "
6497 rows × 12 columns
\n", "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None, learning_rate=0.16, max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=3, max_leaves=None,\n",
" min_child_weight=None, missing=nan, monotone_constraints=None,\n",
" n_estimators=300, n_jobs=None, num_parallel_tree=None,\n",
" predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None, learning_rate=0.16, max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=3, max_leaves=None,\n",
" min_child_weight=None, missing=nan, monotone_constraints=None,\n",
" n_estimators=300, n_jobs=None, num_parallel_tree=None,\n",
" predictor=None, random_state=None, ...)