{ "cells": [ { "cell_type": "markdown", "id": "d6ffc7b7", "metadata": {}, "source": [ "# 1.0 Importing libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "4ca597ab", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Description: Import libraries\n", "\"\"\"\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import metrics\n", "import pandas as pd\n", "import os\n", "import random\n", "from humanfriendly import format_timespan\n", "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.ensemble import RandomForestClassifier\n", "import pickle\n", "# from sklearn.svm import SVC\n", "# from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 2, "id": "fffc59ee", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Description: Specify data path (relative to the working directory)\n", "\"\"\"\n", "# Join the components with os.path.join so the separator matches the host OS;\n", "# a hard-coded backslash only resolves on Windows.\n", "data_path = os.path.join('data', 'winequality_red_label_remapped.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "5a2e912f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.700.001.90.07611.034.00.99783.510.569.42
17.80.880.002.60.09825.067.00.99683.200.689.82
27.80.760.042.30.09215.054.00.99703.260.659.82
311.20.280.561.90.07517.060.00.99803.160.589.83
47.40.700.001.90.07611.034.00.99783.510.569.42
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality \n", "0 9.4 2 \n", "1 9.8 2 \n", "2 9.8 2 \n", "3 9.8 3 \n", "4 9.4 2 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Load the CSV at data_path into a DataFrame and preview the first rows\n", "\"\"\"\n", "df = pd.read_csv(data_path)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "2815d511", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, 3, 4, 5], dtype=int64)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: List the distinct values of the 'quality' label column\n", "\"\"\"\n", "np.unique(df['quality'])" ] }, { "cell_type": "code", "execution_count": 5, "id": "d11d9540", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\nDescription: Remap \\n'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Remap \n", "\"\"\"\n", "# NOTE(review): the remap below is disabled, so this cell only echoes its own description string.\n", "# The labels loaded above are already 0-5, presumably because this shift was applied when the\n", "# '*_label_remapped.csv' file was produced - confirm before re-enabling.\n", "# df['quality'] = df['quality'].apply(lambda x: x-3)" ] }, { "cell_type": "code", "execution_count": 6, "id": "4d694106", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, 3, 4, 5], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: List the distinct 'quality' values again (unchanged while the remap is commented out)\n", "\"\"\"\n", "np.unique(df['quality'])" ] }, { "cell_type": "code", "execution_count": 7, "id": "43458438", "metadata": {}, "outputs": [], "source": [ "# Write the frame, without the index column, to the current working directory.\n", "# NOTE(review): this is not data_path, so the file read above is not the file written here - confirm intent.\n", "df.to_csv(\"winequality_red_label_remapped.csv\",index=False)" 
] }, { "cell_type": "code", "execution_count": 8, "id": "ade5900f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "fixed acidity 0\n", "volatile acidity 0\n", "citric acid 0\n", "residual sugar 0\n", "chlorides 0\n", "free sulfur dioxide 0\n", "total sulfur dioxide 0\n", "density 0\n", "pH 0\n", "sulphates 0\n", "alcohol 0\n", "quality 0\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Count missing values in every column\n", "\"\"\"\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 9, "id": "1b34f13e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1599, 11)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Build the feature matrix (every column except the 'quality' label)\n", "\"\"\"\n", "x = df.drop(columns=['quality'])\n", "x.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "238dc707", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1599,)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Extract the 'quality' column as the target label\n", "\"\"\"\n", "y = df['quality']\n", "y.shape" ] }, { "cell_type": "code", "execution_count": 11, "id": "5617aeb1", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Description: Stratified train/test split (20% held out, fixed seed)\n", "\"\"\"\n", "x_train, x_test, y_train, y_test = train_test_split(\n", "    x,\n", "    y,\n", "    test_size=0.2,\n", "    random_state=40,\n", "    stratify=y,\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "id": "f5d3b86f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape of x_train: (1279, 11)\n", "shape of y_train: (1279,)\n", "shape of x_test: (320, 11)\n", "shape of y_test: (320,)\n" ] } ], "source": [ "'''\n", "Description : Report the shapes of the train and test splits\n", "'''\n", "print(\"shape of x_train: \",x_train.shape)\n", "print(\"shape of y_train: {}\".format(y_train.shape))\n", "print(f'shape of x_test: {x_test.shape}')\n", "print(f'shape of y_test: {y_test.shape}')" ] }, { "cell_type": "code", "execution_count": 13, "id": "67168e49", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
RandomForestClassifier(n_estimators=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "RandomForestClassifier(n_estimators=1000)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Create the random-forest classifier (1000 trees, fixed seed)\n", "\"\"\"\n", "# random_state pins the forest's internal randomness so reruns fit the same model;\n", "# 40 matches the seed already used for the train/test split.\n", "model = RandomForestClassifier(n_estimators=1000, random_state=40)\n", "model" ] }, { "cell_type": "code", "execution_count": 14, "id": "fcad50e5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
RandomForestClassifier(n_estimators=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "RandomForestClassifier(n_estimators=1000)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Train model on the training split\n", "\"\"\"\n", "model.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 15, "id": "a20a2ec3", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RandomForestClassifier(n_estimators=1000) : \n", "Training Accuracy : 1.0\n", "Validation Accuracy : 0.66875\n" ] } ], "source": [ "\"\"\"\n", "Description: Get training and test accuracy\n", "\"\"\"\n", "print(f'{model} : ')\n", "print('Training Accuracy : ', metrics.accuracy_score(y_train, model.predict(x_train)))\n", "# The figure labelled 'Validation' is computed on the held-out test split (x_test / y_test).\n", "print('Validation Accuracy : ', metrics.accuracy_score(y_test, model.predict(x_test)))" ] }, { "cell_type": "code", "execution_count": 16, "id": "5c20bc9e", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Description: Save the trained model to disk with pickle\n", "\"\"\"\n", "# The context manager closes the file even if pickling fails, so the handle is never leaked.\n", "with open(\"random_forest_model.pkl\", 'wb') as model_file:\n", "    pickle.dump(model, model_file)" ] }, { "cell_type": "code", "execution_count": 17, "id": "f55a0ec8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "fixed acidity 15.90000\n", "volatile acidity 1.58000\n", "citric acid 1.00000\n", "residual sugar 15.50000\n", "chlorides 0.61100\n", "free sulfur dioxide 72.00000\n", "total sulfur dioxide 289.00000\n", "density 1.00369\n", "pH 4.01000\n", "sulphates 2.00000\n", "alcohol 14.90000\n", "quality 5.00000\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Per-column maximum of the loaded data\n", "\"\"\"\n", "df.max()" ] }, { "cell_type": "code", "execution_count": 18, "id": "234d7a65", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "fixed acidity 4.60000\n", "volatile acidity 0.12000\n", "citric acid 0.00000\n", "residual sugar 0.90000\n", "chlorides 0.01200\n", "free sulfur dioxide 1.00000\n", "total sulfur dioxide 6.00000\n", "density 0.99007\n", "pH 2.74000\n", "sulphates 0.33000\n", "alcohol 
8.40000\n", "quality 0.00000\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: Per-column minimum of the loaded data\n", "\"\"\"\n", "df.min()" ] }, { "cell_type": "code", "execution_count": 19, "id": "3fcb0d81", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',\n", " 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',\n", " 'pH', 'sulphates', 'alcohol', 'quality'],\n", " dtype='object')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Description: List the column names of the loaded data\n", "\"\"\"\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "id": "29e30ec2", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 5 }