# 1.0 Importing libraries

In [1]:
"""
Description: Import libraries
"""
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import os
import random
from humanfriendly import format_timespan
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import pickle
# from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression

In [2]:
"""
Description: Specify data path
"""
data_path = r'data\winequality_red_label_remapped.csv'

In [3]:
"""
Description: Load data
"""
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2


In [4]:
"""
Description: Get classes
"""
np.unique(df['quality'])

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [5]:
"""
Description: Remap 
"""
# df['quality'] = df['quality'].apply(lambda x: x-3)

'\nDescription: Remap \n'

In [6]:
"""
Description: Get classes
"""
np.unique(df['quality'])

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [7]:
"""
Description: Write the DataFrame to CSV without the row index
"""
# NOTE(review): the file is written to the current working directory, whereas
# the input was read from the data/ folder — confirm this is intended.
df.to_csv(path_or_buf="winequality_red_label_remapped.csv", index=False)

In [8]:
"""
Description: Check null value
"""
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [9]:
"""
Description: Prepare data
"""
x=df.drop(['quality'], axis=1)
x.shape

(1599, 11)

In [10]:
"""
Description: Get target label
"""
y = df['quality']
y.shape

(1599,)

In [11]:
"""
Description: Split data
"""
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=40,stratify=y)

In [12]:
'''
Description : Report the dimensions of every split
'''
# The two spaces after the first colon are deliberate: they reproduce the
# separator print() inserted between the label and the tuple, so the emitted
# text is unchanged.
print(f'shape of x_train:  {x_train.shape}')
print(f'shape of y_train: {y_train.shape}')
print(f'shape of x_test: {x_test.shape}')
print(f'shape of y_test: {y_test.shape}')

shape of x_train:  (1279, 11)
shape of y_train: (1279,)
shape of x_test: (320, 11)
shape of y_test: (320,)


In [13]:
"""
Description: Create model architecture
"""
model = RandomForestClassifier(n_estimators=1000)
model

In [14]:
"""
Description: Train model
"""
model.fit(x_train, y_train)

In [15]:
"""
Description: Get training and test accuracy
"""
print(f'{model} : ')
print('Training Accuracy : ', metrics.accuracy_score(y_train, model.predict(x_train)))
print('Validation Accuracy : ', metrics.accuracy_score(y_test, model.predict(x_test)))

RandomForestClassifier(n_estimators=1000) : 
Training Accuracy :  1.0
Validation Accuracy :  0.66875


In [16]:
"""
Description: Save the trained model to disk
"""
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call left closing to the garbage collector.
with open("random_forest_model.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [17]:
"""
Description: min, max
"""
df.max()

fixed acidity            15.90000
volatile acidity          1.58000
citric acid               1.00000
residual sugar           15.50000
chlorides                 0.61100
free sulfur dioxide      72.00000
total sulfur dioxide    289.00000
density                   1.00369
pH                        4.01000
sulphates                 2.00000
alcohol                  14.90000
quality                   5.00000
dtype: float64

In [18]:
"""
Description: min, max
"""
df.min()

fixed acidity           4.60000
volatile acidity        0.12000
citric acid             0.00000
residual sugar          0.90000
chlorides               0.01200
free sulfur dioxide     1.00000
total sulfur dioxide    6.00000
density                 0.99007
pH                      2.74000
sulphates               0.33000
alcohol                 8.40000
quality                 0.00000
dtype: float64

In [19]:
"""
Description: Check columns
"""
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')