File size: 6,659 Bytes
c33411e
 
 
 
 
 
 
 
 
 
 
 
675fe68
 
2db7855
675fe68
c33411e
 
 
 
 
 
 
 
 
 
 
 
9a4989a
675fe68
 
 
 
 
 
 
c33411e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675fe68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c33411e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675fe68
 
c33411e
 
 
 
 
 
 
675fe68
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import os
# --- Runtime environment bootstrap -------------------------------------------
# These shell commands run every time the script is executed. Each os.system()
# call starts its own shell, so a standalone `cd` has no effect on the calls
# that follow it; every build step therefore enters the directory it needs
# within the same shell invocation. Failures stay non-fatal, as before:
# os.system() only returns the exit status and never raises.
os.system('git clone --recursive https://github.com/dmlc/xgboost')
os.system('cd xgboost && sudo cp make/minimum.mk ./config.mk')
os.system('cd xgboost && sudo make -j4')
os.system('cd xgboost && sh build.sh')
os.system('cd xgboost/python-package && python setup.py install')

# Python-level dependencies used by the rest of the script.
os.system('pip install graphviz')
os.system('pip install python-pydot')
os.system('pip install python-pydot-ng')
os.system('pip install -U scikit-learn scipy matplotlib')
os.system('pip install wandb --upgrade')
os.system('pip install tensorboardX --upgrade')
os.system('pip install ipython --upgrade')

# SECURITY NOTE(review): a Weights & Biases API key is committed here in plain
# text. It should be revoked and provided through the environment or a secret
# store instead of the source file; the call is left unchanged so the current
# deployment keeps working until that is done.
os.system('wandb login 5a0e81f39777351977ce52cf57ea09c4f48f3d93 --relogin')

from collections import namedtuple
import altair as alt
import math
import streamlit as st
import pandas
import numpy
import xgboost
import graphviz
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot
# NOTE(review): `load_ext tensorboard` looks like the notebook magic
# `%load_ext tensorboard`; passed to os.system() it is handed to the shell as
# an ordinary command instead. Confirm whether this line is still needed.
os.system('load_ext tensorboard')
import os
import datetime
from tensorboardX import SummaryWriter
import wandb
from wandb.xgboost import wandb_callback

# Open the Weights & Biases run that this session reports to.
wandb.init(entity="epitech1", project="australian_rain")

# Bare string expression kept verbatim; presumably rendered as the page title
# by Streamlit's "magic" display of top-level expressions.
"""
# MLOPS
"""


# Hyper-parameter controls: each slider is (label, minimum, maximum, default).
max_depth_input = st.slider("Max depth", min_value=1, max_value=100, value=5)
colsample_bytree_input = st.slider(
    "Colsample bytree", min_value=0.0, max_value=1.0, value=0.5
)
learning_rate_input = st.slider(
    "Learning rate", min_value=0.0, max_value=1.0, value=0.2
)
alpha_input = st.slider("Alpha", min_value=1, max_value=100, value=10)
n_estimators_input = st.slider("n estimators", min_value=1, max_value=100, value=20)

# Cities offered to the user; the first entry is pre-selected.
city_choices = (
    "Canberra",
    "Albury",
    "Penrith",
    "Sydney",
    "MountGinini",
    "Bendigo",
    "Brisbane",
    "Portland",
)
city_input = st.selectbox(
    'Which city do you want to predict rain ?',
    city_choices,
    index=0,
)

# Load the complete weather table.
dataset = pandas.read_csv('weatherAUS.csv')

# Category vocabularies are taken from the unfiltered table so that the
# integer code of a given wind direction or date does not depend on which
# city is selected below.
location_dataset = dataset["Location"].unique()
wind_dataset = dataset["WindGustDir"].unique()
date_dataset = dataset["Date"].unique()

# Keep only the rows of the selected city (copy so later column assignments
# operate on an independent frame).
dataset = dataset[dataset["Location"] == city_input].copy()

# RainTomorrow -> bool. The label is compared against the literal "Yes"
# rather than against whatever value happens to sit in a fixed row of the
# filtered frame, which varied with the selected city and failed outright
# when fewer than nine rows were present. Missing labels become False, as
# they did before.
dataset["RainTomorrow"] = (dataset["RainTomorrow"] == "Yes").astype(bool)

# WindGustDir -> int: 0 for a missing value, otherwise 1 + the position of the
# direction in `wind_dataset` (same codes as the former per-row lookup).
wind_codes = {
    direction: code
    for code, direction in enumerate(wind_dataset, start=1)
    if not pandas.isna(direction)
}
dataset["WindGustDir"] = (
    dataset["WindGustDir"].map(wind_codes).fillna(0).astype(int)
)

# Date -> int: position of the date string in `date_dataset`.
date_codes = {date: code for code, date in enumerate(date_dataset)}
dataset["Date"] = dataset["Date"].map(date_codes).astype(int)

# Drop the columns the model does not use, then every row that still has a
# missing value. dropna() replaces drop(<index>, 0, ...), whose positional
# axis argument is rejected by recent pandas releases.
dataset.drop(columns=["WindDir9am", "WindDir3pm", "WindSpeed9am", "WindSpeed3pm", "Temp9am", "Temp3pm", "RainToday"], inplace=True)
dataset.dropna(inplace=True)

# Collapse each 9am/3pm measurement pair into one column holding their mean,
# appended in the order Humidity, Pressure, Cloud, then remove the originals.
paired_readings = {
    "Humidity": ("Humidity9am", "Humidity3pm"),
    "Pressure": ("Pressure9am", "Pressure3pm"),
    "Cloud": ("Cloud9am", "Cloud3pm"),
}

for merged_name, (morning, afternoon) in paired_readings.items():
    dataset[merged_name] = (dataset[morning] + dataset[afternoon]) / 2

dataset.drop(
    columns=[column for pair in paired_readings.values() for column in pair],
    inplace=True,
)

# Positional feature mask over the 13 remaining columns: positions 0, 1 and 4
# are excluded, every other column is used as a feature. The target is the
# column at position 4.
# NOTE(review): the mask is purely positional, so it silently selects other
# columns if the CSV layout changes — confirm against the data file.
feature_mask = [False, False, True, True, False] + [True] * 8
x = dataset.iloc[:, feature_mask]
y = dataset.iloc[:, 4]

# Full data set in XGBoost's native container (used for cross-validation).
data_dmatrix = xgboost.DMatrix(data=x, label=y)

# Reproducible 80/20 hold-out split for the scikit-learn style regressor.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

class TensorBoardCallback(xgboost.callback.TrainingCallback):
    """XGBoost training callback that copies evaluation metrics to TensorBoard.

    Two tensorboardX writers are opened under
    ``runs/<experiment>/<timestamp>/``: one in ``train/`` and one in a
    sub-directory named after ``data_name``. After every boosting round the
    latest value of each metric is written to the ``train`` writer when the
    evaluation set is called ``"train"`` and to the other writer otherwise.
    """

    def __init__(self, experiment: str = None, data_name: str = None):
        """Create the log directory layout and open both writers.

        Args:
            experiment: Name of the experiment folder; ``"logs"`` when omitted
                or empty.
            data_name: Folder name for every non-``"train"`` evaluation set;
                ``"test"`` when omitted or empty.
        """
        super().__init__()
        self.experiment = experiment or "logs"
        self.data_name = data_name or "test"
        self.datetime_ = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.log_dir = f"runs/{self.experiment}/{self.datetime_}"
        self.train_writer = SummaryWriter(log_dir=os.path.join(self.log_dir, "train/"))
        # data_name always has a value after the defaulting above, so the
        # second writer is created unconditionally.
        self.test_writer = SummaryWriter(log_dir=os.path.join(self.log_dir, f"{self.data_name}/"))

    def after_iteration(
        self, model, epoch: int, evals_log: xgboost.callback.TrainingCallback.EvalsLog
    ) -> bool:
        """Write the newest value of every metric for this boosting round.

        Args:
            model: The model being trained (not used here).
            epoch: Index of the boosting round; used as the scalar's step.
            evals_log: Mapping ``{data name: {metric name: history}}``.

        Returns:
            Always ``False``, i.e. this callback never requests early stopping.
        """
        if not evals_log:
            return False

        for data_label, metrics in evals_log.items():
            writer = self.train_writer if data_label == "train" else self.test_writer
            for metric_name, history in metrics.items():
                latest = history[-1]
                # A history entry may be a tuple; only its first element is
                # logged in that case.
                score = latest[0] if isinstance(latest, tuple) else latest
                writer.add_scalar(metric_name, score, epoch)

        return False

    def after_training(self, model):
        """Flush pending events to disk when training ends.

        Without this the last scalars may still sit in the writers' buffers
        when the logs are read.

        Returns:
            ``model`` unchanged, as the callback protocol requires.
        """
        self.train_writer.flush()
        self.test_writer.flush()
        return model
        
# Callback that mirrors the per-round evaluation metrics to TensorBoard.
tensorboard_logger = TensorBoardCallback(experiment='exp_1', data_name='test')

# Regressor configured from the slider values chosen in the UI.
xg_reg = xgboost.XGBRegressor(
    colsample_bytree=colsample_bytree_input,
    learning_rate=learning_rate_input,
    max_depth=max_depth_input,
    alpha=alpha_input,
    n_estimators=n_estimators_input,
    eval_metric=['rmse', 'error', 'logloss', 'map'],
    callbacks=[tensorboard_logger],
)

# The training split doubles as the only evaluation set.
xg_reg.fit(X_train, y_train, eval_set=[(X_train, y_train)])

# Score the hold-out split and report the root-mean-squared error.
preds = xg_reg.predict(X_test)
rmse = numpy.sqrt(mean_squared_error(y_test, preds))
st.write("RMSE: %f" % (rmse))

# Booster parameters shared by cross-validation and the final native training.
params = {
    'colsample_bytree': colsample_bytree_input,
    'learning_rate': learning_rate_input,
    'max_depth': max_depth_input,
    'alpha': alpha_input,
}

# 3-fold cross-validation on the full data set, up to 50 rounds with early
# stopping after 10 rounds without improvement.
cv_results = xgboost.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

# Show the mean test RMSE of the last recorded round.
last_round_rmse = cv_results["test-rmse-mean"].tail(1)
st.write(last_round_rmse)

# Rebinds xg_reg to a native Booster trained on the full data set.
xg_reg = xgboost.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

# Start TensorBoard on the `runs` log directory without waiting for it.
# os.system() only returns once the command exits, and TensorBoard is a
# long-running server, so the former call kept the script from ever finishing.
# NOTE(review): a new process is spawned on every execution of the script;
# presumably later instances exit when the port is already taken — confirm,
# or guard the launch if that is not the case.
import subprocess

try:
    tensorboard_process = subprocess.Popen(['tensorboard', '--logdir', 'runs'])
except OSError as launch_error:
    # Raised e.g. when the executable is not on PATH; report it instead of
    # failing silently, and let the rest of the app keep working.
    tensorboard_process = None
    st.warning("TensorBoard could not be started: %s" % launch_error)

#xgboost.plot_tree(xg_reg,num_trees=0)
#matplotlib.pyplot.rcParams['figure.figsize'] = [200, 200]
#matplotlib.pyplot.show()

#xgboost.plot_importance(xg_reg)
#matplotlib.pyplot.rcParams['figure.figsize'] = [5, 5]
#matplotlib.pyplot.show()

#xg_reg = xgboost.train(params=params, dtrain=data_dmatrix, num_boost_round=10, callbacks=[wandb_callback()])

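# MLOPS - W&B analytics
# wandb_callback() is only passed to the commented-out xgboost.train call above;
# the active training calls do not include it in their callbacks.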