shivansh-ka committed on
Commit
0734f8e
1 Parent(s): 3e80ddd

experiment notebook added

Browse files
eda/eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
experiment_notebooks/Experiment 1.ipynb ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2c30c254",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import warnings\n",
12
+ "warnings.filterwarnings(\"ignore\")\n",
13
+ "\n",
14
+ "import pandas as pd\n",
15
+ "import numpy as np\n",
16
+ "import matplotlib.pyplot as plt\n",
17
+ "import seaborn as sns\n",
18
+ "import tensorflow as tf\n",
19
+ "#import tensorflow_gpu\n",
20
+ "import urllib\n",
21
+ "from tensorflow.keras.layers import TextVectorization\n",
22
+ "from tensorflow.keras.models import Sequential\n",
23
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
24
+ "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
25
+ "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
26
+ "\n",
27
+ "import nltk\n",
28
+ "from nltk.corpus import stopwords\n",
29
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
30
+ "import re\n",
31
+ "import string\n",
32
+ "nltk.download('stopwords')\n",
33
+ "nltk.download('omw-1.4')\n",
34
+ "nltk.download('wordnet')\n",
35
+ "nltk.download('wordnet2022')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "2487874b",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "def tf_tpu_or_gpu(device: str='gpu'):\n",
46
+ " if device.lower() == 'gpu':\n",
47
+ " print(\"Setting up GPU.....\")\n",
48
+ " device_name = tf.test.gpu_device_name()\n",
49
+ " if \"GPU\" not in device_name:\n",
50
+ " print(\"GPU device not found\")\n",
51
+ " print('Found GPU at: {}'.format(device_name))\n",
52
+ " config = tf.compat.v1.ConfigProto() \n",
53
+ " config.gpu_options.allow_growth = True \n",
54
+ " sess = tf.compat.v1.Session(config=config) \n",
55
+ " tf.compat.v1.keras.backend.set_session(sess)\n",
56
+ " print(config)\n",
57
+ " \n",
58
+ " elif device.lower() == 'tpu':\n",
59
+ " print(\"Setting up TPU.....\")\n",
60
+ " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
61
+ " print('Running on TPU ', tpu.master())\n",
62
+ " tf.config.experimental_connect_to_cluster(tpu)\n",
63
+ " tf.tpu.experimental.initialize_tpu_system(tpu)\n",
64
+ " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
65
+ " print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
66
+ "\n",
67
+ " else:\n",
68
+ " raise Exception(\"Wrong Device Parameter Passed\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "4fb1df02",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "tf_tpu_or_gpu(device='tpu')"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "3377596d",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "class Config:\n",
89
+ " URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
90
+ " FILE_NAME = \"toxic_comment_data.csv\"\n",
91
+ " VOCAB_SIZE = 200000\n",
92
+ " OUTPUT_DIM = 1800\n",
93
+ " BUFFER_SIZE = 160000\n",
94
+ " BATCH_SIZE = 16*8\n",
95
+ " EPOCHS = 10\n",
96
+ " BASE_LOG_DIR = \"log_dir\"\n",
97
+ " CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "6ca4db64",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
108
+ "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n",
109
+ "data.head()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "3f687273",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "X = data['comment_text']\n",
120
+ "y = data[data.columns[2:]].values"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "403cbd7d",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "X"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "e012a53e",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "y"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "c6db618c",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
151
+ " output_sequence_length=Config.OUTPUT_DIM,\n",
152
+ " output_mode='int')\n",
153
+ "vectorizer.adapt(X.values)\n",
154
+ "vectorized_text = vectorizer(X.values)"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "c5b25ecc",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
165
+ "dataset = dataset.cache()\n",
166
+ "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n",
167
+ "dataset = dataset.batch(Config.BATCH_SIZE)\n",
168
+ "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "id": "a60be072",
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "train = dataset.take(int(len(dataset)*0.8))\n",
179
+ "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
180
+ "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "id": "6d4c3d18",
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "def callbacks(base_dir=\".\"):\n",
191
+ " early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
192
+ " ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
193
+ " os.makedirs(ckpt_file,exist_ok=True)\n",
194
+ "\n",
195
+ " ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
196
+ " filepath = ckpt_file,\n",
197
+ " save_best_only = True)\n",
198
+ "\n",
199
+ " callback_list = [early_stopping,\n",
200
+ " ckpt_cb]\n",
201
+ " return callback_list\n",
202
+ "callbacks_list = callbacks()"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "id": "8cf70d04",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "def create_model():\n",
213
+ " LAYERS = [\n",
214
+ " Embedding(Config.VOCAB_SIZE+1, 32),\n",
215
+ " Bidirectional(LSTM(64, activation='tanh')),\n",
216
+ " Dense(128, activation='relu'),\n",
217
+ " Dense(256, activation='relu'),\n",
218
+ " Dense(128, activation='relu'),\n",
219
+ " Dense(6, activation='sigmoid')]\n",
220
+ " \n",
221
+ " model = Sequential(LAYERS)\n",
222
+ " return model"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": null,
228
+ "id": "26a56966",
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "with tpu_strategy.scope():\n",
233
+ " model = create_model()\n",
234
+ " model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
235
+ " loss=tf.keras.losses.binary_crossentropy,\n",
236
+ " metrics=AUC(multi_label=True, num_labels=6))\n",
237
+ "model.summary()"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "id": "891727f6",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "history = model.fit(train, \n",
248
+ " epochs=Config.EPOCHS,\n",
249
+ " steps_per_epoch=len(train),\n",
250
+ " validation_data=val,\n",
251
+ " callbacks=callbacks_list)"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": null,
257
+ "id": "533cd762",
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
262
+ " y_pred = model.predict(pred_data)\n",
263
+ " try:\n",
264
+ " precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
265
+ " recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
266
+ " f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
267
+ " auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
268
+ " except Exception as e:\n",
269
+ " print(e)\n",
270
+ " \n",
271
+ " print(f\"Precision: {precision}\\n\"\n",
272
+ " f\"Recall: {recall}\\n\"\n",
273
+ " f\"F1-Score: {f1}\\n\"\n",
274
+ " f\"ROC-AUC-Score: {auc}\")\n",
275
+ " return (precision, recall, f1, auc)"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": null,
281
+ "id": "a2f19754",
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "model.save(\"baseline_model_1.h5\")"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "id": "314be9bc",
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "x_train = np.concatenate([x for x, y in train])\n",
296
+ "y_train = np.concatenate([y for x, y in train])\n",
297
+ "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "id": "ec45f5ad",
304
+ "metadata": {},
305
+ "outputs": [],
306
+ "source": [
307
+ "x_val = np.concatenate([x for x, y in val])\n",
308
+ "y_val = np.concatenate([y for x, y in val])\n",
309
+ "result_train=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
310
+ ]
311
+ }
312
+ ],
313
+ "metadata": {
314
+ "kernelspec": {
315
+ "display_name": "Python 3 (ipykernel)",
316
+ "language": "python",
317
+ "name": "python3"
318
+ },
319
+ "language_info": {
320
+ "codemirror_mode": {
321
+ "name": "ipython",
322
+ "version": 3
323
+ },
324
+ "file_extension": ".py",
325
+ "mimetype": "text/x-python",
326
+ "name": "python",
327
+ "nbconvert_exporter": "python",
328
+ "pygments_lexer": "ipython3",
329
+ "version": "3.9.12"
330
+ }
331
+ },
332
+ "nbformat": 4,
333
+ "nbformat_minor": 5
334
+ }
experiment_notebooks/Experiment 2.ipynb ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2c30c254",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import warnings\n",
12
+ "warnings.filterwarnings(\"ignore\")\n",
13
+ "\n",
14
+ "import pandas as pd\n",
15
+ "import numpy as np\n",
16
+ "import matplotlib.pyplot as plt\n",
17
+ "import seaborn as sns\n",
18
+ "import tensorflow as tf\n",
19
+ "#import tensorflow_gpu\n",
20
+ "import urllib\n",
21
+ "from tensorflow.keras.layers import TextVectorization\n",
22
+ "from tensorflow.keras.models import Sequential\n",
23
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
24
+ "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
25
+ "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
26
+ "\n",
27
+ "import nltk\n",
28
+ "from nltk.corpus import stopwords\n",
29
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
30
+ "import re\n",
31
+ "import string\n",
32
+ "nltk.download('stopwords')\n",
33
+ "nltk.download('omw-1.4')\n",
34
+ "nltk.download('wordnet')\n",
35
+ "nltk.download('wordnet2022')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "2487874b",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "def tf_tpu_or_gpu(device: str='gpu'):\n",
46
+ " if device.lower() == 'gpu':\n",
47
+ " print(\"Setting up GPU.....\")\n",
48
+ " device_name = tf.test.gpu_device_name()\n",
49
+ " if \"GPU\" not in device_name:\n",
50
+ " print(\"GPU device not found\")\n",
51
+ " print('Found GPU at: {}'.format(device_name))\n",
52
+ " config = tf.compat.v1.ConfigProto() \n",
53
+ " config.gpu_options.allow_growth = True \n",
54
+ " sess = tf.compat.v1.Session(config=config) \n",
55
+ " tf.compat.v1.keras.backend.set_session(sess)\n",
56
+ " print(config)\n",
57
+ " \n",
58
+ " elif device.lower() == 'tpu':\n",
59
+ " print(\"Setting up TPU.....\")\n",
60
+ " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
61
+ " print('Running on TPU ', tpu.master())\n",
62
+ " tf.config.experimental_connect_to_cluster(tpu)\n",
63
+ " tf.tpu.experimental.initialize_tpu_system(tpu)\n",
64
+ " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
65
+ " print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
66
+ "\n",
67
+ " else:\n",
68
+ " raise Exception(\"Wrong Device Parameter Passed\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "4fb1df02",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "tf_tpu_or_gpu(device='tpu')"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "3377596d",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "class Config:\n",
89
+ " URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
90
+ " FILE_NAME = \"toxic_comment_data.csv\"\n",
91
+ " VOCAB_SIZE = 200000\n",
92
+ " OUTPUT_DIM = 1800\n",
93
+ " BUFFER_SIZE = 160000\n",
94
+ " BATCH_SIZE = 16*8\n",
95
+ " EPOCHS = 10\n",
96
+ " BASE_LOG_DIR = \"log_dir\"\n",
97
+ " CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "6ca4db64",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
108
+ "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n",
109
+ "data.head()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "3f687273",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "X = data['comment_text']\n",
120
+ "y = data[data.columns[2:]].values"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "403cbd7d",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "X"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "e012a53e",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "y"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "d383e72a",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "class Text_Cleaner:\n",
151
+ " def __init__(self, data):\n",
152
+ " self.data = data\n",
153
+ " self.STOPWORDS = stopwords.words('english')\n",
154
+ " self.wordnet = WordNetLemmatizer()\n",
155
+ " \n",
156
+ " def new_line_code(self, x:str)->str:\n",
157
+ " pattern = \"\\n\"\n",
158
+ " x = re.sub(pattern,' ', x).strip().lower()\n",
159
+ " return x\n",
160
+ "\n",
161
+ " def remove_punctuations(self, x:str)->str:\n",
162
+ " x = x.translate(str.maketrans('','',string.punctuation))\n",
163
+ " return x\n",
164
+ "\n",
165
+ " def remove_stopwords(self, x:str)->str:\n",
166
+ " sent=[]\n",
167
+ " for word in x.split():\n",
168
+ " if word not in self.STOPWORDS:\n",
169
+ " sent.append(word)\n",
170
+ " return ' '.join(sent)\n",
171
+ "\n",
172
+ " def lemmatization(self, x:str)->str:\n",
173
+ " sent=[]\n",
174
+ " for word in x.split():\n",
175
+ " sent.append(self.wordnet.lemmatize(word))\n",
176
+ " return ' '.join(sent)\n",
177
+ " \n",
178
+ " def clean_text(self):\n",
179
+ " self.data = self.data.apply(self.new_line_code)\n",
180
+ " self.data = self.data.apply(self.remove_punctuations)\n",
181
+ " self.data = self.data.apply(self.remove_stopwords)\n",
182
+ " self.data = self.data.apply(self.lemmatization)\n",
183
+ " self.data = self.data.apply(lambda x: x.strip())\n",
184
+ " return self.data"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "b121fd12",
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "X = Text_Cleaner(X).clean_text()"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "id": "81c860cf",
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "X"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "d5b374af",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
215
+ " output_sequence_length=Config.OUTPUT_DIM,\n",
216
+ " output_mode='int')\n",
217
+ "vectorizer.adapt(X.values)\n",
218
+ "vectorized_text = vectorizer(X.values)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "id": "c5b25ecc",
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
229
+ "dataset = dataset.cache()\n",
230
+ "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n",
231
+ "dataset = dataset.batch(Config.BATCH_SIZE)\n",
232
+ "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "id": "a60be072",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "train = dataset.take(int(len(dataset)*0.8))\n",
243
+ "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
244
+ "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "id": "6d4c3d18",
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "def callbacks(base_dir=\".\"):\n",
255
+ " early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
256
+ " ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
257
+ " os.makedirs(ckpt_file,exist_ok=True)\n",
258
+ "\n",
259
+ " ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
260
+ " filepath = ckpt_file,\n",
261
+ " save_best_only = True)\n",
262
+ "\n",
263
+ " callback_list = [early_stopping,\n",
264
+ " ckpt_cb]\n",
265
+ " return callback_list\n",
266
+ "callbacks_list = callbacks()"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "id": "8cf70d04",
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "def create_model():\n",
277
+ " LAYERS = [\n",
278
+ " Embedding(Config.VOCAB_SIZE+1, 32),\n",
279
+ " Bidirectional(LSTM(64, activation='tanh')),\n",
280
+ " Dense(128, activation='relu'),\n",
281
+ " Dense(256, activation='relu'),\n",
282
+ " Dense(128, activation='relu'),\n",
283
+ " Dense(6, activation='sigmoid')]\n",
284
+ " \n",
285
+ " model = Sequential(LAYERS)\n",
286
+ " return model"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "id": "26a56966",
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": [
296
+ "with tpu_strategy.scope():\n",
297
+ " model = create_model()\n",
298
+ " model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
299
+ " loss=tf.keras.losses.binary_crossentropy,\n",
300
+ " metrics=AUC(multi_label=True, num_labels=6))\n",
301
+ "model.summary()"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "id": "891727f6",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "history = model.fit(train, \n",
312
+ " epochs=Config.EPOCHS,\n",
313
+ " steps_per_epoch=len(train),\n",
314
+ " validation_data=val,\n",
315
+ " callbacks=callbacks_list)"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "id": "533cd762",
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
326
+ " y_pred = model.predict(pred_data)\n",
327
+ " try:\n",
328
+ " precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
329
+ " recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
330
+ " f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
331
+ " auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
332
+ " except Exception as e:\n",
333
+ " print(e)\n",
334
+ " \n",
335
+ " print(f\"Precision: {precision}\\n\"\n",
336
+ " f\"Recall: {recall}\\n\"\n",
337
+ " f\"F1-Score: {f1}\\n\"\n",
338
+ " f\"ROC-AUC-Score: {auc}\")\n",
339
+ " return (precision, recall, f1, auc)"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "id": "a2f19754",
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": [
349
+ "model.save(\"model_2.h5\")"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "id": "314be9bc",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "x_train = np.concatenate([x for x, y in train])\n",
360
+ "y_train = np.concatenate([y for x, y in train])\n",
361
+ "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "id": "ec45f5ad",
368
+ "metadata": {},
369
+ "outputs": [],
370
+ "source": [
371
+ "x_val = np.concatenate([x for x, y in val])\n",
372
+ "y_val = np.concatenate([y for x, y in val])\n",
373
+ "result_train=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
374
+ ]
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "kernelspec": {
379
+ "display_name": "Python 3 (ipykernel)",
380
+ "language": "python",
381
+ "name": "python3"
382
+ },
383
+ "language_info": {
384
+ "codemirror_mode": {
385
+ "name": "ipython",
386
+ "version": 3
387
+ },
388
+ "file_extension": ".py",
389
+ "mimetype": "text/x-python",
390
+ "name": "python",
391
+ "nbconvert_exporter": "python",
392
+ "pygments_lexer": "ipython3",
393
+ "version": "3.9.12"
394
+ }
395
+ },
396
+ "nbformat": 4,
397
+ "nbformat_minor": 5
398
+ }
experiment_notebooks/Experiment 3.ipynb ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2c30c254",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import warnings\n",
12
+ "warnings.filterwarnings(\"ignore\")\n",
13
+ "\n",
14
+ "import pandas as pd\n",
15
+ "import numpy as np\n",
16
+ "import matplotlib.pyplot as plt\n",
17
+ "import seaborn as sns\n",
18
+ "import tensorflow as tf\n",
19
+ "#import tensorflow_gpu\n",
20
+ "import urllib\n",
21
+ "from tensorflow.keras.layers import TextVectorization\n",
22
+ "from tensorflow.keras.models import Sequential\n",
23
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
24
+ "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
25
+ "from sklearn.metrics import roc_auc_score, f1_score\n",
26
+ "\n",
27
+ "import nltk\n",
28
+ "from nltk.corpus import stopwords\n",
29
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
30
+ "import re\n",
31
+ "import string\n",
32
+ "nltk.download('stopwords')\n",
33
+ "nltk.download('omw-1.4')\n",
34
+ "nltk.download('wordnet')\n",
35
+ "nltk.download('wordnet2022')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "2487874b",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "def tf_tpu_or_gpu(device: str='gpu'):\n",
46
+ " if device.lower() == 'gpu':\n",
47
+ " print(\"Setting up GPU.....\")\n",
48
+ " device_name = tf.test.gpu_device_name()\n",
49
+ " if \"GPU\" not in device_name:\n",
50
+ " print(\"GPU device not found\")\n",
51
+ " print('Found GPU at: {}'.format(device_name))\n",
52
+ " config = tf.compat.v1.ConfigProto() \n",
53
+ " config.gpu_options.allow_growth = True \n",
54
+ " sess = tf.compat.v1.Session(config=config) \n",
55
+ " tf.compat.v1.keras.backend.set_session(sess)\n",
56
+ " print(config)\n",
57
+ " \n",
58
+ " elif device.lower() == 'tpu':\n",
59
+ " print(\"Setting up TPU.....\")\n",
60
+ " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
61
+ " print('Running on TPU ', tpu.master())\n",
62
+ " tf.config.experimental_connect_to_cluster(tpu)\n",
63
+ " tf.tpu.experimental.initialize_tpu_system(tpu)\n",
64
+ " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
65
+ " print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
66
+ "\n",
67
+ " else:\n",
68
+ " raise Exception(\"Wrong Device Parameter Passed\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "4fb1df02",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "tf_tpu_or_gpu(device='tpu')"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "3377596d",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "class Config:\n",
89
+ " URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
90
+ " FILE_NAME = \"toxic_comment_data.csv\"\n",
91
+ " VOCAB_SIZE = 200000\n",
92
+ " OUTPUT_DIM = 1800\n",
93
+ " BUFFER_SIZE = 160000\n",
94
+ " BATCH_SIZE = 16*8\n",
95
+ " EPOCHS = 10\n",
96
+ " BASE_LOG_DIR = \"log_dir\"\n",
97
+ " CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "6ca4db64",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
108
+ "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n",
109
+ "data.head()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "3f687273",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "X = data['comment_text']\n",
120
+ "y = data[data.columns[2:]].values"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "403cbd7d",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "X"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "e012a53e",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "y"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "d383e72a",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "class Text_Cleaner:\n",
151
+ " def __init__(self, data):\n",
152
+ " self.data = data\n",
153
+ " self.STOPWORDS = stopwords.words('english')\n",
154
+ " self.wordnet = WordNetLemmatizer()\n",
155
+ " \n",
156
+ " def new_line_code(self, x:str)->str:\n",
157
+ " pattern = \"\\n\"\n",
158
+ " x = re.sub(pattern,' ', x).strip().lower()\n",
159
+ " return x\n",
160
+ "\n",
161
+ " def remove_punctuations(self, x:str)->str:\n",
162
+ " x = x.translate(str.maketrans('','',string.punctuation))\n",
163
+ " return x\n",
164
+ "\n",
165
+ " def remove_stopwords(self, x:str)->str:\n",
166
+ " sent=[]\n",
167
+ " for word in x.split():\n",
168
+ " if word not in self.STOPWORDS:\n",
169
+ " sent.append(word)\n",
170
+ " return ' '.join(sent)\n",
171
+ "\n",
172
+ " def lemmatization(self, x:str)->str:\n",
173
+ " sent=[]\n",
174
+ " for word in x.split():\n",
175
+ " sent.append(self.wordnet.lemmatize(word))\n",
176
+ " return ' '.join(sent)\n",
177
+ " \n",
178
+ " def clean_text(self):\n",
179
+ " self.data = self.data.apply(self.new_line_code)\n",
180
+ " self.data = self.data.apply(self.remove_punctuations)\n",
181
+ " self.data = self.data.apply(self.remove_stopwords)\n",
182
+ " self.data = self.data.apply(self.lemmatization)\n",
183
+ " self.data = self.data.apply(lambda x: x.strip())\n",
184
+ " return self.data"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "b121fd12",
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "X = Text_Cleaner(X).clean_text()"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "id": "81c860cf",
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "X"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "d5b374af",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
215
+ " output_sequence_length=Config.OUTPUT_DIM,\n",
216
+ " output_mode='int')\n",
217
+ "vectorizer.adapt(X.values)\n",
218
+ "vectorized_text = vectorizer(X.values)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "id": "c5b25ecc",
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
229
+ "dataset = dataset.cache()\n",
230
+ "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n",
231
+ "dataset = dataset.batch(Config.BATCH_SIZE)\n",
232
+ "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "id": "a60be072",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "train = dataset.take(int(len(dataset)*0.8))\n",
243
+ "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
244
+ "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "id": "6d4c3d18",
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "def callbacks(base_dir=\".\"):\n",
255
+ " early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
256
+ " ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
257
+ " os.makedirs(ckpt_file,exist_ok=True)\n",
258
+ "\n",
259
+ " ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
260
+ " filepath = ckpt_file,\n",
261
+ " save_best_only = True)\n",
262
+ "\n",
263
+ " callback_list = [early_stopping,\n",
264
+ " ckpt_cb]\n",
265
+ " return callback_list\n",
266
+ "callbacks_list = callbacks()"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "id": "8cf70d04",
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "def create_model():\n",
277
+ " LAYERS = [\n",
278
+ " Embedding(Config.VOCAB_SIZE+1, 32),\n",
279
+ " Bidirectional(LSTM(64, return_sequences=True, activation='tanh')),\n",
280
+ " Bidirectional(LSTM(32)),\n",
281
+ " Dense(128, activation='relu'),\n",
282
+ " Dense(256, activation='relu'),\n",
283
+ " Dense(128, activation='relu'),\n",
284
+ " Dense(6, activation='sigmoid')]\n",
285
+ " \n",
286
+ " model = Sequential(LAYERS)\n",
287
+ " return model"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "26a56966",
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "with tpu_strategy.scope():\n",
298
+ " model = create_model()\n",
299
+ " model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
300
+ " loss=tf.keras.losses.binary_crossentropy,\n",
301
+ " metrics=[tf.keras.metrics.AUC(multi_label=True, num_labels=6)])\n",
302
+ "model.summary()"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "id": "891727f6",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "history = model.fit(train, \n",
313
+ " epochs=Config.EPOCHS,\n",
314
+ " steps_per_epoch=len(train),\n",
315
+ " validation_data=val,\n",
316
+ " callbacks=callbacks_list)"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "id": "533cd762",
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
327
+ " y_pred = model.predict(pred_data)\n",
328
+ " try:\n",
329
+ " precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
330
+ " recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
331
+ " f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
332
+ " auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
333
+ " except Exception as e:\n",
334
+ " print(e)\n",
335
+ " \n",
336
+ " print(f\"Precision: {precision}\\n\"\n",
337
+ " f\"Recall: {recall}\\n\"\n",
338
+ " f\"F1-Score: {f1}\\n\"\n",
339
+ " f\"ROC-AUC-Score: {auc}\")\n",
340
+ " return (precision, recall, f1, auc)"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "a2f19754",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "model.save(\"model_3.h5\")"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "314be9bc",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "x_train = np.concatenate([x for x, y in train])\n",
361
+ "y_train = np.concatenate([y for x, y in train])\n",
362
+ "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "id": "ec45f5ad",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "x_val = np.concatenate([x for x, y in val])\n",
373
+ "y_val = np.concatenate([y for x, y in val])\n",
374
+ "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
375
+ ]
376
+ }
377
+ ],
378
+ "metadata": {
379
+ "kernelspec": {
380
+ "display_name": "Python 3 (ipykernel)",
381
+ "language": "python",
382
+ "name": "python3"
383
+ },
384
+ "language_info": {
385
+ "codemirror_mode": {
386
+ "name": "ipython",
387
+ "version": 3
388
+ },
389
+ "file_extension": ".py",
390
+ "mimetype": "text/x-python",
391
+ "name": "python",
392
+ "nbconvert_exporter": "python",
393
+ "pygments_lexer": "ipython3",
394
+ "version": "3.9.12"
395
+ }
396
+ },
397
+ "nbformat": 4,
398
+ "nbformat_minor": 5
399
+ }
experiment_notebooks/Experiment 4.ipynb ADDED
@@ -0,0 +1,1475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "!pip install nltk scikit-learn"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2023-05-03T14:13:06.118200Z",
18
+ "iopub.status.busy": "2023-05-03T14:13:06.117322Z",
19
+ "iopub.status.idle": "2023-05-03T14:13:36.869507Z",
20
+ "shell.execute_reply": "2023-05-03T14:13:36.868619Z",
21
+ "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z"
22
+ },
23
+ "scrolled": true
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "import warnings\n",
29
+ "warnings.filterwarnings(\"ignore\")\n",
30
+ "\n",
31
+ "import pandas as pd\n",
32
+ "import numpy as np\n",
33
+ "import matplotlib.pyplot as plt\n",
34
+ "#import seaborn as sns\n",
35
+ "import tensorflow as tf\n",
36
+ "#import tensorflow_gpu\n",
37
+ "import urllib\n",
38
+ "from tensorflow.keras.layers import TextVectorization\n",
39
+ "from tensorflow.keras.models import Sequential\n",
40
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
41
+ "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
42
+ "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
43
+ "\n",
44
+ "import nltk\n",
45
+ "from nltk.corpus import stopwords\n",
46
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
47
+ "import re\n",
48
+ "import string\n",
49
+ "nltk.download('stopwords')\n",
50
+ "nltk.download('omw-1.4')\n",
51
+ "nltk.download('wordnet')\n",
52
+ "nltk.download('wordnet2022')"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "def tf_tpu_or_gpu(device: str='gpu'):\n",
62
+ " if device.lower() == 'gpu':\n",
63
+ " print(\"Setting up GPU.....\")\n",
64
+ " device_name = tf.test.gpu_device_name()\n",
65
+ " if \"GPU\" not in device_name:\n",
66
+ " print(\"GPU device not found\")\n",
67
+ " print('Found GPU at: {}'.format(device_name))\n",
68
+ " \n",
69
+ " config = tf.compat.v1.ConfigProto() \n",
70
+ " config.gpu_options.allow_growth = True \n",
71
+ " sess = tf.compat.v1.Session(config=config) \n",
72
+ " tf.compat.v1.keras.backend.set_session(sess)\n",
73
+ " \n",
74
+ " print(config)\n",
75
+ " \n",
76
+ " elif device.lower() == 'tpu':\n",
77
+ " print(\"Setting up TPU.....\")\n",
78
+ " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
79
+ " print('Running on TPU ', tpu.master())\n",
80
+ " tf.config.experimental_connect_to_cluster(tpu)\n",
81
+ " tf.tpu.experimental.initialize_tpu_system(tpu)\n",
82
+ " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
83
+ " print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
84
+ " \n",
85
+ " else:\n",
86
+ " raise Exception(\"Wrong Device Parameter Passed\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "tf_tpu_or_gpu(device='tpu')"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 4,
101
+ "metadata": {
102
+ "execution": {
103
+ "iopub.execute_input": "2023-05-03T14:16:10.072253Z",
104
+ "iopub.status.busy": "2023-05-03T14:16:10.071138Z",
105
+ "iopub.status.idle": "2023-05-03T14:16:19.830833Z",
106
+ "shell.execute_reply": "2023-05-03T14:16:19.829780Z",
107
+ "shell.execute_reply.started": "2023-05-03T14:16:10.072215Z"
108
+ }
109
+ },
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "Running on TPU \n",
116
+ "INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\n",
117
+ "INFO:tensorflow:Initializing the TPU system: local\n",
118
+ "INFO:tensorflow:Finished initializing TPU system.\n",
119
+ "INFO:tensorflow:Found TPU system:\n",
120
+ "INFO:tensorflow:*** Num TPU Cores: 8\n",
121
+ "INFO:tensorflow:*** Num TPU Workers: 1\n",
122
+ "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n",
123
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n",
124
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n",
125
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n",
126
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n",
127
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n",
128
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n",
129
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n",
130
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n",
131
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n",
132
+ "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n",
133
+ "REPLICAS: 8\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
139
+ "print('Running on TPU ', tpu.master())\n",
140
+ "tf.config.experimental_connect_to_cluster(tpu)\n",
141
+ "tf.tpu.experimental.initialize_tpu_system(tpu)\n",
142
+ "tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
143
+ "print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "markdown",
148
+ "metadata": {},
149
+ "source": [
150
+ "device_name = tf.test.gpu_device_name()\n",
151
+ "if \"GPU\" not in device_name:\n",
152
+ " print(\"GPU device not found\")\n",
153
+ "print('Found GPU at: {}'.format(device_name))"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "metadata": {},
159
+ "source": [
160
+ "config = tf.compat.v1.ConfigProto() \n",
161
+ "config.gpu_options.allow_growth = True \n",
162
+ "sess = tf.compat.v1.Session(config=config) \n",
163
+ "tf.compat.v1.keras.backend.set_session(sess)"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 5,
169
+ "metadata": {
170
+ "execution": {
171
+ "iopub.execute_input": "2023-05-03T14:16:24.940878Z",
172
+ "iopub.status.busy": "2023-05-03T14:16:24.940140Z",
173
+ "iopub.status.idle": "2023-05-03T14:16:24.946837Z",
174
+ "shell.execute_reply": "2023-05-03T14:16:24.945707Z",
175
+ "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z"
176
+ }
177
+ },
178
+ "outputs": [],
179
+ "source": [
180
+ "class Config:\n",
181
+ " URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
182
+ " FILE_NAME = \"toxic_comment_data.csv\"\n",
183
+ " VOCAB_SIZE = 200000\n",
184
+ " OUTPUT_DIM = 1800\n",
185
+ " BUFFER_SIZE = 160000\n",
186
+ " BATCH_SIZE = 16*8\n",
187
+ " EPOCHS = 10\n",
188
+ " BASE_LOG_DIR = \"log_dir\"\n",
189
+ " CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 6,
195
+ "metadata": {
196
+ "execution": {
197
+ "iopub.execute_input": "2023-05-03T14:16:29.171506Z",
198
+ "iopub.status.busy": "2023-05-03T14:16:29.170711Z",
199
+ "iopub.status.idle": "2023-05-03T14:16:30.613189Z",
200
+ "shell.execute_reply": "2023-05-03T14:16:30.612012Z",
201
+ "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z"
202
+ }
203
+ },
204
+ "outputs": [
205
+ {
206
+ "data": {
207
+ "text/html": [
208
+ "<div>\n",
209
+ "<style scoped>\n",
210
+ " .dataframe tbody tr th:only-of-type {\n",
211
+ " vertical-align: middle;\n",
212
+ " }\n",
213
+ "\n",
214
+ " .dataframe tbody tr th {\n",
215
+ " vertical-align: top;\n",
216
+ " }\n",
217
+ "\n",
218
+ " .dataframe thead th {\n",
219
+ " text-align: right;\n",
220
+ " }\n",
221
+ "</style>\n",
222
+ "<table border=\"1\" class=\"dataframe\">\n",
223
+ " <thead>\n",
224
+ " <tr style=\"text-align: right;\">\n",
225
+ " <th></th>\n",
226
+ " <th>id</th>\n",
227
+ " <th>comment_text</th>\n",
228
+ " <th>toxic</th>\n",
229
+ " <th>severe_toxic</th>\n",
230
+ " <th>obscene</th>\n",
231
+ " <th>threat</th>\n",
232
+ " <th>insult</th>\n",
233
+ " <th>identity_hate</th>\n",
234
+ " </tr>\n",
235
+ " </thead>\n",
236
+ " <tbody>\n",
237
+ " <tr>\n",
238
+ " <th>0</th>\n",
239
+ " <td>0000997932d777bf</td>\n",
240
+ " <td>Explanation\\nWhy the edits made under my usern...</td>\n",
241
+ " <td>0</td>\n",
242
+ " <td>0</td>\n",
243
+ " <td>0</td>\n",
244
+ " <td>0</td>\n",
245
+ " <td>0</td>\n",
246
+ " <td>0</td>\n",
247
+ " </tr>\n",
248
+ " <tr>\n",
249
+ " <th>1</th>\n",
250
+ " <td>000103f0d9cfb60f</td>\n",
251
+ " <td>D'aww! He matches this background colour I'm s...</td>\n",
252
+ " <td>0</td>\n",
253
+ " <td>0</td>\n",
254
+ " <td>0</td>\n",
255
+ " <td>0</td>\n",
256
+ " <td>0</td>\n",
257
+ " <td>0</td>\n",
258
+ " </tr>\n",
259
+ " <tr>\n",
260
+ " <th>2</th>\n",
261
+ " <td>000113f07ec002fd</td>\n",
262
+ " <td>Hey man, I'm really not trying to edit war. It...</td>\n",
263
+ " <td>0</td>\n",
264
+ " <td>0</td>\n",
265
+ " <td>0</td>\n",
266
+ " <td>0</td>\n",
267
+ " <td>0</td>\n",
268
+ " <td>0</td>\n",
269
+ " </tr>\n",
270
+ " <tr>\n",
271
+ " <th>3</th>\n",
272
+ " <td>0001b41b1c6bb37e</td>\n",
273
+ " <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
274
+ " <td>0</td>\n",
275
+ " <td>0</td>\n",
276
+ " <td>0</td>\n",
277
+ " <td>0</td>\n",
278
+ " <td>0</td>\n",
279
+ " <td>0</td>\n",
280
+ " </tr>\n",
281
+ " <tr>\n",
282
+ " <th>4</th>\n",
283
+ " <td>0001d958c54c6e35</td>\n",
284
+ " <td>You, sir, are my hero. Any chance you remember...</td>\n",
285
+ " <td>0</td>\n",
286
+ " <td>0</td>\n",
287
+ " <td>0</td>\n",
288
+ " <td>0</td>\n",
289
+ " <td>0</td>\n",
290
+ " <td>0</td>\n",
291
+ " </tr>\n",
292
+ " </tbody>\n",
293
+ "</table>\n",
294
+ "</div>"
295
+ ],
296
+ "text/plain": [
297
+ " id comment_text toxic \n",
298
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n",
299
+ "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n",
300
+ "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n",
301
+ "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n",
302
+ "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n",
303
+ "\n",
304
+ " severe_toxic obscene threat insult identity_hate \n",
305
+ "0 0 0 0 0 0 \n",
306
+ "1 0 0 0 0 0 \n",
307
+ "2 0 0 0 0 0 \n",
308
+ "3 0 0 0 0 0 \n",
309
+ "4 0 0 0 0 0 "
310
+ ]
311
+ },
312
+ "execution_count": 6,
313
+ "metadata": {},
314
+ "output_type": "execute_result"
315
+ }
316
+ ],
317
+ "source": [
318
+ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
319
+ "data = pd.read_csv(Config.FILE_NAME)\n",
320
+ "data.head()"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 7,
326
+ "metadata": {
327
+ "execution": {
328
+ "iopub.execute_input": "2023-05-03T14:16:37.492444Z",
329
+ "iopub.status.busy": "2023-05-03T14:16:37.491342Z",
330
+ "iopub.status.idle": "2023-05-03T14:16:37.533400Z",
331
+ "shell.execute_reply": "2023-05-03T14:16:37.532235Z",
332
+ "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z"
333
+ }
334
+ },
335
+ "outputs": [
336
+ {
337
+ "name": "stdout",
338
+ "output_type": "stream",
339
+ "text": [
340
+ "<class 'pandas.core.frame.DataFrame'>\n",
341
+ "RangeIndex: 159571 entries, 0 to 159570\n",
342
+ "Data columns (total 8 columns):\n",
343
+ " # Column Non-Null Count Dtype \n",
344
+ "--- ------ -------------- ----- \n",
345
+ " 0 id 159571 non-null object\n",
346
+ " 1 comment_text 159571 non-null object\n",
347
+ " 2 toxic 159571 non-null int64 \n",
348
+ " 3 severe_toxic 159571 non-null int64 \n",
349
+ " 4 obscene 159571 non-null int64 \n",
350
+ " 5 threat 159571 non-null int64 \n",
351
+ " 6 insult 159571 non-null int64 \n",
352
+ " 7 identity_hate 159571 non-null int64 \n",
353
+ "dtypes: int64(6), object(2)\n",
354
+ "memory usage: 9.7+ MB\n"
355
+ ]
356
+ }
357
+ ],
358
+ "source": [
359
+ "data.info()"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 8,
365
+ "metadata": {
366
+ "execution": {
367
+ "iopub.execute_input": "2023-05-03T14:16:41.586932Z",
368
+ "iopub.status.busy": "2023-05-03T14:16:41.585997Z",
369
+ "iopub.status.idle": "2023-05-03T14:16:41.618902Z",
370
+ "shell.execute_reply": "2023-05-03T14:16:41.617979Z",
371
+ "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z"
372
+ }
373
+ },
374
+ "outputs": [
375
+ {
376
+ "data": {
377
+ "text/plain": [
378
+ "id 0\n",
379
+ "comment_text 0\n",
380
+ "toxic 0\n",
381
+ "severe_toxic 0\n",
382
+ "obscene 0\n",
383
+ "threat 0\n",
384
+ "insult 0\n",
385
+ "identity_hate 0\n",
386
+ "dtype: int64"
387
+ ]
388
+ },
389
+ "execution_count": 8,
390
+ "metadata": {},
391
+ "output_type": "execute_result"
392
+ }
393
+ ],
394
+ "source": [
395
+ "data.isnull().sum()"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 9,
401
+ "metadata": {
402
+ "execution": {
403
+ "iopub.execute_input": "2023-05-03T14:16:44.561198Z",
404
+ "iopub.status.busy": "2023-05-03T14:16:44.560414Z",
405
+ "iopub.status.idle": "2023-05-03T14:16:44.586487Z",
406
+ "shell.execute_reply": "2023-05-03T14:16:44.585582Z",
407
+ "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z"
408
+ }
409
+ },
410
+ "outputs": [
411
+ {
412
+ "data": {
413
+ "text/html": [
414
+ "<div>\n",
415
+ "<style scoped>\n",
416
+ " .dataframe tbody tr th:only-of-type {\n",
417
+ " vertical-align: middle;\n",
418
+ " }\n",
419
+ "\n",
420
+ " .dataframe tbody tr th {\n",
421
+ " vertical-align: top;\n",
422
+ " }\n",
423
+ "\n",
424
+ " .dataframe thead th {\n",
425
+ " text-align: right;\n",
426
+ " }\n",
427
+ "</style>\n",
428
+ "<table border=\"1\" class=\"dataframe\">\n",
429
+ " <thead>\n",
430
+ " <tr style=\"text-align: right;\">\n",
431
+ " <th></th>\n",
432
+ " <th>0</th>\n",
433
+ " <th>1</th>\n",
434
+ " </tr>\n",
435
+ " </thead>\n",
436
+ " <tbody>\n",
437
+ " <tr>\n",
438
+ " <th>toxic</th>\n",
439
+ " <td>144277</td>\n",
440
+ " <td>15294</td>\n",
441
+ " </tr>\n",
442
+ " <tr>\n",
443
+ " <th>severe_toxic</th>\n",
444
+ " <td>157976</td>\n",
445
+ " <td>1595</td>\n",
446
+ " </tr>\n",
447
+ " <tr>\n",
448
+ " <th>obscene</th>\n",
449
+ " <td>151122</td>\n",
450
+ " <td>8449</td>\n",
451
+ " </tr>\n",
452
+ " <tr>\n",
453
+ " <th>threat</th>\n",
454
+ " <td>159093</td>\n",
455
+ " <td>478</td>\n",
456
+ " </tr>\n",
457
+ " <tr>\n",
458
+ " <th>insult</th>\n",
459
+ " <td>151694</td>\n",
460
+ " <td>7877</td>\n",
461
+ " </tr>\n",
462
+ " <tr>\n",
463
+ " <th>identity_hate</th>\n",
464
+ " <td>158166</td>\n",
465
+ " <td>1405</td>\n",
466
+ " </tr>\n",
467
+ " </tbody>\n",
468
+ "</table>\n",
469
+ "</div>"
470
+ ],
471
+ "text/plain": [
472
+ " 0 1\n",
473
+ "toxic 144277 15294\n",
474
+ "severe_toxic 157976 1595\n",
475
+ "obscene 151122 8449\n",
476
+ "threat 159093 478\n",
477
+ "insult 151694 7877\n",
478
+ "identity_hate 158166 1405"
479
+ ]
480
+ },
481
+ "execution_count": 9,
482
+ "metadata": {},
483
+ "output_type": "execute_result"
484
+ }
485
+ ],
486
+ "source": [
487
+ "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": 10,
493
+ "metadata": {
494
+ "execution": {
495
+ "iopub.execute_input": "2023-05-03T14:16:51.639830Z",
496
+ "iopub.status.busy": "2023-05-03T14:16:51.639059Z",
497
+ "iopub.status.idle": "2023-05-03T14:16:51.658065Z",
498
+ "shell.execute_reply": "2023-05-03T14:16:51.657049Z",
499
+ "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z"
500
+ }
501
+ },
502
+ "outputs": [
503
+ {
504
+ "name": "stdout",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "toxic value count\n",
508
+ "--------------------\n",
509
+ "0: 144277 | 90.42 %\n",
510
+ "1: 15294 | 9.58 %\n",
511
+ "\n",
512
+ "severe_toxic value count\n",
513
+ "--------------------\n",
514
+ "0: 157976 | 99.0 %\n",
515
+ "1: 1595 | 1.0 %\n",
516
+ "\n",
517
+ "obscene value count\n",
518
+ "--------------------\n",
519
+ "0: 151122 | 94.71 %\n",
520
+ "1: 8449 | 5.29 %\n",
521
+ "\n",
522
+ "threat value count\n",
523
+ "--------------------\n",
524
+ "0: 159093 | 99.7 %\n",
525
+ "1: 478 | 0.3 %\n",
526
+ "\n",
527
+ "insult value count\n",
528
+ "--------------------\n",
529
+ "0: 151694 | 95.06 %\n",
530
+ "1: 7877 | 4.94 %\n",
531
+ "\n",
532
+ "identity_hate value count\n",
533
+ "--------------------\n",
534
+ "0: 158166 | 99.12 %\n",
535
+ "1: 1405 | 0.88 %\n",
536
+ "\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "for column in data.columns:\n",
542
+ " if data[column].dtype != 'O':\n",
543
+ " value_count = data[column].value_counts()\n",
544
+ " print(f\"{column} value count\\n{'--'*10}\")\n",
545
+ " print(f\"0: {value_count[0]} | {round((value_count[0]/data.shape[0])*100,2)} %\\n\"\n",
546
+ " f\"1: {value_count[1]} | {round((value_count[1]/data.shape[0])*100,2)} %\\n\")"
547
+ ]
548
+ },
549
+ {
550
+ "cell_type": "code",
551
+ "execution_count": null,
552
+ "metadata": {},
553
+ "outputs": [],
554
+ "source": [
555
+ "data[\"text_len\"] = data[\"comment_text\"].apply(lambda x: len(x.split()))\n",
556
+ "data[data[\"text_len\"]==data[\"text_len\"].max()]['comment_text']"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 11,
562
+ "metadata": {
563
+ "execution": {
564
+ "iopub.execute_input": "2023-05-03T14:16:58.642154Z",
565
+ "iopub.status.busy": "2023-05-03T14:16:58.641279Z",
566
+ "iopub.status.idle": "2023-05-03T14:16:58.648851Z",
567
+ "shell.execute_reply": "2023-05-03T14:16:58.647773Z",
568
+ "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z"
569
+ }
570
+ },
571
+ "outputs": [],
572
+ "source": [
573
+ "X = data['comment_text']\n",
574
+ "y = data[data.columns[2:]].values"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 12,
580
+ "metadata": {
581
+ "execution": {
582
+ "iopub.execute_input": "2023-05-03T14:17:02.919383Z",
583
+ "iopub.status.busy": "2023-05-03T14:17:02.918865Z",
584
+ "iopub.status.idle": "2023-05-03T14:17:02.927191Z",
585
+ "shell.execute_reply": "2023-05-03T14:17:02.926293Z",
586
+ "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z"
587
+ }
588
+ },
589
+ "outputs": [
590
+ {
591
+ "data": {
592
+ "text/plain": [
593
+ "0 Explanation\\nWhy the edits made under my usern...\n",
594
+ "1 D'aww! He matches this background colour I'm s...\n",
595
+ "2 Hey man, I'm really not trying to edit war. It...\n",
596
+ "3 \"\\nMore\\nI can't make any real suggestions on ...\n",
597
+ "4 You, sir, are my hero. Any chance you remember...\n",
598
+ " ... \n",
599
+ "159566 \":::::And for the second time of asking, when ...\n",
600
+ "159567 You should be ashamed of yourself \\n\\nThat is ...\n",
601
+ "159568 Spitzer \\n\\nUmm, theres no actual article for ...\n",
602
+ "159569 And it looks like it was actually you who put ...\n",
603
+ "159570 \"\\nAnd ... I really don't think you understand...\n",
604
+ "Name: comment_text, Length: 159571, dtype: object"
605
+ ]
606
+ },
607
+ "execution_count": 12,
608
+ "metadata": {},
609
+ "output_type": "execute_result"
610
+ }
611
+ ],
612
+ "source": [
613
+ "X"
614
+ ]
615
+ },
616
+ {
617
+ "cell_type": "code",
618
+ "execution_count": 13,
619
+ "metadata": {
620
+ "execution": {
621
+ "iopub.execute_input": "2023-05-03T14:17:08.246451Z",
622
+ "iopub.status.busy": "2023-05-03T14:17:08.245491Z",
623
+ "iopub.status.idle": "2023-05-03T14:17:08.252604Z",
624
+ "shell.execute_reply": "2023-05-03T14:17:08.251608Z",
625
+ "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z"
626
+ }
627
+ },
628
+ "outputs": [
629
+ {
630
+ "data": {
631
+ "text/plain": [
632
+ "array([[0, 0, 0, 0, 0, 0],\n",
633
+ " [0, 0, 0, 0, 0, 0],\n",
634
+ " [0, 0, 0, 0, 0, 0],\n",
635
+ " ...,\n",
636
+ " [0, 0, 0, 0, 0, 0],\n",
637
+ " [0, 0, 0, 0, 0, 0],\n",
638
+ " [0, 0, 0, 0, 0, 0]])"
639
+ ]
640
+ },
641
+ "execution_count": 13,
642
+ "metadata": {},
643
+ "output_type": "execute_result"
644
+ }
645
+ ],
646
+ "source": [
647
+ "y"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "markdown",
652
+ "metadata": {},
653
+ "source": [
654
+ "### Text Preprocessing"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 15,
660
+ "metadata": {
661
+ "execution": {
662
+ "iopub.execute_input": "2023-05-03T14:17:25.208007Z",
663
+ "iopub.status.busy": "2023-05-03T14:17:25.207157Z",
664
+ "iopub.status.idle": "2023-05-03T14:17:25.220446Z",
665
+ "shell.execute_reply": "2023-05-03T14:17:25.219390Z",
666
+ "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z"
667
+ }
668
+ },
669
+ "outputs": [],
670
+ "source": [
671
+ "class Text_Cleaner:\n",
672
+ " def __init__(self, data):\n",
673
+ " self.data = data\n",
674
+ " self.STOPWORDS = stopwords.words('english')\n",
675
+ " self.wordnet = WordNetLemmatizer()\n",
676
+ " \n",
677
+ " def new_line_code(self, x:str)->str:\n",
678
+ " pattern = \"\\n\"\n",
679
+ " x = re.sub(pattern,' ', x).strip().lower()\n",
680
+ " return x\n",
681
+ "\n",
682
+ " def remove_punctuations(self, x:str)->str:\n",
683
+ " x = x.translate(str.maketrans('','',string.punctuation))\n",
684
+ " return x\n",
685
+ "\n",
686
+ " def remove_stopwords(self, x:str)->str:\n",
687
+ " sent=[]\n",
688
+ " for word in x.split():\n",
689
+ " if word not in self.STOPWORDS:\n",
690
+ " sent.append(word)\n",
691
+ " return ' '.join(sent)\n",
692
+ "\n",
693
+ " def lemmatization(self, x:str)->str:\n",
694
+ " sent=[]\n",
695
+ " for word in x.split():\n",
696
+ " sent.append(self.wordnet.lemmatize(word))\n",
697
+ " return ' '.join(sent)\n",
698
+ " \n",
699
+ " def clean_text(self):\n",
700
+ " self.data = self.data.apply(self.new_line_code)\n",
701
+ " self.data = self.data.apply(self.remove_punctuations)\n",
702
+ " self.data = self.data.apply(self.remove_stopwords)\n",
703
+ " self.data = self.data.apply(self.lemmatization)\n",
704
+ " self.data = self.data.apply(lambda x: x.strip())\n",
705
+ " return self.data"
706
+ ]
707
+ },
708
+ {
709
+ "cell_type": "code",
710
+ "execution_count": 16,
711
+ "metadata": {
712
+ "execution": {
713
+ "iopub.execute_input": "2023-05-03T14:17:28.812213Z",
714
+ "iopub.status.busy": "2023-05-03T14:17:28.811115Z",
715
+ "iopub.status.idle": "2023-05-03T14:18:45.134664Z",
716
+ "shell.execute_reply": "2023-05-03T14:18:45.133093Z",
717
+ "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z"
718
+ }
719
+ },
720
+ "outputs": [],
721
+ "source": [
722
+ "X = Text_Cleaner(X).clean_text()"
723
+ ]
724
+ },
725
+ {
726
+ "cell_type": "code",
727
+ "execution_count": 17,
728
+ "metadata": {
729
+ "execution": {
730
+ "iopub.execute_input": "2023-05-03T14:19:08.971107Z",
731
+ "iopub.status.busy": "2023-05-03T14:19:08.969951Z",
732
+ "iopub.status.idle": "2023-05-03T14:19:08.979371Z",
733
+ "shell.execute_reply": "2023-05-03T14:19:08.978320Z",
734
+ "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z"
735
+ }
736
+ },
737
+ "outputs": [
738
+ {
739
+ "data": {
740
+ "text/plain": [
741
+ "0 explanation edits made username hardcore metal...\n",
742
+ "1 daww match background colour im seemingly stuc...\n",
743
+ "2 hey man im really trying edit war guy constant...\n",
744
+ "3 cant make real suggestion improvement wondered...\n",
745
+ "4 sir hero chance remember page thats\n",
746
+ " ... \n",
747
+ "159566 second time asking view completely contradicts...\n",
748
+ "159567 ashamed horrible thing put talk page 128611993\n",
749
+ "159568 spitzer umm there actual article prostitution ...\n",
750
+ "159569 look like actually put speedy first version de...\n",
751
+ "159570 really dont think understand came idea bad rig...\n",
752
+ "Name: comment_text, Length: 159571, dtype: object"
753
+ ]
754
+ },
755
+ "execution_count": 17,
756
+ "metadata": {},
757
+ "output_type": "execute_result"
758
+ }
759
+ ],
760
+ "source": [
761
+ "X"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "markdown",
766
+ "metadata": {},
767
+ "source": [
768
+ "### Model Building"
769
+ ]
770
+ },
771
+ {
772
+ "cell_type": "code",
773
+ "execution_count": null,
774
+ "metadata": {},
775
+ "outputs": [],
776
+ "source": [
777
+ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
778
+ " output_sequence_length=Config.OUTPUT_DIM,\n",
779
+ " output_mode='int')\n",
780
+ "vectorizer.adapt(X.values)\n",
781
+ "vectorized_text = vectorizer(X.values)"
782
+ ]
783
+ },
784
+ {
785
+ "cell_type": "code",
786
+ "execution_count": 37,
787
+ "metadata": {
788
+ "execution": {
789
+ "iopub.execute_input": "2023-05-03T14:42:24.692312Z",
790
+ "iopub.status.busy": "2023-05-03T14:42:24.691267Z",
791
+ "iopub.status.idle": "2023-05-03T14:42:24.709520Z",
792
+ "shell.execute_reply": "2023-05-03T14:42:24.708295Z",
793
+ "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z"
794
+ }
795
+ },
796
+ "outputs": [],
797
+ "source": [
798
+ "# reshuffle_each_iteration=False keeps one fixed order, so the take()/skip() train/val split built from this dataset stays disjoint on every pass.\n",
799
+ "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y)).cache()\n",
800
+ "dataset = dataset.shuffle(Config.BUFFER_SIZE, reshuffle_each_iteration=False)\n",
801
+ "dataset = dataset.batch(Config.BATCH_SIZE)\n",
802
+ "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": 38,
808
+ "metadata": {
809
+ "execution": {
810
+ "iopub.execute_input": "2023-05-03T14:42:27.187117Z",
811
+ "iopub.status.busy": "2023-05-03T14:42:27.185929Z",
812
+ "iopub.status.idle": "2023-05-03T14:42:27.196570Z",
813
+ "shell.execute_reply": "2023-05-03T14:42:27.195443Z",
814
+ "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z"
815
+ }
816
+ },
817
+ "outputs": [],
818
+ "source": [
819
+ "train = dataset.take(int(len(dataset)*0.8))\n",
820
+ "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
821
+ "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
822
+ ]
823
+ },
824
+ {
825
+ "cell_type": "code",
826
+ "execution_count": 35,
827
+ "metadata": {
828
+ "execution": {
829
+ "iopub.execute_input": "2023-05-03T14:41:54.920944Z",
830
+ "iopub.status.busy": "2023-05-03T14:41:54.920085Z",
831
+ "iopub.status.idle": "2023-05-03T14:41:54.928526Z",
832
+ "shell.execute_reply": "2023-05-03T14:41:54.927502Z",
833
+ "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z"
834
+ }
835
+ },
836
+ "outputs": [],
837
+ "source": [
838
+ "def create_model():\n",
839
+ "    \"\"\"Return the uncompiled Sequential network: embedding, two bidirectional LSTMs, a dense head and six sigmoid outputs.\"\"\"\n",
840
+ "    recurrent_stack = [\n",
841
+ "        Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
842
+ "        Bidirectional(LSTM(32)),\n",
843
+ "    ]\n",
844
+ "    dense_head = [\n",
845
+ "        Dense(128, activation='relu'),\n",
846
+ "        Dropout(0.1),\n",
847
+ "        Dense(256, activation='relu'),\n",
848
+ "        Dropout(0.1),\n",
849
+ "        Dense(128, activation='relu'),\n",
850
+ "        Dense(6, activation='sigmoid'),\n",
851
+ "    ]\n",
852
+ "    return Sequential([Embedding(Config.VOCAB_SIZE + 1, 32), *recurrent_stack, *dense_head])"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": 34,
858
+ "metadata": {
859
+ "execution": {
860
+ "iopub.execute_input": "2023-05-03T14:41:41.900942Z",
861
+ "iopub.status.busy": "2023-05-03T14:41:41.900504Z",
862
+ "iopub.status.idle": "2023-05-03T14:41:41.908480Z",
863
+ "shell.execute_reply": "2023-05-03T14:41:41.907187Z",
864
+ "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z"
865
+ }
866
+ },
867
+ "outputs": [],
868
+ "source": [
869
+ "def callbacks(base_dir=\".\"):\n",
870
+ "    \"\"\"Return [EarlyStopping, ModelCheckpoint]; checkpoints are written under base_dir/Config.CHECKPOINT_DIR/model.\"\"\"\n",
871
+ "    # base_dir used to be ignored; with the default \".\" the checkpoint location is unchanged.\n",
872
+ "    ckpt_path = os.path.join(base_dir, Config.CHECKPOINT_DIR, \"model\")\n",
873
+ "    os.makedirs(ckpt_path, exist_ok=True)\n",
874
+ "    return [\n",
875
+ "        tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2),\n",
876
+ "        tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_path, save_best_only=True),\n",
877
+ "    ]\n",
878
+ "\n",
879
+ "\n",
880
+ "callbacks_list = callbacks()"
882
+ ]
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "execution_count": 36,
887
+ "metadata": {
888
+ "execution": {
889
+ "iopub.execute_input": "2023-05-03T14:42:07.719948Z",
890
+ "iopub.status.busy": "2023-05-03T14:42:07.719137Z",
891
+ "iopub.status.idle": "2023-05-03T14:42:09.288990Z",
892
+ "shell.execute_reply": "2023-05-03T14:42:09.287682Z",
893
+ "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z"
894
+ }
895
+ },
896
+ "outputs": [],
897
+ "source": [
898
+ "with tpu_strategy.scope():\n",
899
+ " model = create_model()\n",
900
+ " model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
901
+ " loss=tf.keras.losses.binary_crossentropy,\n",
902
+ " metrics=AUC(multi_label=True, num_labels=6))"
903
+ ]
904
+ },
905
+ {
906
+ "cell_type": "code",
907
+ "execution_count": 39,
908
+ "metadata": {
909
+ "execution": {
910
+ "iopub.execute_input": "2023-05-03T14:42:34.084064Z",
911
+ "iopub.status.busy": "2023-05-03T14:42:34.083255Z",
912
+ "iopub.status.idle": "2023-05-03T14:42:34.110375Z",
913
+ "shell.execute_reply": "2023-05-03T14:42:34.109380Z",
914
+ "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z"
915
+ }
916
+ },
917
+ "outputs": [
918
+ {
919
+ "name": "stdout",
920
+ "output_type": "stream",
921
+ "text": [
922
+ "Model: \"sequential_2\"\n",
923
+ "_________________________________________________________________\n",
924
+ " Layer (type) Output Shape Param # \n",
925
+ "=================================================================\n",
926
+ " embedding_2 (Embedding) (None, None, 32) 6400032 \n",
927
+ " \n",
928
+ " bidirectional_4 (Bidirectio (None, None, 128) 49664 \n",
929
+ " nal) \n",
930
+ " \n",
931
+ " bidirectional_5 (Bidirectio (None, 64) 41216 \n",
932
+ " nal) \n",
933
+ " \n",
934
+ " dense_8 (Dense) (None, 128) 8320 \n",
935
+ " \n",
936
+ " dropout_4 (Dropout) (None, 128) 0 \n",
937
+ " \n",
938
+ " dense_9 (Dense) (None, 256) 33024 \n",
939
+ " \n",
940
+ " dropout_5 (Dropout) (None, 256) 0 \n",
941
+ " \n",
942
+ " dense_10 (Dense) (None, 128) 32896 \n",
943
+ " \n",
944
+ " dense_11 (Dense) (None, 6) 774 \n",
945
+ " \n",
946
+ "=================================================================\n",
947
+ "Total params: 6,565,926\n",
948
+ "Trainable params: 6,565,926\n",
949
+ "Non-trainable params: 0\n",
950
+ "_________________________________________________________________\n"
951
+ ]
952
+ }
953
+ ],
954
+ "source": [
955
+ "model.summary()"
956
+ ]
957
+ },
958
+ {
959
+ "cell_type": "code",
960
+ "execution_count": 24,
961
+ "metadata": {
962
+ "execution": {
963
+ "iopub.execute_input": "2023-05-03T14:20:19.051437Z",
964
+ "iopub.status.busy": "2023-05-03T14:20:19.050592Z",
965
+ "iopub.status.idle": "2023-05-03T14:20:19.057744Z",
966
+ "shell.execute_reply": "2023-05-03T14:20:19.056746Z",
967
+ "shell.execute_reply.started": "2023-05-03T14:20:19.051377Z"
968
+ }
969
+ },
970
+ "outputs": [
971
+ {
972
+ "data": {
973
+ "text/plain": [
974
+ "997"
975
+ ]
976
+ },
977
+ "execution_count": 24,
978
+ "metadata": {},
979
+ "output_type": "execute_result"
980
+ }
981
+ ],
982
+ "source": [
983
+ "len(train)"
984
+ ]
985
+ },
986
+ {
987
+ "cell_type": "code",
988
+ "execution_count": 40,
989
+ "metadata": {
990
+ "execution": {
991
+ "iopub.execute_input": "2023-05-03T14:42:42.306143Z",
992
+ "iopub.status.busy": "2023-05-03T14:42:42.305188Z",
993
+ "iopub.status.idle": "2023-05-03T18:36:14.400588Z",
994
+ "shell.execute_reply": "2023-05-03T18:36:14.399250Z",
995
+ "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z"
996
+ }
997
+ },
998
+ "outputs": [
999
+ {
1000
+ "name": "stdout",
1001
+ "output_type": "stream",
1002
+ "text": [
1003
+ "Epoch 1/10\n"
1004
+ ]
1005
+ },
1006
+ {
1007
+ "name": "stderr",
1008
+ "output_type": "stream",
1009
+ "text": [
1010
+ "2023-05-03 14:42:57.854226: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n",
1011
+ "2023-05-03 14:42:58.165317: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n"
1012
+ ]
1013
+ },
1014
+ {
1015
+ "name": "stdout",
1016
+ "output_type": "stream",
1017
+ "text": [
1018
+ "997/997 [==============================] - ETA: 0s - loss: 0.1688 - auc_2: 0.5909"
1019
+ ]
1020
+ },
1021
+ {
1022
+ "name": "stderr",
1023
+ "output_type": "stream",
1024
+ "text": [
1025
+ "2023-05-03 15:05:36.690047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
1026
+ "2023-05-03 15:05:36.851778: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
1027
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1028
+ ]
1029
+ },
1030
+ {
1031
+ "name": "stdout",
1032
+ "output_type": "stream",
1033
+ "text": [
1034
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1035
+ ]
1036
+ },
1037
+ {
1038
+ "name": "stderr",
1039
+ "output_type": "stream",
1040
+ "text": [
1041
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1042
+ ]
1043
+ },
1044
+ {
1045
+ "name": "stdout",
1046
+ "output_type": "stream",
1047
+ "text": [
1048
+ "997/997 [==============================] - 1425s 1s/step - loss: 0.1688 - auc_2: 0.5909 - val_loss: 0.0750 - val_auc_2: 0.9196\n",
1049
+ "Epoch 2/10\n",
1050
+ "997/997 [==============================] - ETA: 0s - loss: 0.0640 - auc_2: 0.9400"
1051
+ ]
1052
+ },
1053
+ {
1054
+ "name": "stderr",
1055
+ "output_type": "stream",
1056
+ "text": [
1057
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1058
+ ]
1059
+ },
1060
+ {
1061
+ "name": "stdout",
1062
+ "output_type": "stream",
1063
+ "text": [
1064
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1065
+ ]
1066
+ },
1067
+ {
1068
+ "name": "stderr",
1069
+ "output_type": "stream",
1070
+ "text": [
1071
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "name": "stdout",
1076
+ "output_type": "stream",
1077
+ "text": [
1078
+ "997/997 [==============================] - 1395s 1s/step - loss: 0.0640 - auc_2: 0.9400 - val_loss: 0.0548 - val_auc_2: 0.9532\n",
1079
+ "Epoch 3/10\n",
1080
+ "997/997 [==============================] - ETA: 0s - loss: 0.0524 - auc_2: 0.9594"
1081
+ ]
1082
+ },
1083
+ {
1084
+ "name": "stderr",
1085
+ "output_type": "stream",
1086
+ "text": [
1087
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1088
+ ]
1089
+ },
1090
+ {
1091
+ "name": "stdout",
1092
+ "output_type": "stream",
1093
+ "text": [
1094
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "name": "stderr",
1099
+ "output_type": "stream",
1100
+ "text": [
1101
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1102
+ ]
1103
+ },
1104
+ {
1105
+ "name": "stdout",
1106
+ "output_type": "stream",
1107
+ "text": [
1108
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0524 - auc_2: 0.9594 - val_loss: 0.0484 - val_auc_2: 0.9597\n",
1109
+ "Epoch 4/10\n",
1110
+ "997/997 [==============================] - ETA: 0s - loss: 0.0466 - auc_2: 0.9672"
1111
+ ]
1112
+ },
1113
+ {
1114
+ "name": "stderr",
1115
+ "output_type": "stream",
1116
+ "text": [
1117
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1118
+ ]
1119
+ },
1120
+ {
1121
+ "name": "stdout",
1122
+ "output_type": "stream",
1123
+ "text": [
1124
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1125
+ ]
1126
+ },
1127
+ {
1128
+ "name": "stderr",
1129
+ "output_type": "stream",
1130
+ "text": [
1131
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1132
+ ]
1133
+ },
1134
+ {
1135
+ "name": "stdout",
1136
+ "output_type": "stream",
1137
+ "text": [
1138
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0466 - auc_2: 0.9672 - val_loss: 0.0426 - val_auc_2: 0.9729\n",
1139
+ "Epoch 5/10\n",
1140
+ "997/997 [==============================] - ETA: 0s - loss: 0.0440 - auc_2: 0.9715"
1141
+ ]
1142
+ },
1143
+ {
1144
+ "name": "stderr",
1145
+ "output_type": "stream",
1146
+ "text": [
1147
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1148
+ ]
1149
+ },
1150
+ {
1151
+ "name": "stdout",
1152
+ "output_type": "stream",
1153
+ "text": [
1154
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1155
+ ]
1156
+ },
1157
+ {
1158
+ "name": "stderr",
1159
+ "output_type": "stream",
1160
+ "text": [
1161
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1162
+ ]
1163
+ },
1164
+ {
1165
+ "name": "stdout",
1166
+ "output_type": "stream",
1167
+ "text": [
1168
+ "997/997 [==============================] - 1395s 1s/step - loss: 0.0440 - auc_2: 0.9715 - val_loss: 0.0406 - val_auc_2: 0.9761\n",
1169
+ "Epoch 6/10\n",
1170
+ "997/997 [==============================] - ETA: 0s - loss: 0.0416 - auc_2: 0.9725"
1171
+ ]
1172
+ },
1173
+ {
1174
+ "name": "stderr",
1175
+ "output_type": "stream",
1176
+ "text": [
1177
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1178
+ ]
1179
+ },
1180
+ {
1181
+ "name": "stdout",
1182
+ "output_type": "stream",
1183
+ "text": [
1184
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1185
+ ]
1186
+ },
1187
+ {
1188
+ "name": "stderr",
1189
+ "output_type": "stream",
1190
+ "text": [
1191
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1192
+ ]
1193
+ },
1194
+ {
1195
+ "name": "stdout",
1196
+ "output_type": "stream",
1197
+ "text": [
1198
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0416 - auc_2: 0.9725 - val_loss: 0.0382 - val_auc_2: 0.9787\n",
1199
+ "Epoch 7/10\n",
1200
+ "997/997 [==============================] - ETA: 0s - loss: 0.0394 - auc_2: 0.9762"
1201
+ ]
1202
+ },
1203
+ {
1204
+ "name": "stderr",
1205
+ "output_type": "stream",
1206
+ "text": [
1207
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1208
+ ]
1209
+ },
1210
+ {
1211
+ "name": "stdout",
1212
+ "output_type": "stream",
1213
+ "text": [
1214
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1215
+ ]
1216
+ },
1217
+ {
1218
+ "name": "stderr",
1219
+ "output_type": "stream",
1220
+ "text": [
1221
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "name": "stdout",
1226
+ "output_type": "stream",
1227
+ "text": [
1228
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0394 - auc_2: 0.9762 - val_loss: 0.0359 - val_auc_2: 0.9819\n",
1229
+ "Epoch 8/10\n",
1230
+ "997/997 [==============================] - ETA: 0s - loss: 0.0379 - auc_2: 0.9773"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "name": "stderr",
1235
+ "output_type": "stream",
1236
+ "text": [
1237
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1238
+ ]
1239
+ },
1240
+ {
1241
+ "name": "stdout",
1242
+ "output_type": "stream",
1243
+ "text": [
1244
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1245
+ ]
1246
+ },
1247
+ {
1248
+ "name": "stderr",
1249
+ "output_type": "stream",
1250
+ "text": [
1251
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1252
+ ]
1253
+ },
1254
+ {
1255
+ "name": "stdout",
1256
+ "output_type": "stream",
1257
+ "text": [
1258
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0379 - auc_2: 0.9773 - val_loss: 0.0346 - val_auc_2: 0.9821\n",
1259
+ "Epoch 9/10\n",
1260
+ "997/997 [==============================] - ETA: 0s - loss: 0.0367 - auc_2: 0.9776"
1261
+ ]
1262
+ },
1263
+ {
1264
+ "name": "stderr",
1265
+ "output_type": "stream",
1266
+ "text": [
1267
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1268
+ ]
1269
+ },
1270
+ {
1271
+ "name": "stdout",
1272
+ "output_type": "stream",
1273
+ "text": [
1274
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1275
+ ]
1276
+ },
1277
+ {
1278
+ "name": "stderr",
1279
+ "output_type": "stream",
1280
+ "text": [
1281
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1282
+ ]
1283
+ },
1284
+ {
1285
+ "name": "stdout",
1286
+ "output_type": "stream",
1287
+ "text": [
1288
+ "997/997 [==============================] - 1396s 1s/step - loss: 0.0367 - auc_2: 0.9776 - val_loss: 0.0336 - val_auc_2: 0.9827\n",
1289
+ "Epoch 10/10\n",
1290
+ "997/997 [==============================] - ETA: 0s - loss: 0.0357 - auc_2: 0.9782"
1291
+ ]
1292
+ },
1293
+ {
1294
+ "name": "stderr",
1295
+ "output_type": "stream",
1296
+ "text": [
1297
+ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
1298
+ ]
1299
+ },
1300
+ {
1301
+ "name": "stdout",
1302
+ "output_type": "stream",
1303
+ "text": [
1304
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1305
+ ]
1306
+ },
1307
+ {
1308
+ "name": "stderr",
1309
+ "output_type": "stream",
1310
+ "text": [
1311
+ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
1312
+ ]
1313
+ },
1314
+ {
1315
+ "name": "stdout",
1316
+ "output_type": "stream",
1317
+ "text": [
1318
+ "997/997 [==============================] - 1395s 1s/step - loss: 0.0357 - auc_2: 0.9782 - val_loss: 0.0328 - val_auc_2: 0.9819\n"
1319
+ ]
1320
+ }
1321
+ ],
1322
+ "source": [
1323
+ "history = model.fit(train, \n",
1324
+ " epochs=Config.EPOCHS,\n",
1325
+ " steps_per_epoch=len(train),\n",
1326
+ " validation_data=val,\n",
1327
+ " callbacks=callbacks_list)"
1328
+ ]
1329
+ },
1330
+ {
1331
+ "cell_type": "code",
1332
+ "execution_count": 42,
1333
+ "metadata": {
1334
+ "execution": {
1335
+ "iopub.execute_input": "2023-05-03T18:36:42.693133Z",
1336
+ "iopub.status.busy": "2023-05-03T18:36:42.692246Z",
1337
+ "iopub.status.idle": "2023-05-03T18:36:42.702544Z",
1338
+ "shell.execute_reply": "2023-05-03T18:36:42.701196Z",
1339
+ "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z"
1340
+ }
1341
+ },
1342
+ "outputs": [],
1343
+ "source": [
1344
+ "def model_evaluation(model, vectorizer: TextVectorization, pred_data: pd.Series, y_true):\n",
1345
+ "    \"\"\"Run model.predict on pred_data, then print and return macro (precision, recall, f1, roc_auc) against y_true.\n",
1346
+ "\n",
1347
+ "    vectorizer is unused (kept so existing calls still work). Metric errors now propagate; they used to be printed and then masked by an unbound-variable error in the summary print.\n",
1348
+ "    \"\"\"\n",
1349
+ "    from sklearn.metrics import precision_score, recall_score  # local import: the notebook-level sklearn import only brings in roc_auc_score and f1_score\n",
1350
+ "    y_pred = model.predict(pred_data)\n",
1351
+ "    y_label = (y_pred > 0.5).astype(int)  # threshold the predicted scores at 0.5 to get hard 0/1 labels\n",
1352
+ "    precision = precision_score(y_true, y_label, average=\"macro\")\n",
1353
+ "    recall = recall_score(y_true, y_label, average=\"macro\")\n",
1354
+ "    f1 = f1_score(y_true, y_label, average=\"macro\")\n",
1355
+ "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")  # AUC uses the raw scores, not the thresholded labels\n",
1356
+ "    print(f\"Precision: {precision}\\n\"\n",
1357
+ "          f\"Recall: {recall}\\n\"\n",
1358
+ "          f\"F1-Score: {f1}\\n\"\n",
1359
+ "          f\"ROC-AUC-Score: {auc}\")\n",
1360
+ "    return (precision, recall, f1, auc)"
1361
+ ]
1362
+ },
1363
+ {
1364
+ "cell_type": "code",
1365
+ "execution_count": null,
1366
+ "metadata": {},
1367
+ "outputs": [],
1368
+ "source": [
1369
+ "model.evaluate(test)"
1370
+ ]
1371
+ },
1372
+ {
1373
+ "cell_type": "code",
1374
+ "execution_count": 41,
1375
+ "metadata": {
1376
+ "execution": {
1377
+ "iopub.execute_input": "2023-05-03T18:36:28.884733Z",
1378
+ "iopub.status.busy": "2023-05-03T18:36:28.883953Z",
1379
+ "iopub.status.idle": "2023-05-03T18:36:29.233282Z",
1380
+ "shell.execute_reply": "2023-05-03T18:36:29.231964Z",
1381
+ "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z"
1382
+ }
1383
+ },
1384
+ "outputs": [],
1385
+ "source": [
1386
+ "model.save(\"model_4.h5\")"
1387
+ ]
1388
+ },
1389
+ {
1390
+ "cell_type": "code",
1391
+ "execution_count": 55,
1392
+ "metadata": {
1393
+ "execution": {
1394
+ "iopub.execute_input": "2023-05-03T18:51:24.530412Z",
1395
+ "iopub.status.busy": "2023-05-03T18:51:24.529307Z",
1396
+ "iopub.status.idle": "2023-05-03T19:20:36.675080Z",
1397
+ "shell.execute_reply": "2023-05-03T19:20:36.673739Z",
1398
+ "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z"
1399
+ },
1400
+ "scrolled": true
1401
+ },
1402
+ "outputs": [
1403
+ {
1404
+ "name": "stdout",
1405
+ "output_type": "stream",
1406
+ "text": [
1407
+ "3988/3988 [==============================] - 1747s 438ms/step\n",
1408
+ "Precision: 0.034067329786671804\n",
1409
+ "Recall: 0.03396435372259718\n",
1410
+ "F1-Score: 0.03375883387877523\n",
1411
+ "ROC-AUC-Score: 0.4963643308231378\n"
1412
+ ]
1413
+ }
1414
+ ],
1415
+ "source": [
1416
+ "# Read `train` in ONE pass: a shuffled dataset may come back in a new order each time it is iterated, so separate x and y passes would pair features with the wrong labels.\n",
1417
+ "x_train, y_train = (np.concatenate(parts) for parts in zip(*train))\n",
1418
+ "result_train = model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_train, y_true=y_train)"
1419
+ ]
1420
+ },
1421
+ {
1422
+ "cell_type": "code",
1423
+ "execution_count": 53,
1424
+ "metadata": {
1425
+ "execution": {
1426
+ "iopub.execute_input": "2023-05-03T18:49:02.718178Z",
1427
+ "iopub.status.busy": "2023-05-03T18:49:02.717234Z",
1428
+ "iopub.status.idle": "2023-05-03T18:49:50.438077Z",
1429
+ "shell.execute_reply": "2023-05-03T18:49:50.436458Z",
1430
+ "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z"
1431
+ },
1432
+ "scrolled": true
1433
+ },
1434
+ "outputs": [
1435
+ {
1436
+ "name": "stdout",
1437
+ "output_type": "stream",
1438
+ "text": [
1439
+ "996/996 [==============================] - 43s 42ms/step\n",
1440
+ "Precision: 0.03615509646190422\n",
1441
+ "Recall: 0.03674059129986899\n",
1442
+ "F1-Score: 0.03625622443975915\n",
1443
+ "ROC-AUC-Score: 0.4868083116383068\n"
1444
+ ]
1445
+ }
1446
+ ],
1447
+ "source": [
1448
+ "# Read `val` in ONE pass: a shuffled dataset may come back in a new order each time it is iterated, so separate x and y passes would pair features with the wrong labels.\n",
1449
+ "x_val, y_val = (np.concatenate(parts) for parts in zip(*val))\n",
1450
+ "result_val = model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_val, y_true=y_val)"
1451
+ ]
1452
+ }
1453
+ ],
1454
+ "metadata": {
1455
+ "kernelspec": {
1456
+ "display_name": "Python 3 (ipykernel)",
1457
+ "language": "python",
1458
+ "name": "python3"
1459
+ },
1460
+ "language_info": {
1461
+ "codemirror_mode": {
1462
+ "name": "ipython",
1463
+ "version": 3
1464
+ },
1465
+ "file_extension": ".py",
1466
+ "mimetype": "text/x-python",
1467
+ "name": "python",
1468
+ "nbconvert_exporter": "python",
1469
+ "pygments_lexer": "ipython3",
1470
+ "version": "3.9.12"
1471
+ }
1472
+ },
1473
+ "nbformat": 4,
1474
+ "nbformat_minor": 4
1475
+ }
experiment_notebooks/Experiment 5.ipynb ADDED
@@ -0,0 +1,1003 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "!pip install nltk scikit-learn"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2023-05-03T14:13:06.118200Z",
18
+ "iopub.status.busy": "2023-05-03T14:13:06.117322Z",
19
+ "iopub.status.idle": "2023-05-03T14:13:36.869507Z",
20
+ "shell.execute_reply": "2023-05-03T14:13:36.868619Z",
21
+ "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z"
22
+ },
23
+ "scrolled": true
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "import warnings\n",
29
+ "warnings.filterwarnings(\"ignore\")\n",
30
+ "\n",
31
+ "import pandas as pd\n",
32
+ "import numpy as np\n",
33
+ "import matplotlib.pyplot as plt\n",
34
+ "#import seaborn as sns\n",
35
+ "import tensorflow as tf\n",
36
+ "#import tensorflow_gpu\n",
37
+ "import urllib\n",
38
+ "from tensorflow.keras.layers import TextVectorization\n",
39
+ "from tensorflow.keras.models import Sequential\n",
40
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
41
+ "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
42
+ "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
43
+ "\n",
44
+ "import nltk\n",
45
+ "from nltk.corpus import stopwords\n",
46
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
47
+ "import re\n",
48
+ "import string\n",
49
+ "nltk.download('stopwords')\n",
50
+ "nltk.download('omw-1.4')\n",
51
+ "nltk.download('wordnet')\n",
52
+ "nltk.download('wordnet2022')"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "def tf_tpu_or_gpu(device: str='gpu'):\n",
62
+ "    \"\"\"Set up TensorFlow for device ('gpu' or 'tpu', any case); return the TPUStrategy for 'tpu', None for 'gpu'; raise ValueError otherwise.\"\"\"\n",
63
+ "    device = device.lower()\n",
64
+ "    if device == 'gpu':\n",
65
+ "        print(\"Setting up GPU.....\")\n",
66
+ "        device_name = tf.test.gpu_device_name()\n",
67
+ "        if \"GPU\" in device_name:\n",
68
+ "            print('Found GPU at: {}'.format(device_name))\n",
69
+ "        else:  # the 'Found GPU at:' line used to be printed in this case as well\n",
70
+ "            print(\"GPU device not found\")\n",
71
+ "        config = tf.compat.v1.ConfigProto()\n",
72
+ "        config.gpu_options.allow_growth = True  # let TensorFlow grow its GPU memory use as needed\n",
73
+ "        sess = tf.compat.v1.Session(config=config)\n",
74
+ "        tf.compat.v1.keras.backend.set_session(sess)\n",
75
+ "        print(config)\n",
76
+ "        return None\n",
77
+ "    if device == 'tpu':\n",
78
+ "        print(\"Setting up TPU.....\")\n",
79
+ "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
80
+ "        print('Running on TPU ', tpu.master())\n",
81
+ "        tf.config.experimental_connect_to_cluster(tpu)\n",
82
+ "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
83
+ "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
84
+ "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
85
+ "        return tpu_strategy  # previously discarded on return, so callers could not open its scope()\n",
86
+ "    raise ValueError(\"Wrong Device Parameter Passed\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "tf_tpu_or_gpu(device='gpu')"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "markdown",
100
+ "metadata": {},
101
+ "source": [
102
+ "```python\ndevice_name = tf.test.gpu_device_name()\n",
103
+ "if \"GPU\" not in device_name:\n",
104
+ " print(\"GPU device not found\")\n",
105
+ "print('Found GPU at: {}'.format(device_name))\n```"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "markdown",
110
+ "metadata": {},
111
+ "source": [
112
+ "```python\nconfig = tf.compat.v1.ConfigProto() \n",
113
+ "config.gpu_options.allow_growth = True \n",
114
+ "sess = tf.compat.v1.Session(config=config) \n",
115
+ "tf.compat.v1.keras.backend.set_session(sess)\n```"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 5,
121
+ "metadata": {
122
+ "execution": {
123
+ "iopub.execute_input": "2023-05-03T14:16:24.940878Z",
124
+ "iopub.status.busy": "2023-05-03T14:16:24.940140Z",
125
+ "iopub.status.idle": "2023-05-03T14:16:24.946837Z",
126
+ "shell.execute_reply": "2023-05-03T14:16:24.945707Z",
127
+ "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z"
128
+ }
129
+ },
130
+ "outputs": [],
131
+ "source": [
132
+ "class Config:  # notebook-wide settings\n",
133
+ "    URL = \"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"  # plain string: the f-prefix had no placeholders\n",
134
+ "    FILE_NAME = \"toxic_comment_data.csv\"\n",
135
+ "    VOCAB_SIZE = 10000\n",
136
+ "    OUTPUT_DIM = 100\n",
137
+ "    BUFFER_SIZE = 10000\n",
138
+ "    BATCH_SIZE = 64\n",
139
+ "    EPOCHS = 10\n",
140
+ "    BASE_LOG_DIR = \"log_dir\"\n",
141
+ "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, \"models\")"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 6,
147
+ "metadata": {
148
+ "execution": {
149
+ "iopub.execute_input": "2023-05-03T14:16:29.171506Z",
150
+ "iopub.status.busy": "2023-05-03T14:16:29.170711Z",
151
+ "iopub.status.idle": "2023-05-03T14:16:30.613189Z",
152
+ "shell.execute_reply": "2023-05-03T14:16:30.612012Z",
153
+ "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z"
154
+ }
155
+ },
156
+ "outputs": [
157
+ {
158
+ "data": {
159
+ "text/html": [
160
+ "<div>\n",
161
+ "<style scoped>\n",
162
+ " .dataframe tbody tr th:only-of-type {\n",
163
+ " vertical-align: middle;\n",
164
+ " }\n",
165
+ "\n",
166
+ " .dataframe tbody tr th {\n",
167
+ " vertical-align: top;\n",
168
+ " }\n",
169
+ "\n",
170
+ " .dataframe thead th {\n",
171
+ " text-align: right;\n",
172
+ " }\n",
173
+ "</style>\n",
174
+ "<table border=\"1\" class=\"dataframe\">\n",
175
+ " <thead>\n",
176
+ " <tr style=\"text-align: right;\">\n",
177
+ " <th></th>\n",
178
+ " <th>id</th>\n",
179
+ " <th>comment_text</th>\n",
180
+ " <th>toxic</th>\n",
181
+ " <th>severe_toxic</th>\n",
182
+ " <th>obscene</th>\n",
183
+ " <th>threat</th>\n",
184
+ " <th>insult</th>\n",
185
+ " <th>identity_hate</th>\n",
186
+ " </tr>\n",
187
+ " </thead>\n",
188
+ " <tbody>\n",
189
+ " <tr>\n",
190
+ " <th>0</th>\n",
191
+ " <td>0000997932d777bf</td>\n",
192
+ " <td>Explanation\\nWhy the edits made under my usern...</td>\n",
193
+ " <td>0</td>\n",
194
+ " <td>0</td>\n",
195
+ " <td>0</td>\n",
196
+ " <td>0</td>\n",
197
+ " <td>0</td>\n",
198
+ " <td>0</td>\n",
199
+ " </tr>\n",
200
+ " <tr>\n",
201
+ " <th>1</th>\n",
202
+ " <td>000103f0d9cfb60f</td>\n",
203
+ " <td>D'aww! He matches this background colour I'm s...</td>\n",
204
+ " <td>0</td>\n",
205
+ " <td>0</td>\n",
206
+ " <td>0</td>\n",
207
+ " <td>0</td>\n",
208
+ " <td>0</td>\n",
209
+ " <td>0</td>\n",
210
+ " </tr>\n",
211
+ " <tr>\n",
212
+ " <th>2</th>\n",
213
+ " <td>000113f07ec002fd</td>\n",
214
+ " <td>Hey man, I'm really not trying to edit war. It...</td>\n",
215
+ " <td>0</td>\n",
216
+ " <td>0</td>\n",
217
+ " <td>0</td>\n",
218
+ " <td>0</td>\n",
219
+ " <td>0</td>\n",
220
+ " <td>0</td>\n",
221
+ " </tr>\n",
222
+ " <tr>\n",
223
+ " <th>3</th>\n",
224
+ " <td>0001b41b1c6bb37e</td>\n",
225
+ " <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
226
+ " <td>0</td>\n",
227
+ " <td>0</td>\n",
228
+ " <td>0</td>\n",
229
+ " <td>0</td>\n",
230
+ " <td>0</td>\n",
231
+ " <td>0</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>4</th>\n",
235
+ " <td>0001d958c54c6e35</td>\n",
236
+ " <td>You, sir, are my hero. Any chance you remember...</td>\n",
237
+ " <td>0</td>\n",
238
+ " <td>0</td>\n",
239
+ " <td>0</td>\n",
240
+ " <td>0</td>\n",
241
+ " <td>0</td>\n",
242
+ " <td>0</td>\n",
243
+ " </tr>\n",
244
+ " </tbody>\n",
245
+ "</table>\n",
246
+ "</div>"
247
+ ],
248
+ "text/plain": [
249
+ " id comment_text toxic \n",
250
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n",
251
+ "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n",
252
+ "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n",
253
+ "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n",
254
+ "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n",
255
+ "\n",
256
+ " severe_toxic obscene threat insult identity_hate \n",
257
+ "0 0 0 0 0 0 \n",
258
+ "1 0 0 0 0 0 \n",
259
+ "2 0 0 0 0 0 \n",
260
+ "3 0 0 0 0 0 \n",
261
+ "4 0 0 0 0 0 "
262
+ ]
263
+ },
264
+ "execution_count": 6,
265
+ "metadata": {},
266
+ "output_type": "execute_result"
267
+ }
268
+ ],
269
+ "source": [
270
+ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
271
+ "data = pd.read_csv(Config.FILE_NAME)\n",
272
+ "data.head()"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 7,
278
+ "metadata": {
279
+ "execution": {
280
+ "iopub.execute_input": "2023-05-03T14:16:37.492444Z",
281
+ "iopub.status.busy": "2023-05-03T14:16:37.491342Z",
282
+ "iopub.status.idle": "2023-05-03T14:16:37.533400Z",
283
+ "shell.execute_reply": "2023-05-03T14:16:37.532235Z",
284
+ "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z"
285
+ }
286
+ },
287
+ "outputs": [
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "<class 'pandas.core.frame.DataFrame'>\n",
293
+ "RangeIndex: 159571 entries, 0 to 159570\n",
294
+ "Data columns (total 8 columns):\n",
295
+ " # Column Non-Null Count Dtype \n",
296
+ "--- ------ -------------- ----- \n",
297
+ " 0 id 159571 non-null object\n",
298
+ " 1 comment_text 159571 non-null object\n",
299
+ " 2 toxic 159571 non-null int64 \n",
300
+ " 3 severe_toxic 159571 non-null int64 \n",
301
+ " 4 obscene 159571 non-null int64 \n",
302
+ " 5 threat 159571 non-null int64 \n",
303
+ " 6 insult 159571 non-null int64 \n",
304
+ " 7 identity_hate 159571 non-null int64 \n",
305
+ "dtypes: int64(6), object(2)\n",
306
+ "memory usage: 9.7+ MB\n"
307
+ ]
308
+ }
309
+ ],
310
+ "source": [
311
+ "data.info()"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": 8,
317
+ "metadata": {
318
+ "execution": {
319
+ "iopub.execute_input": "2023-05-03T14:16:41.586932Z",
320
+ "iopub.status.busy": "2023-05-03T14:16:41.585997Z",
321
+ "iopub.status.idle": "2023-05-03T14:16:41.618902Z",
322
+ "shell.execute_reply": "2023-05-03T14:16:41.617979Z",
323
+ "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z"
324
+ }
325
+ },
326
+ "outputs": [
327
+ {
328
+ "data": {
329
+ "text/plain": [
330
+ "id 0\n",
331
+ "comment_text 0\n",
332
+ "toxic 0\n",
333
+ "severe_toxic 0\n",
334
+ "obscene 0\n",
335
+ "threat 0\n",
336
+ "insult 0\n",
337
+ "identity_hate 0\n",
338
+ "dtype: int64"
339
+ ]
340
+ },
341
+ "execution_count": 8,
342
+ "metadata": {},
343
+ "output_type": "execute_result"
344
+ }
345
+ ],
346
+ "source": [
347
+ "data.isnull().sum()"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 9,
353
+ "metadata": {
354
+ "execution": {
355
+ "iopub.execute_input": "2023-05-03T14:16:44.561198Z",
356
+ "iopub.status.busy": "2023-05-03T14:16:44.560414Z",
357
+ "iopub.status.idle": "2023-05-03T14:16:44.586487Z",
358
+ "shell.execute_reply": "2023-05-03T14:16:44.585582Z",
359
+ "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z"
360
+ }
361
+ },
362
+ "outputs": [
363
+ {
364
+ "data": {
365
+ "text/html": [
366
+ "<div>\n",
367
+ "<style scoped>\n",
368
+ " .dataframe tbody tr th:only-of-type {\n",
369
+ " vertical-align: middle;\n",
370
+ " }\n",
371
+ "\n",
372
+ " .dataframe tbody tr th {\n",
373
+ " vertical-align: top;\n",
374
+ " }\n",
375
+ "\n",
376
+ " .dataframe thead th {\n",
377
+ " text-align: right;\n",
378
+ " }\n",
379
+ "</style>\n",
380
+ "<table border=\"1\" class=\"dataframe\">\n",
381
+ " <thead>\n",
382
+ " <tr style=\"text-align: right;\">\n",
383
+ " <th></th>\n",
384
+ " <th>0</th>\n",
385
+ " <th>1</th>\n",
386
+ " </tr>\n",
387
+ " </thead>\n",
388
+ " <tbody>\n",
389
+ " <tr>\n",
390
+ " <th>toxic</th>\n",
391
+ " <td>144277</td>\n",
392
+ " <td>15294</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>severe_toxic</th>\n",
396
+ " <td>157976</td>\n",
397
+ " <td>1595</td>\n",
398
+ " </tr>\n",
399
+ " <tr>\n",
400
+ " <th>obscene</th>\n",
401
+ " <td>151122</td>\n",
402
+ " <td>8449</td>\n",
403
+ " </tr>\n",
404
+ " <tr>\n",
405
+ " <th>threat</th>\n",
406
+ " <td>159093</td>\n",
407
+ " <td>478</td>\n",
408
+ " </tr>\n",
409
+ " <tr>\n",
410
+ " <th>insult</th>\n",
411
+ " <td>151694</td>\n",
412
+ " <td>7877</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <th>identity_hate</th>\n",
416
+ " <td>158166</td>\n",
417
+ " <td>1405</td>\n",
418
+ " </tr>\n",
419
+ " </tbody>\n",
420
+ "</table>\n",
421
+ "</div>"
422
+ ],
423
+ "text/plain": [
424
+ " 0 1\n",
425
+ "toxic 144277 15294\n",
426
+ "severe_toxic 157976 1595\n",
427
+ "obscene 151122 8449\n",
428
+ "threat 159093 478\n",
429
+ "insult 151694 7877\n",
430
+ "identity_hate 158166 1405"
431
+ ]
432
+ },
433
+ "execution_count": 9,
434
+ "metadata": {},
435
+ "output_type": "execute_result"
436
+ }
437
+ ],
438
+ "source": [
439
+ "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 10,
445
+ "metadata": {
446
+ "execution": {
447
+ "iopub.execute_input": "2023-05-03T14:16:51.639830Z",
448
+ "iopub.status.busy": "2023-05-03T14:16:51.639059Z",
449
+ "iopub.status.idle": "2023-05-03T14:16:51.658065Z",
450
+ "shell.execute_reply": "2023-05-03T14:16:51.657049Z",
451
+ "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z"
452
+ }
453
+ },
454
+ "outputs": [
455
+ {
456
+ "name": "stdout",
457
+ "output_type": "stream",
458
+ "text": [
459
+ "toxic value count\n",
460
+ "--------------------\n",
461
+ "0: 144277 | 90.42 %\n",
462
+ "1: 15294 | 9.58 %\n",
463
+ "\n",
464
+ "severe_toxic value count\n",
465
+ "--------------------\n",
466
+ "0: 157976 | 99.0 %\n",
467
+ "1: 1595 | 1.0 %\n",
468
+ "\n",
469
+ "obscene value count\n",
470
+ "--------------------\n",
471
+ "0: 151122 | 94.71 %\n",
472
+ "1: 8449 | 5.29 %\n",
473
+ "\n",
474
+ "threat value count\n",
475
+ "--------------------\n",
476
+ "0: 159093 | 99.7 %\n",
477
+ "1: 478 | 0.3 %\n",
478
+ "\n",
479
+ "insult value count\n",
480
+ "--------------------\n",
481
+ "0: 151694 | 95.06 %\n",
482
+ "1: 7877 | 4.94 %\n",
483
+ "\n",
484
+ "identity_hate value count\n",
485
+ "--------------------\n",
486
+ "0: 158166 | 99.12 %\n",
487
+ "1: 1405 | 0.88 %\n",
488
+ "\n"
489
+ ]
490
+ }
491
+ ],
492
+ "source": [
493
+ "for column in data.columns:\n",
494
+ " if data[column].dtype != 'O':\n",
495
+ " value_count = data[column].value_counts()\n",
496
+ " print(f\"{column} value count\\n{'--'*10}\")\n",
497
+ " print(f\"0: {value_count[0]} | {round((value_count[0]/data.shape[0])*100,2)} %\\n\"\n",
498
+ " f\"1: {value_count[1]} | {round((value_count[1]/data.shape[0])*100,2)} %\\n\")"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": null,
504
+ "metadata": {},
505
+ "outputs": [],
506
+ "source": [
507
+ "data[\"text_len\"] = data[\"comment_text\"].apply(lambda x: len(x.split()))\n",
508
+ "data[data[\"text_len\"]==data[\"text_len\"].max()]['comment_text']"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 11,
514
+ "metadata": {
515
+ "execution": {
516
+ "iopub.execute_input": "2023-05-03T14:16:58.642154Z",
517
+ "iopub.status.busy": "2023-05-03T14:16:58.641279Z",
518
+ "iopub.status.idle": "2023-05-03T14:16:58.648851Z",
519
+ "shell.execute_reply": "2023-05-03T14:16:58.647773Z",
520
+ "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z"
521
+ }
522
+ },
523
+ "outputs": [],
524
+ "source": [
525
+ "X = data['comment_text']\n",
526
+ "y = data[data.columns[2:]].values"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "code",
531
+ "execution_count": 12,
532
+ "metadata": {
533
+ "execution": {
534
+ "iopub.execute_input": "2023-05-03T14:17:02.919383Z",
535
+ "iopub.status.busy": "2023-05-03T14:17:02.918865Z",
536
+ "iopub.status.idle": "2023-05-03T14:17:02.927191Z",
537
+ "shell.execute_reply": "2023-05-03T14:17:02.926293Z",
538
+ "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z"
539
+ }
540
+ },
541
+ "outputs": [
542
+ {
543
+ "data": {
544
+ "text/plain": [
545
+ "0 Explanation\\nWhy the edits made under my usern...\n",
546
+ "1 D'aww! He matches this background colour I'm s...\n",
547
+ "2 Hey man, I'm really not trying to edit war. It...\n",
548
+ "3 \"\\nMore\\nI can't make any real suggestions on ...\n",
549
+ "4 You, sir, are my hero. Any chance you remember...\n",
550
+ " ... \n",
551
+ "159566 \":::::And for the second time of asking, when ...\n",
552
+ "159567 You should be ashamed of yourself \\n\\nThat is ...\n",
553
+ "159568 Spitzer \\n\\nUmm, theres no actual article for ...\n",
554
+ "159569 And it looks like it was actually you who put ...\n",
555
+ "159570 \"\\nAnd ... I really don't think you understand...\n",
556
+ "Name: comment_text, Length: 159571, dtype: object"
557
+ ]
558
+ },
559
+ "execution_count": 12,
560
+ "metadata": {},
561
+ "output_type": "execute_result"
562
+ }
563
+ ],
564
+ "source": [
565
+ "X"
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "execution_count": 13,
571
+ "metadata": {
572
+ "execution": {
573
+ "iopub.execute_input": "2023-05-03T14:17:08.246451Z",
574
+ "iopub.status.busy": "2023-05-03T14:17:08.245491Z",
575
+ "iopub.status.idle": "2023-05-03T14:17:08.252604Z",
576
+ "shell.execute_reply": "2023-05-03T14:17:08.251608Z",
577
+ "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z"
578
+ }
579
+ },
580
+ "outputs": [
581
+ {
582
+ "data": {
583
+ "text/plain": [
584
+ "array([[0, 0, 0, 0, 0, 0],\n",
585
+ " [0, 0, 0, 0, 0, 0],\n",
586
+ " [0, 0, 0, 0, 0, 0],\n",
587
+ " ...,\n",
588
+ " [0, 0, 0, 0, 0, 0],\n",
589
+ " [0, 0, 0, 0, 0, 0],\n",
590
+ " [0, 0, 0, 0, 0, 0]])"
591
+ ]
592
+ },
593
+ "execution_count": 13,
594
+ "metadata": {},
595
+ "output_type": "execute_result"
596
+ }
597
+ ],
598
+ "source": [
599
+ "y"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "markdown",
604
+ "metadata": {},
605
+ "source": [
606
+ "### Text Preprocessing"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 15,
612
+ "metadata": {
613
+ "execution": {
614
+ "iopub.execute_input": "2023-05-03T14:17:25.208007Z",
615
+ "iopub.status.busy": "2023-05-03T14:17:25.207157Z",
616
+ "iopub.status.idle": "2023-05-03T14:17:25.220446Z",
617
+ "shell.execute_reply": "2023-05-03T14:17:25.219390Z",
618
+ "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z"
619
+ }
620
+ },
621
+ "outputs": [],
622
+ "source": [
623
+ "class Text_Cleaner:\n",
624
+ " def __init__(self, data):\n",
625
+ " self.data = data\n",
626
+ " self.STOPWORDS = stopwords.words('english')\n",
627
+ " self.wordnet = WordNetLemmatizer()\n",
628
+ " \n",
629
+ " def new_line_code(self, x:str)->str:\n",
630
+ " pattern = \"\\n\"\n",
631
+ " x = re.sub(pattern,' ', x).strip().lower()\n",
632
+ " return x\n",
633
+ "\n",
634
+ " def remove_punctuations(self, x:str)->str:\n",
635
+ " x = x.translate(str.maketrans('','',string.punctuation))\n",
636
+ " return x\n",
637
+ "\n",
638
+ " def remove_stopwords(self, x:str)->str:\n",
639
+ " sent=[]\n",
640
+ " for word in x.split():\n",
641
+ " if word not in self.STOPWORDS:\n",
642
+ " sent.append(word)\n",
643
+ " return ' '.join(sent)\n",
644
+ "\n",
645
+ " def lemmatization(self, x:str)->str:\n",
646
+ " sent=[]\n",
647
+ " for word in x.split():\n",
648
+ " sent.append(self.wordnet.lemmatize(word))\n",
649
+ " return ' '.join(sent)\n",
650
+ " \n",
651
+ " def clean_text(self):\n",
652
+ " self.data = self.data.apply(self.new_line_code)\n",
653
+ " self.data = self.data.apply(self.remove_punctuations)\n",
654
+ " self.data = self.data.apply(self.remove_stopwords)\n",
655
+ " self.data = self.data.apply(self.lemmatization)\n",
656
+ " self.data = self.data.apply(lambda x: x.strip())\n",
657
+ " return self.data"
658
+ ]
659
+ },
660
+ {
661
+ "cell_type": "code",
662
+ "execution_count": 16,
663
+ "metadata": {
664
+ "execution": {
665
+ "iopub.execute_input": "2023-05-03T14:17:28.812213Z",
666
+ "iopub.status.busy": "2023-05-03T14:17:28.811115Z",
667
+ "iopub.status.idle": "2023-05-03T14:18:45.134664Z",
668
+ "shell.execute_reply": "2023-05-03T14:18:45.133093Z",
669
+ "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z"
670
+ }
671
+ },
672
+ "outputs": [],
673
+ "source": [
674
+ "X = Text_Cleaner(X).clean_text()"
675
+ ]
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "execution_count": 17,
680
+ "metadata": {
681
+ "execution": {
682
+ "iopub.execute_input": "2023-05-03T14:19:08.971107Z",
683
+ "iopub.status.busy": "2023-05-03T14:19:08.969951Z",
684
+ "iopub.status.idle": "2023-05-03T14:19:08.979371Z",
685
+ "shell.execute_reply": "2023-05-03T14:19:08.978320Z",
686
+ "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z"
687
+ }
688
+ },
689
+ "outputs": [
690
+ {
691
+ "data": {
692
+ "text/plain": [
693
+ "0 explanation edits made username hardcore metal...\n",
694
+ "1 daww match background colour im seemingly stuc...\n",
695
+ "2 hey man im really trying edit war guy constant...\n",
696
+ "3 cant make real suggestion improvement wondered...\n",
697
+ "4 sir hero chance remember page thats\n",
698
+ " ... \n",
699
+ "159566 second time asking view completely contradicts...\n",
700
+ "159567 ashamed horrible thing put talk page 128611993\n",
701
+ "159568 spitzer umm there actual article prostitution ...\n",
702
+ "159569 look like actually put speedy first version de...\n",
703
+ "159570 really dont think understand came idea bad rig...\n",
704
+ "Name: comment_text, Length: 159571, dtype: object"
705
+ ]
706
+ },
707
+ "execution_count": 17,
708
+ "metadata": {},
709
+ "output_type": "execute_result"
710
+ }
711
+ ],
712
+ "source": [
713
+ "X"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "markdown",
718
+ "metadata": {},
719
+ "source": [
720
+ "### Model Building"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": null,
726
+ "metadata": {},
727
+ "outputs": [],
728
+ "source": []
729
+ },
730
+ {
731
+ "cell_type": "code",
732
+ "execution_count": 37,
733
+ "metadata": {
734
+ "execution": {
735
+ "iopub.execute_input": "2023-05-03T14:42:24.692312Z",
736
+ "iopub.status.busy": "2023-05-03T14:42:24.691267Z",
737
+ "iopub.status.idle": "2023-05-03T14:42:24.709520Z",
738
+ "shell.execute_reply": "2023-05-03T14:42:24.708295Z",
739
+ "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z"
740
+ }
741
+ },
742
+ "outputs": [],
743
+ "source": [
744
+ "dataset = tf.data.Dataset.from_tensor_slices((X, y))\n",
745
+ "dataset = dataset.cache()\n",
746
+ "dataset = dataset.shuffle(Config.BUFFER_SIZE, reshuffle_each_iteration=False)\n",
747
+ "dataset = dataset.batch(Config.BATCH_SIZE)\n",
748
+ "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 38,
754
+ "metadata": {
755
+ "execution": {
756
+ "iopub.execute_input": "2023-05-03T14:42:27.187117Z",
757
+ "iopub.status.busy": "2023-05-03T14:42:27.185929Z",
758
+ "iopub.status.idle": "2023-05-03T14:42:27.196570Z",
759
+ "shell.execute_reply": "2023-05-03T14:42:27.195443Z",
760
+ "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z"
761
+ }
762
+ },
763
+ "outputs": [],
764
+ "source": [
765
+ "train = dataset.take(int(len(dataset)*0.8))\n",
766
+ "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
767
+ "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
768
+ ]
769
+ },
770
+ {
771
+ "cell_type": "code",
772
+ "execution_count": 35,
773
+ "metadata": {
774
+ "execution": {
775
+ "iopub.execute_input": "2023-05-03T14:41:54.920944Z",
776
+ "iopub.status.busy": "2023-05-03T14:41:54.920085Z",
777
+ "iopub.status.idle": "2023-05-03T14:41:54.928526Z",
778
+ "shell.execute_reply": "2023-05-03T14:41:54.927502Z",
779
+ "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z"
780
+ }
781
+ },
782
+ "outputs": [],
783
+ "source": [
784
+ "def create_model(vectorizer):\n",
785
+ " LAYERS = [\n",
786
+ " vectorizer,\n",
787
+ " Embedding(Config.VOCAB_SIZE+1, 32),\n",
788
+ " Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
789
+ " Bidirectional(LSTM(32)),\n",
790
+ " Dense(128, activation='relu'),\n",
791
+ " Dropout(0.1),\n",
792
+ " Dense(256, activation='relu'),\n",
793
+ " Dropout(0.1),\n",
794
+ " Dense(128, activation='relu'),\n",
795
+ " Dense(6, activation='sigmoid')]\n",
796
+ " \n",
797
+ " model = Sequential(LAYERS)\n",
798
+ " return model"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 34,
804
+ "metadata": {
805
+ "execution": {
806
+ "iopub.execute_input": "2023-05-03T14:41:41.900942Z",
807
+ "iopub.status.busy": "2023-05-03T14:41:41.900504Z",
808
+ "iopub.status.idle": "2023-05-03T14:41:41.908480Z",
809
+ "shell.execute_reply": "2023-05-03T14:41:41.907187Z",
810
+ "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z"
811
+ }
812
+ },
813
+ "outputs": [],
814
+ "source": [
815
+ "def callbacks(base_dir=\".\"):\n",
816
+ " early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
817
+ " ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
818
+ " os.makedirs(ckpt_file,exist_ok=True)\n",
819
+ "\n",
820
+ " ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
821
+ " filepath = ckpt_file,\n",
822
+ " save_best_only = True)\n",
823
+ "\n",
824
+ " callback_list = [early_stopping,\n",
825
+ " ckpt_cb]\n",
826
+ " return callback_list\n",
827
+ "callbacks_list = callbacks()"
828
+ ]
829
+ },
830
+ {
831
+ "cell_type": "code",
832
+ "execution_count": 36,
833
+ "metadata": {
834
+ "execution": {
835
+ "iopub.execute_input": "2023-05-03T14:42:07.719948Z",
836
+ "iopub.status.busy": "2023-05-03T14:42:07.719137Z",
837
+ "iopub.status.idle": "2023-05-03T14:42:09.288990Z",
838
+ "shell.execute_reply": "2023-05-03T14:42:09.287682Z",
839
+ "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z"
840
+ }
841
+ },
842
+ "outputs": [],
843
+ "source": [
844
+ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
845
+ " output_sequence_length=Config.OUTPUT_DIM,\n",
846
+ " output_mode='int')\n",
847
+ "vectorizer.adapt(X.values)\n",
848
+ "\n",
849
+ "model = create_model(vectorizer)\n",
850
+ "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
851
+ " loss=tf.keras.losses.binary_crossentropy,\n",
852
+ " metrics=[tf.keras.metrics.AUC(multi_label=True, num_labels=6)])"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": null,
858
+ "metadata": {
859
+ "execution": {
860
+ "iopub.execute_input": "2023-05-03T14:42:34.084064Z",
861
+ "iopub.status.busy": "2023-05-03T14:42:34.083255Z",
862
+ "iopub.status.idle": "2023-05-03T14:42:34.110375Z",
863
+ "shell.execute_reply": "2023-05-03T14:42:34.109380Z",
864
+ "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z"
865
+ }
866
+ },
867
+ "outputs": [],
868
+ "source": [
869
+ "model.summary()"
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "execution_count": null,
875
+ "metadata": {
876
+ "execution": {
877
+ "iopub.execute_input": "2023-05-03T14:42:42.306143Z",
878
+ "iopub.status.busy": "2023-05-03T14:42:42.305188Z",
879
+ "iopub.status.idle": "2023-05-03T18:36:14.400588Z",
880
+ "shell.execute_reply": "2023-05-03T18:36:14.399250Z",
881
+ "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z"
882
+ }
883
+ },
884
+ "outputs": [],
885
+ "source": [
886
+ "history = model.fit(train, \n",
887
+ " epochs=Config.EPOCHS,\n",
888
+ " steps_per_epoch=len(train),\n",
889
+ " validation_data=val,\n",
890
+ " callbacks=callbacks_list)"
891
+ ]
892
+ },
893
+ {
894
+ "cell_type": "code",
895
+ "execution_count": 42,
896
+ "metadata": {
897
+ "execution": {
898
+ "iopub.execute_input": "2023-05-03T18:36:42.693133Z",
899
+ "iopub.status.busy": "2023-05-03T18:36:42.692246Z",
900
+ "iopub.status.idle": "2023-05-03T18:36:42.702544Z",
901
+ "shell.execute_reply": "2023-05-03T18:36:42.701196Z",
902
+ "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z"
903
+ }
904
+ },
905
+ "outputs": [],
906
+ "source": [
907
+ "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
+ " from sklearn.metrics import precision_score, recall_score\n",
908
+ " y_pred = model.predict(pred_data)\n",
909
+ " try:\n",
910
+ " precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
911
+ " recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
912
+ " f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
913
+ " auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
914
+ " except Exception as e:\n",
915
+ " print(e)\n",
916
+ " \n",
917
+ " print(f\"Precision: {precision}\\n\"\n",
918
+ " f\"Recall: {recall}\\n\"\n",
919
+ " f\"F1-Score: {f1}\\n\"\n",
920
+ " f\"ROC-AUC-Score: {auc}\")\n",
921
+ " return (precision, recall, f1, auc)"
922
+ ]
923
+ },
924
+ {
925
+ "cell_type": "code",
926
+ "execution_count": 41,
927
+ "metadata": {
928
+ "execution": {
929
+ "iopub.execute_input": "2023-05-03T18:36:28.884733Z",
930
+ "iopub.status.busy": "2023-05-03T18:36:28.883953Z",
931
+ "iopub.status.idle": "2023-05-03T18:36:29.233282Z",
932
+ "shell.execute_reply": "2023-05-03T18:36:29.231964Z",
933
+ "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z"
934
+ }
935
+ },
936
+ "outputs": [],
937
+ "source": [
938
+ "model.save(\"model_5\", save_format='tf')"
939
+ ]
940
+ },
941
+ {
942
+ "cell_type": "code",
943
+ "execution_count": null,
944
+ "metadata": {
945
+ "execution": {
946
+ "iopub.execute_input": "2023-05-03T18:51:24.530412Z",
947
+ "iopub.status.busy": "2023-05-03T18:51:24.529307Z",
948
+ "iopub.status.idle": "2023-05-03T19:20:36.675080Z",
949
+ "shell.execute_reply": "2023-05-03T19:20:36.673739Z",
950
+ "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z"
951
+ },
952
+ "scrolled": true
953
+ },
954
+ "outputs": [],
955
+ "source": [
956
+ "x_train = np.concatenate([x for x, y in train])\n",
957
+ "y_train = np.concatenate([y for x, y in train])\n",
958
+ "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
959
+ ]
960
+ },
961
+ {
962
+ "cell_type": "code",
963
+ "execution_count": null,
964
+ "metadata": {
965
+ "execution": {
966
+ "iopub.execute_input": "2023-05-03T18:49:02.718178Z",
967
+ "iopub.status.busy": "2023-05-03T18:49:02.717234Z",
968
+ "iopub.status.idle": "2023-05-03T18:49:50.438077Z",
969
+ "shell.execute_reply": "2023-05-03T18:49:50.436458Z",
970
+ "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z"
971
+ },
972
+ "scrolled": true
973
+ },
974
+ "outputs": [],
975
+ "source": [
976
+ "x_val = np.concatenate([x for x, y in val])\n",
977
+ "y_val = np.concatenate([y for x, y in val])\n",
978
+ "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
979
+ ]
980
+ }
981
+ ],
982
+ "metadata": {
983
+ "kernelspec": {
984
+ "display_name": "Python 3 (ipykernel)",
985
+ "language": "python",
986
+ "name": "python3"
987
+ },
988
+ "language_info": {
989
+ "codemirror_mode": {
990
+ "name": "ipython",
991
+ "version": 3
992
+ },
993
+ "file_extension": ".py",
994
+ "mimetype": "text/x-python",
995
+ "name": "python",
996
+ "nbconvert_exporter": "python",
997
+ "pygments_lexer": "ipython3",
998
+ "version": "3.9.12"
999
+ }
1000
+ },
1001
+ "nbformat": 4,
1002
+ "nbformat_minor": 4
1003
+ }
experiment_notebooks/Transformer-Roberta-Hidden-state.ipynb.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install evaluate seaborn datasets transformers[sentencepiece] huggingface -q","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D, Dropout\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel\n\nfrom huggingface_hub import notebook_login, push_to_hub_keras, from_pretrained_keras","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:26:47.419597Z","iopub.execute_input":"2023-05-12T09:26:47.420405Z","iopub.status.idle":"2023-05-12T09:27:19.516156Z","shell.execute_reply.started":"2023-05-12T09:26:47.420369Z","shell.execute_reply":"2023-05-12T09:27:19.515021Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stderr","text":"D0512 09:27:14.707565289 4094 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 09:27:14.707594232 4094 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 09:27:14.707597846 4094 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 09:27:14.707600855 4094 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 09:27:14.707603482 4094 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 09:27:14.707606134 4094 config.cc:119] gRPC EXPERIMENT 
unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 09:27:14.707608635 4094 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 09:27:14.707611107 4094 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 09:27:14.707613449 4094 config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 09:27:14.707615899 4094 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 09:27:14.707618252 4094 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 09:27:14.707620838 4094 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 09:27:14.707623337 4094 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 09:27:14.707625846 4094 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 09:27:14.707819467 4094 ev_epoll1_linux.cc:122] grpc epoll fd: 61\nD0512 09:27:14.707840561 4094 ev_posix.cc:144] Using polling engine: epoll1\nD0512 09:27:14.707863480 4094 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 09:27:14.713612859 4094 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 09:27:14.713623729 4094 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 09:27:14.713627418 4094 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 09:27:14.713630274 4094 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 09:27:14.713633606 4094 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 09:27:14.713636693 4094 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 09:27:14.713643493 4094 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 09:27:14.713661028 4094 
lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 09:27:14.713693399 4094 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 09:27:14.713706519 4094 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 09:27:14.713710276 4094 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 09:27:14.713713448 4094 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 09:27:14.713719112 4094 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 09:27:14.713722473 4094 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 09:27:14.713725524 4094 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 09:27:14.713729567 4094 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 09:27:14.716178145 4094 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 09:27:14.739649515 4522 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 09:27:14.756902832 4522 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {grpc_status:2, created_time:\"2023-05-12T09:27:14.756883083+00:00\"}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:27:32.819854Z","iopub.execute_input":"2023-05-12T09:27:32.820514Z","iopub.status.idle":"2023-05-12T09:27:41.564252Z","shell.execute_reply.started":"2023-05-12T09:27:32.820481Z","shell.execute_reply":"2023-05-12T09:27:41.563229Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 2\n MODEL = \"xlm-roberta-large\"\n BUFFER_SIZE = 1000\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 2e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:27:46.715057Z","iopub.execute_input":"2023-05-12T09:27:46.715825Z","iopub.status.idle":"2023-05-12T09:27:46.720839Z","shell.execute_reply.started":"2023-05-12T09:27:46.715793Z","shell.execute_reply":"2023-05-12T09:27:46.719700Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:27:50.289765Z","iopub.execute_input":"2023-05-12T09:27:50.290538Z","iopub.status.idle":"2023-05-12T09:28:07.709160Z","shell.execute_reply.started":"2023-05-12T09:27:50.290502Z","shell.execute_reply":"2023-05-12T09:28:07.708042Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, inplace=True)\nsub = test[['id']]\ntrain2.toxic = 
(train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:07.710969Z","iopub.execute_input":"2023-05-12T09:28:07.711413Z","iopub.status.idle":"2023-05-12T09:28:07.859006Z","shell.execute_reply.started":"2023-05-12T09:28:07.711382Z","shell.execute_reply":"2023-05-12T09:28:07.857738Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:07.860176Z","iopub.execute_input":"2023-05-12T09:28:07.860484Z","iopub.status.idle":"2023-05-12T09:28:08.221508Z","shell.execute_reply.started":"2023-05-12T09:28:07.860458Z","shell.execute_reply":"2023-05-12T09:28:08.220408Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:08.223917Z","iopub.execute_input":"2023-05-12T09:28:08.224303Z","iopub.status.idle":"2023-05-12T09:28:08.232987Z","shell.execute_reply.started":"2023-05-12T09:28:08.224260Z","shell.execute_reply":"2023-05-12T09:28:08.232058Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"(535775, 2)"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:08.233974Z","iopub.execute_input":"2023-05-12T09:28:08.234229Z","iopub.status.idle":"2023-05-12T09:28:08.245737Z","shell.execute_reply.started":"2023-05-12T09:28:08.234207Z","shell.execute_reply":"2023-05-12T09:28:08.244925Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = 
val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:08.246730Z","iopub.execute_input":"2023-05-12T09:28:08.247086Z","iopub.status.idle":"2023-05-12T09:28:09.760956Z","shell.execute_reply.started":"2023-05-12T09:28:08.247061Z","shell.execute_reply":"2023-05-12T09:28:09.759744Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:09.762337Z","iopub.execute_input":"2023-05-12T09:28:09.762701Z","iopub.status.idle":"2023-05-12T09:28:12.438704Z","shell.execute_reply.started":"2023-05-12T09:28:09.762670Z","shell.execute_reply":"2023-05-12T09:28:12.437787Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stdout","text":"56.28243572395129\n2321\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 
Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGdCAYAAAD+JxxnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+BklEQVR4nO3de3BU9f3/8VcSkw0BNuFiElICRFEhykVCDfutOlxCFsw4UqODymhEhIFv4jSkBZv+MNzawWK5WYJpqxA6SgU61VagIWuQUMsCEki5CaMWv7Ff2GAVWAiwWZLz+6OT82UJQhYWVjnPx8xO3fN5n7Of/byT8OrZc5IIwzAMAQAAWFBkuCcAAAAQLgQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWbeEewLfZs3NzTpy5Ig6duyoiIiIcE8HAAC0gWEYOnXqlFJSUhQZeflzPgShyzhy5IhSU1PDPQ0AAHAVvvjiC3Xv3v2yNQShy+jYsaOk/yyk3W4P6bH9fr8qKyuVnZ2t6OjokB4bbUMPwo8ehB89CD96EHper1epqanmv+OXQxC6jJaPw+x2+3UJQnFxcbLb7Xzhhwk9CD96EH70IPzowfXTlstauFgaAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABY1i3hnoDV3TNro3xNEUHv9/nLOddhNgAAWAtnhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGVdUxB6+eWXFRERocLCQnPbuXPnlJ+fry5duqhDhw7Kzc1VfX19wH51dXXKyclRXFycEhMTNW3aNJ0/fz6gZvPmzRo0aJBsNpt69+6t8vLyVq9fWlqqXr16KTY2VpmZmdqxY0fAeFvmAgAArOuqg9BHH32k3/zmN+rfv3/A9qlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7uqIHT69GmNGzdOv/vd79SpUydz+8mTJ/XGG29o4cKFGj58uDIyMrRixQpt3bpV27ZtkyRVVlbqwIEDevPNNzVw4ECNHj1ac+fOVWlpqRobGyVJZWVlSktL04IFC9S3b18VFBToscce06JFi8zXWrhwoSZOnKjx48crPT1dZWVliouL0/Lly9s8FwAAYG1XFYTy8/OVk5OjrKysgO01NTXy+/0B2/v06aMePXrI7XZLktxut/r166ekpCSzxul0yuv1av/+/WbNxcd2Op3mMRobG1VTUxNQExkZqaysLLOmLXMBAADWdkuwO7z99tvatWuXPvroo1ZjHo9HMTExSkhICNielJQkj8dj1lwYglrGW8YuV+P1enX27FkdP35cTU1Nl6w5ePBgm+dyMZ/PJ5/PZz73er2SJL/fL7/ff8l9rlbL8WyRxjXtj6vXsoasZfjQg/CjB+FHD0IvmLUMKgh98cUX+tGPfiSXy6XY2NigJ/ZtN2/ePM2ePbvV9srKSsXFxV2X15w7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtUEFoZqaGh07dkyDBg0ytzU1NWnLli1aunSpNm7cqMbGRp04cSLgTEx9fb2Sk5MlScnJya3u7mq5k+vCmovv7qqvr5fdble7du0UFRWlqKioS9ZceIwrzeVixcXFKioqMp97vV6lpqYqOztbdru9LUvUZn6/Xy6XSy/tjJSvOSLo/ffNcoZ0PlbU0oORI0cqOjo63NOxJHoQfvQg/OhB6LV8otMWQQWh
ESNGaO/evQHbxo8frz59+ujFF19UamqqoqOjVVVVpdzcXEnSoUOHVFdXJ4fDIUlyOBz6xS9+oWPHjikxMVHSf1Kw3W5Xenq6WXPxGQ+Xy2UeIyYmRhkZGaqqqtKYMWMkSc3NzaqqqlJBQYEkKSMj44pzuZjNZpPNZmu1PTo6+rp9cfqaI+RrCj4I8c0SOtezv2gbehB+9CD86EHoBLOOQQWhjh076p577gnY1r59e3Xp0sXcPmHCBBUVFalz586y2+164YUX5HA4NGTIEElSdna20tPT9fTTT2v+/PnyeDyaMWOG8vPzzRAyefJkLV26VNOnT9dzzz2nTZs2ac2aNVq/fr35ukVFRcrLy9PgwYN13333afHixWpoaND48eMlSfHx8VecCwAAsLagL5a+kkWLFikyMlK5ubny+XxyOp1atmyZOR4VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv/7OGjs2LH68ssvVVJSIo/Ho4EDB6qioiLgAuorzQUAAFjbNQehzZs3BzyPjY1VaWmpSktLv3Gfnj17XvFi36FDh2r37t2XrSkoKDA/CruUtswFAABYF39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQei1115T//79ZbfbZbfb5XA49Ne//tUcHzp0qCIiIgIekydPDjhGXV2dcnJyFBcXp8TERE2bNk3nz58PqNm8ebMGDRokm82m3r17q7y8vNVcSktL1atXL8XGxiozM1M7duwIGD937pzy8/PVpUsXdejQQbm5uaqvrw/m7QIAgJtcUEGoe/fuevnll1VTU6OdO3dq+PDheuSRR7R//36zZuLEiTp69Kj5mD9/vjnW1NSknJwcNTY2auvWrVq5cqXKy8tVUlJi1hw+fFg5OTkaNmyYamtrVVhYqOeff14bN240a1avXq2ioiLNnDlTu3bt0oABA+R0OnXs2DGzZurUqXrvvfe0du1aVVdX68iRI3r00UevapEAAMDNKagg9PDDD+uhhx7SHXfcoTvvvFO/+MUv1KFDB23bts2siYuLU3Jysvmw2+3mWGVlpQ4cOKA333xTAwcO1OjRozV37lyVlpaqsbFRklRWVqa0tDQtWLBAffv2VUFBgR577DEtWrTIPM7ChQs1ceJEjR8/Xunp6SorK1NcXJyWL18uSTp58qTeeOMNLVy4UMOHD1dGRoZWrFihrVu3BswVAABY2y1Xu2NTU5PWrl2rhoYGORwOc/tbb72lN998U8nJyXr44Yf10ksvKS4uTpLkdrvVr18/JSUlmfVOp1NTpkzR/v37de+998rtdisrKyvgtZxOpwoLCyVJjY2NqqmpUXFxsTkeGRmprKwsud1uSVJNTY38fn/Acfr06aMePXrI7XZryJAhl3xPPp9PPp/PfO71eiVJfr9ffr//apbpG7UczxZpXNP+uHota8hahg89CD96EH70IPSCWcugg9DevXvlcDh07tw5dejQQe+8847S09MlSU899ZR69uyplJQU7dmzRy+++KIOHTqkP/3pT5Ikj8cTEIIkmc89Hs9la7xer86ePavjx4+rqanpkjUHDx40jxETE6OEhIRWNS2vcynz5s3T7NmzW22vrKw0w1yozR3cfFX7bdiwIcQzsS6XyxXuKVgePQg/ehB+9CB0zpw50+baoIPQXXfdpdraWp08eVJ//OMflZeXp+rqaqWnp2vSpElmXb9+/dStWzeNGDFCn332mW6//fZgX+qGKy4uVlFRkfnc
6/UqNTVV2dnZAR/xhYLf75fL5dJLOyPla44Iev99s5whnY8VtfRg5MiRio6ODvd0LIkehB89CD96EHotn+i0RdBBKCYmRr1795YkZWRk6KOPPtKSJUv0m9/8plVtZmamJOnTTz/V7bffruTk5FZ3d7XcyZWcnGz+78V3d9XX18tut6tdu3aKiopSVFTUJWsuPEZjY6NOnDgRcFbowppLsdlsstlsrbZHR0dfty9OX3OEfE3BByG+WULnevYXbUMPwo8ehB89CJ1g1vGaf49Qc3NzwHU1F6qtrZUkdevWTZLkcDi0d+/egLu7XC6X7Ha7+fGaw+FQVVVVwHFcLpd5HVJMTIwyMjICapqbm1VVVWXWZGRkKDo6OqDm0KFDqqurC7ieCQAAWFtQZ4SKi4s1evRo9ejRQ6dOndKqVau0efNmbdy4UZ999plWrVqlhx56SF26dNGePXs0depUPfjgg+rfv78kKTs7W+np6Xr66ac1f/58eTwezZgxQ/n5+eaZmMmTJ2vp0qWaPn26nnvuOW3atElr1qzR+vXrzXkUFRUpLy9PgwcP1n333afFixeroaFB48ePlyTFx8drwoQJKioqUufOnWW32/XCCy/I4XB844XSAADAeoIKQseOHdMzzzyjo0ePKj4+Xv3799fGjRs1cuRIffHFF3r//ffNUJKamqrc3FzNmDHD3D8qKkrr1q3TlClT5HA41L59e+Xl5WnOnDlmTVpamtavX6+pU6dqyZIl6t69u15//XU5nf93TczYsWP15ZdfqqSkRB6PRwMHDlRFRUXABdSLFi1SZGSkcnNz5fP55HQ6tWzZsmtZKwAAcJMJKgi98cYb3ziWmpqq6urqKx6jZ8+eV7zjaejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyJvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6gg9Nprr6l///6y2+2y2+1yOBz661//ao6fO3dO+fn56tKlizp06KDc3FzV19cHHKOurk45OTmKi4tTYmKipk2bpvPnzwfUbN68WYMGDZLNZlPv3r1VXl7eai6lpaXq1auXYmNjlZmZqR07dgSMt2UuAADA2oIKQt27d9fLL7+smpoa7dy5U8OHD9cjjzyi/fv3S5KmTp2q9957T2vXrlV1dbWOHDmiRx991Ny/qalJOTk5amxs1NatW7Vy5UqVl5erpKTErDl8+LBycnI0bNgw1dbWqrCwUM8//7w2btxo1qxevVpFRUWaOXOmdu3apQEDBsjpdOrYsWNmzZXmAgAAEFQQevjhh/XQQw/pjjvu0J133qlf/OIX6tChg7Zt26aTJ0/qjTfe0MKFCzV8+HBlZGRoxYoV2rp1q7Zt2yZJqqys1IEDB/Tmm29q4MCBGj16tObOnavS0lI1NjZKksrKypSWlqYFCxaob9++Kigo0GOPPaZFixaZ81i4cKEmTpyo8ePHKz09XWVlZYqLi9Py5cslqU1zAQAAuOVqd2xqatLatWvV0NAgh8Ohmpoa+f1+ZWVlmTV9+vRRjx495Ha7NWTIELndbvXr109JSUlmjdPp1JQpU7R//37de++9crvdAcdoqSksLJQkNTY2qqamRsXFxeZ4ZGSksrKy5Ha7JalNc7kUn88nn89nPvd6vZIkv98vv99/lSt1aS3Hs0Ua17Q/rl7LGrKW4UMPwo8ehB89CL1g1jLoILR37145HA6dO3dOHTp00DvvvKP09HTV1tYqJiZGCQkJAfVJSUnyeDySJI/HExCCWsZbxi5X
4/V6dfbsWR0/flxNTU2XrDl48KB5jCvN5VLmzZun2bNnt9peWVmpuLi4b9zvWswd3HxV+23YsCHEM7Eul8sV7ilYHj0IP3oQfvQgdM6cOdPm2qCD0F133aXa2lqdPHlSf/zjH5WXl6fq6upgD/OtVFxcrKKiIvO51+tVamqqsrOzZbfbQ/pafr9fLpdLL+2MlK85Iuj9981yhnQ+VtTSg5EjRyo6Ojrc07EkehB+9CD86EHotXyi0xZBB6GYmBj17t1bkpSRkaGPPvpIS5Ys0dixY9XY2KgTJ04EnImpr69XcnKyJCk5ObnV3V0td3JdWHPx3V319fWy2+1q166doqKiFBUVdcmaC49xpblcis1mk81ma7U9Ojr6un1x+poj5GsKPgjxzRI617O/aBt6EH70IPzoQegEs47X/HuEmpub5fP5lJGRoejoaFVVVZljhw4dUl1dnRwOhyTJ4XBo7969AXd3uVwu2e12paenmzUXHqOlpuUYMTExysjICKhpbm5WVVWVWdOWuQAAAAR1Rqi4uFijR49Wjx49dOrUKa1atUqbN2/Wxo0bFR8frwkTJqioqEidO3eW3W7XCy+8IIfDYV6cnJ2drfT0dD399NOaP3++PB6PZsyYofz8fPNMzOTJk7V06VJNnz5dzz33nDZt2qQ1a9Zo/fr15jyKioqUl5enwYMH67777tPixYvV0NCg8ePHS1Kb5gIAABBUEDp27JieeeYZHT16VPHx8erfv782btyokSNHSpIWLVqkyMhI5ebmyufzyel0atmyZeb+UVFRWrdunaZMmSKHw6H27dsrLy9Pc+bMMWvS0tK0fv16TZ06VUuWLFH37t31+uuvy+n8v2tixo4dqy+//FIlJSXyeDwaOHCgKioqAi6gvtJcAAAAIgzDuLr7ty3A6/UqPj5eJ0+evC4XS2/YsEHTd0Rd1TVCn7+cE9L5WFFLDx566CE+lw8TehB+9CD86EHoBfPvN39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQWjevHn6/ve/r44dOyoxMVFjxozRoUOHAmqGDh2qiIiIgMfkyZMDaurq6pSTk6O4uDglJiZq2rRpOn/+fEDN5s2bNWjQINlsNvXu3Vvl5eWt5lNaWqpevXopNjZWmZmZ2rFjR8D4uXPnlJ+fry5duqhDhw7Kzc1VfX19MG8ZAADcxIIKQtXV1crPz9e2bdvkcrnk9/uVnZ2thoaGgLqJEyfq6NGj5mP+/PnmWFNTk3JyctTY2KitW7dq5cqVKi8vV0lJiVlz+PBh5eTkaNiwYaqtrVVhYaGef/55bdy40axZvXq1ioqKNHPmTO3atUsDBgyQ0+nUsWPHzJqpU6fqvffe09q1a1VdXa0jR47o0UcfDXqRAADAzemWYIorKioCnpeXlysxMVE1NTV68MEHze1xcXFKTk6+5DEqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP9XJkyf1xhtvaNWqVRo+fLgkacWKFerbt6+2bdumIUOGBPPWAQDATSioIHSxkydPSpI6d+4csP2tt97Sm2++qeTkZD388MN66aWXFBcXJ0lyu93q16+fkpKSzHqn06kpU6Zo//79uvfee+V2u5WVlRVwTKfTqcLCQklSY2OjampqVFxcbI5HRkYqKytLbrdbklRTUyO/3x9wnD59+qhHjx5yu92XDEI+
n08+n8987vV6JUl+v19+vz/o9bmcluPZIo1r2h9Xr2UNWcvwoQfhRw/Cjx6EXjBredVBqLm5WYWFhfrBD36ge+65x9z+1FNPqWfPnkpJSdGePXv04osv6tChQ/rTn/4kSfJ4PAEhSJL53OPxXLbG6/Xq7NmzOn78uJqami5Zc/DgQfMYMTExSkhIaFXT8joXmzdvnmbPnt1qe2VlpRnkQm3u4Oar2m/Dhg0hnol1uVyucE/B8uhB+NGD8KMHoXPmzJk21151EMrPz9e+ffv04YcfBmyfNGmS+d/9+vVTt27dNGLECH322We6/fbbr/blboji4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r37ZWszMzMlSZ9++qluv/12JScnt7q7q+VOrpbripKTk1vd3VVfXy+73a527dopKipKUVFRl6y58BiNjY06ceJEwFmhC2suZrPZZLPZWm2Pjo6+bl+cvuYI+ZqCD0J8s4TO9ewv2oYehB89CD96EDrBrGNQd40ZhqGCggK988472rRpk9LS0q64T21trSSpW7dukiSHw6G9e/cG3N3lcrlkt9uVnp5u1lRVVQUcx+VyyeFwSJJiYmKUkZERUNPc3KyqqiqzJiMjQ9HR0QE1hw4dUl1dnVkDAACsLagzQvn5+Vq1apX+/Oc/q2PHjua1NvHx8WrXrp0+++wzrVq1Sg899JC6dOmiPXv2aOrUqXrwwQfVv39/SVJ2drbS09P19NNPa/78+fJ4PJoxY4by8/PNszGTJ0/W0qVLNX36dD333HPatGmT1qxZo/Xr15tzKSoqUl5engYPHqz77rtPixcvVkNDg3kXWXx8vCZMmKCioiJ17txZdrtdL7zwghwOB3eMAQAASUEGoddee03Sf35p4oVWrFihZ599VjExMXr//ffNUJKamqrc3FzNmDHDrI2KitK6des0ZcoUORwOtW/fXnl5eZozZ45Zk5aWpvXr12vq1KlasmSJunfvrtdff928dV6Sxo4dqy+//FIlJSXyeDwaOHCgKioqAi6gXrRokSIjI5Wbmyufzyen06lly5YFtUAAAODmFVQQMozL3+qdmpqq6urqKx6nZ8+eV7zraejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyHvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6ggNG/ePH3/+99Xx44dlZiYqDFjxujQoUMBNefOnVN+fr66dOmiDh06KDc3V/X19QE1dXV1ysnJUVxcnBITEzVt2jSdP38+oGbz5s0aNGiQbDabevfurfLy8lbzKS0tVa9evRQbG6vMzEzt2LEj6LkAAADrCioIVVdXKz8/X9u2bZPL5ZLf71d2drYaGhrMmqlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7slmOKKioqA5+Xl5UpMTFRNTY0efPBBnTx5Um+88YZWrVql4cOHS5JWrFihvn37atu2bRoyZIgqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P6
9eu1fPly/fSnP23TXAAAgLUFFYQudvLkSUlS586dJUk1NTXy+/3Kysoya/r06aMePXrI7XZryJAhcrvd6tevn5KSkswap9OpKVOmaP/+/br33nvldrsDjtFSU1hYKElqbGxUTU2NiouLzfHIyEhlZWXJ7Xa3eS4X8/l88vl85nOv1ytJ8vv98vv9V7VG36TleLZI45r2x9VrWUPWMnzoQfjRg/CjB6EXzFpedRBqbm5WYWGhfvCDH+iee+6RJHk8HsXExCghISGgNikpSR6Px6y5MAS1jLeMXa7G6/Xq7NmzOn78uJqami5Zc/DgwTbP5WLz5s3T7NmzW22vrKxUXFzcNy3FNZk7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtVcdhPLz87Vv3z59+OGHV3uIb53i4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r27uT05OVmNjY06ceJEwJmY+vp6JScnmzUX393VcifXhTUX391VX18vu92udu3aKSoqSlFRUZesufAYV5rLxWw2m2w2W6vt0dHR1+2L09ccIV9T8EGIb5bQuZ79RdvQg/CjB+FHD0InmHUM6q4xwzBUUFCgd955R5s2bVJaWlrAeEZGhqKjo1VVVWVuO3TokOrq6uRwOCRJDodDe/fuDbi7y+VyyW63Kz093ay58BgtNS3HiImJUUZGRkBNc3OzqqqqzJq2zAUAAFhbUGeE8vPztWrVKv35z39Wx44dzWtt4uPj1a5dO8XHx2vChAkqKipS586dZbfb9cILL8jhcJgXJ2dnZys9PV1PP/205s+fL4/HoxkzZig/P988GzN58mQtXbpU06dP13PPPadNmzZpzZo1Wr9+vTmXoqIi5eXlafDgwbrvvvu0ePFiNTQ0mHeRtWUuAADA2oIKQq+99pokaejQoQHbV6xYoWeffVaStGjRIkVGRio3N1c+n09Op1PLli0za6OiorRu3TpNmTJFDodD7du3V15enubMmWPWpKWlaf369Zo6daqWLFmi7t276/XXXzdvnZeksWPH6ssvv1RJSYk8Ho8GDhyoioqKgAuorzQXAABgbUEFIcO48q3esbGxKi0tVWlp6TfW9OzZ84p3PQ0dOlS7d+++bE1BQYEKCgquaS4AAMC6+FtjAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsoIOQlu2bNHDDz+slJQURURE6N133w0Yf/bZZxURERHwGDVqVEDN119/rXHjxslutyshIUETJkzQ6dOnA2r27NmjBx54QLGxsUpNTdX8+fNbzWXt2rXq06ePYmNj1a9fP23YsCFg3DAMlZSUqFu3bmrXrp2ysrL0ySefBPuWAQDATSroINTQ0KABAwaotLT0G2tGjRqlo0ePmo8//OEPAePjxo3T/v375XK5tG7dOm3ZskWTJk0yx71er7Kzs9WzZ0/V1NTolVde0axZs/Tb3/7WrNm6dauefPJJTZgwQbt379aYMWM0ZswY7du3z6yZP3++Xn31VZWVlWn79u1q3769nE6nzp07F+zbBgAAN6Fbgt1h9OjRGj169GVrbDabkpOTLzn28ccfq6KiQh999JEGDx4sSfr1r3+thx56SL/61a+UkpKit956S42NjVq+fLliYmJ09913q7a2VgsXLjQD05IlSzRq1ChNmzZNkjR37ly5XC4tXbpUZWVlMgxD
ixcv1owZM/TII49Ikn7/+98rKSlJ7777rp544olg3zoAALjJBB2E2mLz5s1KTExUp06dNHz4cP385z9Xly5dJElut1sJCQlmCJKkrKwsRUZGavv27frhD38ot9utBx98UDExMWaN0+nUL3/5Sx0/flydOnWS2+1WUVFRwOs6nU7zo7rDhw/L4/EoKyvLHI+Pj1dmZqbcbvclg5DP55PP5zOfe71eSZLf75ff77/2hblAy/FskcY17Y+r17KGrGX40IPwowfhRw9CL5i1DHkQGjVqlB599FGlpaXps88+089+9jONHj1abrdbUVFR8ng8SkxMDJzELbeoc+fO8ng8kiSPx6O0tLSAmqSkJHOsU6dO8ng85rYLay48xoX7XarmYvPmzdPs2bNbba+srFRcXFxblyAocwc3X9V+F18PhavncrnCPQXLowfhRw/Cjx6EzpkzZ9pcG/IgdOGZln79+ql///66/fbbtXnzZo0YMSLULxdSxcXFAWeZvF6vUlNTlZ2dLbvdHtLX8vv9crlcemlnpHzNEUHvv2+WM6TzsaKWHowcOVLR0dHhno4l0YPwowfhRw9Cr+UTnba4Lh+NXei2225T165d9emnn2rEiBFKTk7WsWPHAmrOnz+vr7/+2ryuKDk5WfX19QE1Lc+vVHPheMu2bt26BdQMHDjwknO12Wyy2WyttkdHR1+3L05fc4R8TcEHIb5ZQud69hdtQw/Cjx6EHz0InWDW8br/HqF//etf+uqrr8ww4nA4dOLECdXU1Jg1mzZtUnNzszIzM82aLVu2BHzG53K5dNddd6lTp05mTVVVVcBruVwuORwOSVJaWpqSk5MDarxer7Zv327WAAAAaws6CJ0+fVq1tbWqra2V9J+Lkmtra1VXV6fTp09r2rRp2rZtmz7//HNVVVXpkUceUe/eveV0/uejnL59+2rUqFGaOHGiduzYob///e8qKCjQE088oZSUFEnSU089pZiYGE2YMEH79+/X6tWrtWTJkoCPrX70ox+poqJCCxYs0MGDBzVr1izt3LlTBQUFkqSIiAgVFhbq5z//uf7yl79o7969euaZZ5SSkqIxY8Zc47IBAICbQdAfje3cuVPDhg0zn7eEk7y8PL322mvas2ePVq5cqRMnTiglJUXZ2dmaO3duwEdOb731lgoKCjRixAhFRkYqNzdXr776qjkeHx+vyspK5efnKyMjQ127dlVJSUnA7xr6r//6L61atUozZszQz372M91xxx169913dc8995g106dPV0NDgyZNmqQTJ07o/vvvV0VFhWJjY4N92wAA4CYUdBAaOnSoDOObb/neuHHjFY/RuXNnrVq16rI1/fv319/+9rfL1jz++ON6/PHHv3E8IiJCc+bM0Zw5c644JwAAYD38rTEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZQQehLVu26OGHH1ZKSooiIiL07rvvBowbhqGSkhJ169ZN7dq1U1ZWlj755JOAmq+//lrjxo2T3W5XQkKCJkyYoNOnTwfU7NmzRw888IBiY2OVmpqq+fPnt5rL2rVr1adPH8XGxqpfv37asGFD0HMBAADWFXQQamho0IABA1RaWnrJ8fnz5+vVV19VWVmZtm/frvbt28vpdOrcuXNmzbhx47R//365XC6tW7dOW7Zs0aRJk8xxr9er7Oxs9ezZUzU1NXrllVc0a9Ys/fa3vzVrtm7dqieffFITJkzQ7t27NWbMGI0ZM0b79u0Lai4AAMC6bgl2h9GjR2v06NGXHDMMQ4sXL9aMGTP0yCOPSJJ+
//vfKykpSe+++66eeOIJffzxx6qoqNBHH32kwYMHS5J+/etf66GHHtKvfvUrpaSk6K233lJjY6OWL1+umJgY3X333aqtrdXChQvNwLRkyRKNGjVK06ZNkyTNnTtXLpdLS5cuVVlZWZvmAgAArC2k1wgdPnxYHo9HWVlZ5rb4+HhlZmbK7XZLktxutxISEswQJElZWVmKjIzU9u3bzZoHH3xQMTExZo3T6dShQ4d0/Phxs+bC12mpaXmdtswFAABYW9BnhC7H4/FIkpKSkgK2JyUlmWMej0eJiYmBk7jlFnXu3DmgJi0trdUxWsY6deokj8dzxde50lwu5vP55PP5zOder1eS5Pf75ff7L/fWg9ZyPFukcU374+q1rCFrGT70IPzoQfjRg9ALZi1DGoS+6+bNm6fZs2e32l5ZWam4uLjr8ppzBzdf1X4XXxiOq+dyucI9BcujB+FHD8KPHoTOmTNn2lwb0iCUnJwsSaqvr1e3bt3M7fX19Ro4cKBZc+zYsYD9zp8/r6+//trcPzk5WfX19QE1Lc+vVHPh+JXmcrHi4mIVFRWZz71er1JTU5WdnS273X7lBQiC3++Xy+XSSzsj5WuOCHr/fbOcIZ2PFbX0YOTIkYqOjg73dCyJHoQfPQg/ehB6LZ/otEVIg1BaWpqSk5NVVVVlhg2v16vt27drypQpkiSHw6ETJ06opqZGGRkZkqRNmzapublZmZmZZs3/+3//T36/3/yicLlcuuuuu9SpUyezpqqqSoWFhebru1wuORyONs/lYjabTTabrdX26Ojo6/bF6WuOkK8p+CDEN0voXM/+om3oQfjRg/CjB6ETzDoGfbH06dOnVVtbq9raWkn/uSi5trZWdXV1ioiIUGFhoX7+85/rL3/5i/bu3atnnnlGKSkpGjNmjCSpb9++GjVqlCZOnKgdO3bo73//uwoKCvTEE08oJSVFkvTUU08pJiZGEyZM0P79+7V69WotWbIk4GzNj370I1VUVGjBggU6ePCgZs2apZ07d6qgoECS2jQXAABgbUGfEdq5c6eGDRtmPm8JJ3l5eSovL9f06dPV0NCgSZMm6cSJE7r//vtVUVGh2NhYc5+33npLBQUFGjFihCIjI5Wbm6tXX33VHI+Pj1dlZaXy8/OVkZGhrl27qqSkJOB3Df3Xf/2XVq1apRkzZuhnP/uZ7rjjDr377ru65557zJq2zAUAAFhX0EFo6NChMoxvvtMpIiJCc+bM0Zw5c76xpnPnzlq1atVlX6d///7629/+dtmaxx9/XI8//vg1zQUAAFgXf2sMAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYVtB/awzfDr1+uv6q9/385ZwQzgQAgO8uzggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLCnkQmjVrliIiIgIeffr0McfPnTun/Px8denSRR06dFBubq7q6+sDjlFXV6ecnBzFxcUpMTFR06ZN0/nz5wNqNm/erEGDBslms6l3794qLy9vNZfS0lL16tVLsbGxyszM1I4dO0L9dgEAwHfYdTkjdPfdd+vo0aPm48MPPzTHpk6dqvfee09r165VdXW1jhw5okcffdQcb2pqUk5OjhobG7V161atXLlS5eXlKikpMWsOHz6snJwcDRs2TLW1tSosLNTzzz+vjRs3mjWrV69WUVGRZs6cqV27dmnAgAFyOp06duzY9XjLAADgO+i6BKFbbrlFycnJ5qNr166SpJMnT+qNN97QwoULNXz4cGVkZGjFihXaunWrtm3bJkmqrKzUgQMH9Oabb2rgwIEaPXq05s6dq9LSUjU2NkqSysrKlJaWpgUL
Fqhv374qKCjQY489pkWLFplzWLhwoSZOnKjx48crPT1dZWVliouL0/Lly6/HWwYAAN9Bt1yPg37yySdKSUlRbGysHA6H5s2bpx49eqimpkZ+v19ZWVlmbZ8+fdSjRw+53W4NGTJEbrdb/fr1U1JSklnjdDo1ZcoU7d+/X/fee6/cbnfAMVpqCgsLJUmNjY2qqalRcXGxOR4ZGamsrCy53e5vnLfP55PP5zOfe71eSZLf75ff77+mNblYy/FskUZIjxvMa1tdyzqwHuFDD8KPHoQfPQi9YNYy5EEoMzNT5eXluuuuu3T06FHNnj1bDzzwgPbt2yePx6OYmBglJCQE7JOUlCSPxyNJ8ng8ASGoZbxl7HI1Xq9XZ8+e1fHjx9XU1HTJmoMHD37j3OfNm6fZs2e32l5ZWam4uLi2LUCQ5g5uvi7HvZwNGzbc8Nf8NnO5XOGeguXRg/CjB+FHD0LnzJkzba4NeRAaPXq0+d/9+/dXZmamevbsqTVr1qhdu3ahfrmQKi4uVlFRkfnc6/UqNTVV2dnZstvtIX0tv98vl8ull3ZGytccEdJjX8m+Wc4b+nrfVi09GDlypKKjo8M9HUuiB+FHD8KPHoReyyc6bXFdPhq7UEJCgu688059+umnGjlypBobG3XixImAs0L19fVKTk6WJCUnJ7e6u6vlrrILay6+06y+vl52u13t2rVTVFSUoqKiLlnTcoxLsdlsstlsrbZHR0dfty9OX3OEfE03NgjxjRboevYXbUMPwo8ehB89CJ1g1vG6/x6h06dP67PPPlO3bt2UkZGh6OhoVVVVmeOHDh1SXV2dHA6HJMnhcGjv3r0Bd3e5XC7Z7Xalp6ebNRceo6Wm5RgxMTHKyMgIqGlublZVVZVZAwAAEPIg9JOf/ETV1dX6/PPPtXXrVv3whz9UVFSUnnzyScXHx2vChAkqKirSBx98oJqaGo0fP14Oh0NDhgyRJGVnZys9PV1PP/20/vGPf2jjxo2aMWOG8vPzzbM1kydP1j//+U9Nnz5dBw8e1LJly7RmzRpNnTrVnEdRUZF+97vfaeXKlfr44481ZcoUNTQ0aPz48aF+ywAA4Dsq5B+N/etf/9KTTz6pr776Srfeeqvuv/9+bdu2TbfeeqskadGiRYqMjFRubq58Pp+cTqeWLVtm7h8VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv+79mXs2LH68ssvVVJSIo/Ho4EDB6qioqLVBdQAAMC6Qh6E3n777cuOx8bGqrS0VKWlpd9Y07Nnzyve2TR06FDt3r37sjUFBQUqKCi4bA0AALAu/tYYAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwrFvCPQHceL1+uv6q9/385ZwQzgQAgPDijBAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsSwSh0tJS9erVS7GxscrMzNSOHTvCPSUAAPAtcEu4J3C9rV69WkVFRSorK1NmZqYWL14sp9OpQ4cOKTExMdzT+87p9dP1V73v5y/nhHAmAABcu5v+jNDChQs1ceJEjR8/Xunp6SorK1NcXJyWL18e7qkBAIAwu6nPCDU2NqqmpkbFxcXmtsjISGVlZcntdreq9/l88vl85vOTJ09Kkr7++mv5/f6Qzs3v9+vMmTO6xR+ppuaIkB7726r3T9aE5XW3F4+45PaWHnz11VeKjo6+wbOCRA++DehB+NGD0Dt16pQkyTCMK9be1EHo3//+t5qampSUlBSwPSkpSQcPHmxVP2/ePM2ePbvV9rS0tOs2R1x/XReEewYAgHA4deqU
4uPjL1tzUwehYBUXF6uoqMh83tzcrK+//lpdunRRRERoz9p4vV6lpqbqiy++kN1uD+mx0Tb0IPzoQfjRg/CjB6FnGIZOnTqllJSUK9be1EGoa9euioqKUn19fcD2+vp6JScnt6q32Wyy2WwB2xISEq7nFGW32/nCDzN6EH70IPzoQfjRg9C60pmgFjf1xdIxMTHKyMhQVVWVua25uVlVVVVyOBxhnBkAAPg2uKnPCElSUVGR8vLyNHjwYN13331avHixGhoaNH78+HBPDQAAhNlNH4TGjh2rL7/8UiUlJfJ4PBo4cKAqKipaXUB9o9lsNs2cObPVR3G4cehB+NGD8KMH4UcPwivCaMu9ZQAAADehm/oaIQAAgMshCAEAAMsiCAEAAMsiCAEAAMsiCIVBaWmpevXqpdjYWGVmZmrHjh3hntJNY9asWYqIiAh49OnTxxw/d+6c8vPz1aVLF3Xo0EG5ubmtfuFmXV2dcnJyFBcXp8TERE2bNk3nz5+/0W/lO2PLli16+OGHlZKSooiICL377rsB44ZhqKSkRN26dVO7du2UlZWlTz75JKDm66+/1rhx42S325WQkKAJEybo9OnTATV79uzRAw88oNjYWKWmpmr+/PnX+619Z1ypB88++2yr74tRo0YF1NCDazNv3jx9//vfV8eOHZWYmKgxY8bo0KFDATWh+vmzefNmDRo0SDabTb1791Z5efn1fns3NYLQDbZ69WoVFRVp5syZ2rVrlwYMGCCn06ljx46Fe2o3jbvvvltHjx41Hx9++KE5NnXqVL333ntau3atqqurdeTIET366KPmeFNTk3JyctTY2KitW7dq5cqVKi8vV0lJSTjeyndCQ0ODBgwYoNLS0kuOz58/X6+++qrKysq0fft2tW/fXk6nU+fOnTNrxo0bp/3798vlcmndunXasmWLJk2aZI57vV5lZ2erZ8+eqqmp0SuvvKJZs2bpt7/97XV/f98FV+qBJI0aNSrg++IPf/hDwDg9uDbV1dXKz8/Xtm3b5HK55Pf7lZ2drYaGBrMmFD9/Dh8+rJycHA0bNky1tbUqLCzU888/r40bN97Q93tTMXBD3XfffUZ+fr75vKmpyUhJSTHmzZsXxlndPGbOnGkMGDDgkmMnTpwwoqOjjbVr15rbPv74Y0OS4Xa7DcMwjA0bNhiRkZGGx+Mxa1577TXDbrcbPp/vus79ZiDJeOedd8znzc3NRnJysvHKK6+Y206cOGHYbDbjD3/4g2EYhnHgwAFDkvHRRx+ZNX/961+NiIgI43//938NwzCMZcuWGZ06dQrowYsvvmjcdddd1/kdffdc3APDMIy8vDzjkUce+cZ96EHoHTt2zJBkVFdXG4YRup8/06dPN+6+++6A1xo7dqzhdDqv91u6aXFG6AZqbGxUTU2NsrKyzG2RkZHKysqS2+0O48xuLp988olSUlJ02223ady4caqrq5Mk1dTUyO/3B6x/nz591KNHD3P93W63+vXrF/ALN51Op7xer/bv339j38hN4PDhw/J4PAFrHh8fr8zMzIA1T0hI0ODBg82arKwsRUZGavv27WbNgw8+qJiYGLPG6XTq0KFDOn78+A16N99tmzdvVmJiou666y5NmTJFX331lTlGD0Lv5MmTkqTOnTtLCt3PH7fbHXCMlhr+Dbl6BKEb6N///reamppa/VbrpKQkeTyeMM3q5pKZmany8nJVVFTotdde0+HDh/XAAw/o1KlT8ng8iomJafWHdC9cf4/Hc8n+tIwhOC1rdrmveY/Ho8TExIDxW265RZ07d6YvITJq1Cj9/ve/V1VVlX75y1+qurpao0ePVlNTkyR6EGrNzc0qLCzUD37wA91zzz2SFLKfP99U4/V6dfbs2evxdm56N/2f2IC1jB492vzv/v37KzMzUz179tSaNWvUrl27MM4MCJ8nnnjC/O9+/fqpf//+uv3227V582aNGDEijDO7OeXn52vfvn0B1yfi24szQjdQ165d
FRUV1eougfr6eiUnJ4dpVje3hIQE3Xnnnfr000+VnJysxsZGnThxIqDmwvVPTk6+ZH9axhCcljW73Nd8cnJyq5sFzp8/r6+//pq+XCe33Xabunbtqk8//VQSPQilgoICrVu3Th988IG6d+9ubg/Vz59vqrHb7fyfvatEELqBYmJilJGRoaqqKnNbc3Ozqqqq5HA4wjizm9fp06f12WefqVu3bsrIyFB0dHTA+h86dEh1dXXm+jscDu3duzfgHwWXyyW73a709PQbPv/vurS0NCUnJwesudfr1fbt2wPW/MSJE6qpqTFrNm3apObmZmVmZpo1W7Zskd/vN2tcLpfuuusuderU6Qa9m5vHv/71L3311Vfq1q2bJHoQCoZhqKCgQO+88442bdqktLS0gPFQ/fxxOBwBx2ip4d+QaxDuq7Wt5u233zZsNptRXl5uHDhwwJg0aZKRkJAQcJcArt6Pf/xjY/Pmzcbhw4eNv//970ZWVpbRtWtX49ixY4ZhGMbkyZONHj16GJs2bTJ27txpOBwOw+FwmPufP3/euOeee4zs7GyjtrbWqKioMG699VajuLg4XG/pW+/UqVPG7t27jd27dxuSjIULFxq7d+82/ud//scwDMN4+eWXjYSEBOPPf/6zsWfPHuORRx4x0tLSjLNnz5rHGDVqlHHvvfca27dvNz788EPjjjvuMJ588klz/MSJE0ZSUpLx9NNPG/v27TPefvttIy4uzvjNb35zw9/vt9HlenDq1CnjJz/5ieF2u43Dhw8b77//vjFo0CDjjjvuMM6dO2cegx5cmylTphjx8fHG5s2bjaNHj5qPM2fOmDWh+Pnzz3/+04iLizOmTZtmfPzxx0ZpaakRFRVlVFRU3ND3ezMhCIXBr3/9a6NHjx5GTEyMcd999xnbtm0L95RuGmPHjjW6detmxMTEGN/73veMsWPHGp9++qk5fvbsWeO///u/jU6dOhlxcXHGD3/4Q+Po0aMBx/j888+N0aNHG+3atTO6du1q/PjHPzb8fv+NfivfGR988IEhqdUjLy/PMIz/3EL/0ksvGUlJSYbNZjNGjBhhHDp0KOAYX331lfHkk08aHTp0MOx2uzF+/Hjj1KlTATX/+Mc/jPvvv9+w2WzG9773PePll1++UW/xW+9yPThz5oyRnZ1t3HrrrUZ0dLTRs2dPY+LEia3+zxc9uDaXWn9JxooVK8yaUP38+eCDD4yBAwcaMTExxm233RbwGghehGEYxo0+CwUAAPBtwDVCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsv4/2I1qxbjTQ/MAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"markdown","source":"### Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T09:28:12.439750Z","iopub.execute_input":"2023-05-12T09:28:12.440025Z","iopub.status.idle":"2023-05-12T09:28:13.294348Z","shell.execute_reply.started":"2023-05-12T09:28:12.440001Z","shell.execute_reply":"2023-05-12T09:28:13.293070Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, 
max_len=Config.MAX_LEN):\n return tokenizer(text_data.comment_text.values.tolist(), \n max_length=max_len, \n truncation=True, \n padding=\"max_length\",\n add_special_tokens=True,\n return_tensors=\"tf\",\n return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:13.295593Z","iopub.execute_input":"2023-05-12T09:28:13.295906Z","iopub.status.idle":"2023-05-12T09:28:13.303204Z","shell.execute_reply.started":"2023-05-12T09:28:13.295879Z","shell.execute_reply":"2023-05-12T09:28:13.302313Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:28:13.306093Z","iopub.execute_input":"2023-05-12T09:28:13.306418Z","iopub.status.idle":"2023-05-12T09:29:10.312197Z","shell.execute_reply.started":"2023-05-12T09:28:13.306387Z","shell.execute_reply":"2023-05-12T09:29:10.310639Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), train[\"toxic\"]))\n .repeat()\n .shuffle(Config.BUFFER_SIZE)\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:29:10.313685Z","iopub.execute_input":"2023-05-12T09:29:10.314019Z","iopub.status.idle":"2023-05-12T09:29:10.349843Z","shell.execute_reply.started":"2023-05-12T09:29:10.313990Z","shell.execute_reply":"2023-05-12T09:29:10.348686Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformer, max_len=Config.MAX_LEN):\n input_ids = 
Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n roberta_layers = transformer.roberta(input_ids, attention_mask=masks)[0]\n \n \"\"\"intermediate = Dense(1024, activation='relu')(roberta_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\"\"\"\n \n out = GlobalMaxPool1D()(roberta_layers)\n out = Dense(1024, activation=\"relu\")(out)\n out = Dropout(0.1)(out)\n out = Dense(512, activation=\"relu\")(out)\n output = Dense(1, activation=\"sigmoid\")(out)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:29:10.351128Z","iopub.execute_input":"2023-05-12T09:29:10.351428Z","iopub.status.idle":"2023-05-12T09:29:10.361056Z","shell.execute_reply.started":"2023-05-12T09:29:10.351402Z","shell.execute_reply":"2023-05-12T09:29:10.360061Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformer = TFAutoModel.from_pretrained(Config.MODEL)\n model = model_builder(transformer=transformer)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:29:10.362177Z","iopub.execute_input":"2023-05-12T09:29:10.362456Z","iopub.status.idle":"2023-05-12T09:29:58.466635Z","shell.execute_reply.started":"2023-05-12T09:29:10.362432Z","shell.execute_reply":"2023-05-12T09:29:58.465465Z"},"trusted":true},"execution_count":17,"outputs":[{"name":"stderr","text":"All model checkpoint layers were used when initializing TFXLMRobertaModel.\n\nAll the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-large.\nIf your task is similar to the task the model of the 
checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:29:58.467803Z","iopub.execute_input":"2023-05-12T09:29:58.468177Z","iopub.status.idle":"2023-05-12T09:29:58.525775Z","shell.execute_reply.started":"2023-05-12T09:29:58.468144Z","shell.execute_reply":"2023-05-12T09:29:58.524828Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n roberta (TFXLMRobertaMainLayer TFBaseModelOutputWi 559890432 ['input_ids[0][0]', \n ) thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 1024), \n pooler_output=(Non \n e, 1024), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n global_max_pooling1d (GlobalMa (None, 1024) 0 ['roberta[0][0]'] \n xPooling1D) \n \n dense (Dense) (None, 1024) 1049600 ['global_max_pooling1d[0][0]'] \n \n dropout_73 (Dropout) (None, 1024) 0 ['dense[0][0]'] \n \n dense_1 (Dense) (None, 512) 524800 ['dropout_73[0][0]'] \n \n dense_2 (Dense) (None, 1) 513 ['dense_1[0][0]'] \n \n==================================================================================================\nTotal params: 561,465,345\nTrainable params: 561,465,345\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = 
train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:29:58.526787Z","iopub.execute_input":"2023-05-12T09:29:58.527133Z","iopub.status.idle":"2023-05-12T10:24:30.583417Z","shell.execute_reply.started":"2023-05-12T09:29:58.527108Z","shell.execute_reply":"2023-05-12T10:24:30.582123Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Epoch 1/2\nWARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stderr","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stdout","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stderr","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. 
If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stdout","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stderr","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stdout","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stderr","text":"WARNING:tensorflow:Gradients do not exist for variables ['tfxlm_roberta_model/roberta/pooler/dense/kernel:0', 'tfxlm_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss. 
If you're using `model.compile()`, did you forget to provide a `loss` argument?\n2023-05-12 09:31:20.841711: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n2023-05-12 09:31:22.985898: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0563 - auc: 0.9965","output_type":"stream"},{"name":"stderr","text":"2023-05-12 09:58:25.682020: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 09:58:26.185606: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 1735s 374ms/step - loss: 0.0563 - auc: 0.9965 - val_loss: 0.3167 - val_auc: 0.9004\nEpoch 2/2\n4185/4185 [==============================] - 1535s 367ms/step - loss: 0.0431 - auc: 0.9978 - val_loss: 0.3931 - val_auc: 0.8456\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n 
epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:28:32.516881Z","iopub.execute_input":"2023-05-12T10:28:32.517792Z","iopub.status.idle":"2023-05-12T10:30:43.894813Z","shell.execute_reply.started":"2023-05-12T10:28:32.517756Z","shell.execute_reply":"2023-05-12T10:30:43.893586Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 22s 362ms/step - loss: 0.2941 - auc: 0.8735\nEpoch 2/2\n62/62 [==============================] - 108s 363ms/step - loss: 0.1840 - auc: 0.9565\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:37:19.540199Z","iopub.execute_input":"2023-05-12T10:37:19.541140Z","iopub.status.idle":"2023-05-12T10:37:19.689045Z","shell.execute_reply.started":"2023-05-12T10:37:19.541101Z","shell.execute_reply":"2023-05-12T10:37:19.687579Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-3-hiddenstates\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:44:30.231587Z","iopub.execute_input":"2023-05-12T10:44:30.231921Z"},"trusted":true},"execution_count":null,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). 
These functions will not be directly callable after loading.\n","output_type":"stream"}]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"notebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"m.summary()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"preds = m.predict(test_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
experiment_notebooks/Transformer-Roberta-Pooler-state.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install evaluate seaborn datasets transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:10:38.588160Z","iopub.execute_input":"2023-05-12T08:10:38.588801Z","iopub.status.idle":"2023-05-12T08:11:05.038848Z","shell.execute_reply.started":"2023-05-12T08:10:38.588769Z","shell.execute_reply":"2023-05-12T08:11:05.037913Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:11:23.828906Z","iopub.execute_input":"2023-05-12T08:11:23.829355Z","iopub.status.idle":"2023-05-12T08:12:04.965072Z","shell.execute_reply.started":"2023-05-12T08:11:23.829319Z","shell.execute_reply":"2023-05-12T08:12:04.964003Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 08:11:57.679357494 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 08:11:57.679397945 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 08:11:57.679401929 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 08:11:57.679404612 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 08:11:57.679406720 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 08:11:57.679409002 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 08:11:57.679411626 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 08:11:57.679414267 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 08:11:57.679416463 14 
config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 08:11:57.679418575 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 08:11:57.679420670 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 08:11:57.679422786 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 08:11:57.679424925 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 08:11:57.679427123 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 08:11:57.679611260 14 ev_epoll1_linux.cc:122] grpc epoll fd: 62\nD0512 08:11:57.685069810 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 08:11:57.685091110 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 08:11:57.685503222 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 08:11:57.685513372 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 08:11:57.685516328 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 08:11:57.685518925 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 08:11:57.685521601 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 08:11:57.685524245 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 08:11:57.685530262 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 08:11:57.685544918 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 08:11:57.685567780 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 08:11:57.685580119 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 08:11:57.685583175 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 08:11:57.685586100 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 08:11:57.685591323 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 08:11:57.685594369 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 08:11:57.685597356 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 08:11:57.685601004 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 08:11:57.687522778 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 08:11:57.713894001 315 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 08:11:57.732068033 315 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {grpc_status:2, created_time:\"2023-05-12T08:11:57.73203804+00:00\"}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:04.967007Z","iopub.execute_input":"2023-05-12T08:12:04.967631Z","iopub.status.idle":"2023-05-12T08:12:15.185956Z","shell.execute_reply.started":"2023-05-12T08:12:04.967600Z","shell.execute_reply":"2023-05-12T08:12:15.184914Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 3 #2\n MODEL = \"xlm-roberta-large\"\n BUFFER_SIZE = 2048\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 1e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.187164Z","iopub.execute_input":"2023-05-12T08:12:15.187478Z","iopub.status.idle":"2023-05-12T08:12:15.193012Z","shell.execute_reply.started":"2023-05-12T08:12:15.187450Z","shell.execute_reply":"2023-05-12T08:12:15.192060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = 
pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.195496Z","iopub.execute_input":"2023-05-12T08:12:15.195907Z","iopub.status.idle":"2023-05-12T08:12:42.439698Z","shell.execute_reply.started":"2023-05-12T08:12:15.195884Z","shell.execute_reply":"2023-05-12T08:12:42.438494Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 
0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 59848 This is so cool. It's like, 'would you want yo... 0.000000 \\\n1 59849 Thank you!! This would make my life a lot less... 0.000000 \n2 59852 This is such an urgent design problem; kudos t... 0.000000 \n3 59855 Is this something I'll be able to install on m... 0.000000 \n4 59856 haha you guys are a bunch of losers. 0.893617 \n\n severe_toxicity obscene identity_attack insult threat asian atheist \n0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \\\n1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n\n ... article_id rating funny wow sad likes disagree \n0 ... 2006 rejected 0 0 0 0 0 \\\n1 ... 2006 rejected 0 0 0 0 0 \n2 ... 2006 rejected 0 0 0 0 0 \n3 ... 2006 rejected 0 0 0 0 0 \n4 ... 
2006 rejected 0 0 0 1 0 \n\n sexual_explicit identity_annotator_count toxicity_annotator_count \n0 0.0 0 4 \n1 0.0 0 4 \n2 0.0 0 4 \n3 0.0 0 4 \n4 0.0 4 47 \n\n[5 rows x 45 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxicity</th>\n <th>obscene</th>\n <th>identity_attack</th>\n <th>insult</th>\n <th>threat</th>\n <th>asian</th>\n <th>atheist</th>\n <th>...</th>\n <th>article_id</th>\n <th>rating</th>\n <th>funny</th>\n <th>wow</th>\n <th>sad</th>\n <th>likes</th>\n <th>disagree</th>\n <th>sexual_explicit</th>\n <th>identity_annotator_count</th>\n <th>toxicity_annotator_count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>59848</td>\n <td>This is so cool. It's like, 'would you want yo...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>59849</td>\n <td>Thank you!! 
This would make my life a lot less...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>59852</td>\n <td>This is such an urgent design problem; kudos t...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>59855</td>\n <td>Is this something I'll be able to install on m...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>59856</td>\n <td>haha you guys are a bunch of losers.</td>\n <td>0.893617</td>\n <td>0.021277</td>\n <td>0.0</td>\n <td>0.021277</td>\n <td>0.87234</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0.0</td>\n <td>4</td>\n <td>47</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 45 
columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text lang toxic\n0 0 Este usuario ni siquiera llega al rango de ... es 0\n1 1 Il testo di questa voce pare esser scopiazzato... it 0\n2 2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... es 1\n3 3 Bu maddenin alt başlığı olarak uluslararası i... tr 0\n4 4 Belçika nın şehirlerinin yanında ilçe ve belde... tr 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>es</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>it</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>es</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id content lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>content</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0 202165\n1 21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000 1333035\n0.166667 138501\n0.200000 113271\n0.300000 62195\n0.400000 52703\n ... 
\n0.037609 1\n0.971193 1\n0.988430 1\n0.008309 1\n0.967316 1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0 6770\n1 1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr 3000\nes 2500\nit 2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr 14000\npt 11012\nru 10948\nfr 10920\nit 8494\nes 8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, 
inplace=True)\nsub = test[['id']]\ntrain2.toxic = (train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.613596Z","iopub.execute_input":"2023-05-12T08:12:42.613863Z","iopub.status.idle":"2023-05-12T08:12:42.689129Z","shell.execute_reply.started":"2023-05-12T08:12:42.613841Z","shell.execute_reply":"2023-05-12T08:12:42.687961Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.690307Z","iopub.execute_input":"2023-05-12T08:12:42.690632Z","iopub.status.idle":"2023-05-12T08:12:42.714449Z","shell.execute_reply.started":"2023-05-12T08:12:42.690603Z","shell.execute_reply":"2023-05-12T08:12:42.712954Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"toxic\n0 1789968\n1 112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.717790Z","iopub.execute_input":"2023-05-12T08:12:42.719209Z","iopub.status.idle":"2023-05-12T08:12:43.083471Z","shell.execute_reply.started":"2023-05-12T08:12:42.719178Z","shell.execute_reply":"2023-05-12T08:12:43.082364Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.084697Z","iopub.execute_input":"2023-05-12T08:12:43.084992Z","iopub.status.idle":"2023-05-12T08:12:43.091149Z","shell.execute_reply.started":"2023-05-12T08:12:43.084966Z","shell.execute_reply":"2023-05-12T08:12:43.090173Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"(535775, 
2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.092288Z","iopub.execute_input":"2023-05-12T08:12:43.092591Z","iopub.status.idle":"2023-05-12T08:12:43.108428Z","shell.execute_reply.started":"2023-05-12T08:12:43.092565Z","shell.execute_reply":"2023-05-12T08:12:43.107388Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Explanation\\nWhy the edits made under my usern... 0\n1 D'aww! He matches this background colour I'm s... 0\n2 Hey man, I'm really not trying to edit war. It... 0\n3 \"\\nMore\\nI can't make any real suggestions on ... 0\n4 You, sir, are my hero. Any chance you remember... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.109560Z","iopub.execute_input":"2023-05-12T08:12:43.109882Z","iopub.status.idle":"2023-05-12T08:12:43.124398Z","shell.execute_reply.started":"2023-05-12T08:12:43.109856Z","shell.execute_reply":"2023-05-12T08:12:43.123602Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Este usuario ni siquiera llega al rango de ... 0\n1 Il testo di questa voce pare esser scopiazzato... 0\n2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... 1\n3 Bu maddenin alt başlığı olarak uluslararası i... 0\n4 Belçika nın şehirlerinin yanında ilçe ve belde... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.125574Z","iopub.execute_input":"2023-05-12T08:12:43.125871Z","iopub.status.idle":"2023-05-12T08:12:43.140740Z","shell.execute_reply.started":"2023-05-12T08:12:43.125845Z","shell.execute_reply":"2023-05-12T08:12:43.139834Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":" id comment_text lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.141938Z","iopub.execute_input":"2023-05-12T08:12:43.142286Z","iopub.status.idle":"2023-05-12T08:12:43.152137Z","shell.execute_reply.started":"2023-05-12T08:12:43.142257Z","shell.execute_reply":"2023-05-12T08:12:43.151267Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.153204Z","iopub.execute_input":"2023-05-12T08:12:43.153504Z","iopub.status.idle":"2023-05-12T08:12:44.735211Z","shell.execute_reply.started":"2023-05-12T08:12:43.153479Z","shell.execute_reply":"2023-05-12T08:12:44.734019Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = 
AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:44.736464Z","iopub.execute_input":"2023-05-12T08:12:44.736759Z","iopub.status.idle":"2023-05-12T08:12:46.680516Z","shell.execute_reply.started":"2023-05-12T08:12:44.736733Z","shell.execute_reply":"2023-05-12T08:12:46.679299Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stderr","text":"Downloading (…)lve/main/config.json: 100%|██████████| 616/616 [00:00<00:00, 133kB/s]\nDownloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 61.2MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 38.3MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n return tokenizer(text_data.comment_text.values.tolist(), \n max_length=max_len, \n truncation=True, \n padding=\"max_length\",\n add_special_tokens=True,\n return_tensors=\"tf\",\n return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.681935Z","iopub.execute_input":"2023-05-12T08:12:46.682277Z","iopub.status.idle":"2023-05-12T08:12:46.688026Z","shell.execute_reply.started":"2023-05-12T08:12:46.682252Z","shell.execute_reply":"2023-05-12T08:12:46.687060Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.689142Z","iopub.execute_input":"2023-05-12T08:12:46.689525Z","iopub.status.idle":"2023-05-12T08:13:40.477757Z","shell.execute_reply.started":"2023-05-12T08:12:46.689501Z","shell.execute_reply":"2023-05-12T08:13:40.476350Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), 
train[\"toxic\"]))\n .repeat()\n .shuffle(Config.BUFFER_SIZE)\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:13:40.479182Z","iopub.execute_input":"2023-05-12T08:13:40.479716Z","iopub.status.idle":"2023-05-12T08:13:40.514773Z","shell.execute_reply.started":"2023-05-12T08:13:40.479687Z","shell.execute_reply":"2023-05-12T08:13:40.513645Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformers_layers, max_len=Config.MAX_LEN):\n input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n roberta_layers = transformers_layers.roberta(input_ids, attention_mask=masks)[1]\n intermediate = Dense(1024, activation='relu')(roberta_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:18.861686Z","iopub.execute_input":"2023-05-12T08:19:18.862636Z","iopub.status.idle":"2023-05-12T08:19:18.872779Z","shell.execute_reply.started":"2023-05-12T08:19:18.862595Z","shell.execute_reply":"2023-05-12T08:19:18.871516Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformers_layers = TFAutoModel.from_pretrained(Config.MODEL)\n model = 
model_builder(transformers_layers=transformers_layers)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:23.540792Z","iopub.execute_input":"2023-05-12T08:19:23.541710Z","iopub.status.idle":"2023-05-12T08:19:58.819514Z","shell.execute_reply.started":"2023-05-12T08:19:23.541670Z","shell.execute_reply":"2023-05-12T08:19:58.818311Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stderr","text":"All model checkpoint layers were used when initializing TFXLMRobertaModel.\n\nAll the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-large.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:58.821255Z","iopub.execute_input":"2023-05-12T08:19:58.821564Z","iopub.status.idle":"2023-05-12T08:19:58.877105Z","shell.execute_reply.started":"2023-05-12T08:19:58.821537Z","shell.execute_reply":"2023-05-12T08:19:58.876009Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stdout","text":"Model: \"model_2\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n roberta (TFXLMRobertaMainLayer TFBaseModelOutputWi 559890432 ['input_ids[0][0]', \n ) thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 1024), \n pooler_output=(Non \n e, 1024), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n dense_4 (Dense) (None, 1024) 1049600 ['roberta[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 
['dense_4[0][0]'] \n \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:20:30.201166Z","iopub.execute_input":"2023-05-12T08:20:30.201570Z","iopub.status.idle":"2023-05-12T09:40:52.828332Z","shell.execute_reply.started":"2023-05-12T08:20:30.201539Z","shell.execute_reply":"2023-05-12T09:40:52.826896Z"},"trusted":true},"execution_count":39,"outputs":[{"name":"stdout","text":"Epoch 1/3\n","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:21:52.144761: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n2023-05-12 08:21:54.569388: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0501 - auc_2: 0.9972","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:49:07.665397: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 08:49:08.172000: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 
[==============================] - 1747s 375ms/step - loss: 0.0501 - auc_2: 0.9972 - val_loss: 0.3338 - val_auc_2: 0.9137\nEpoch 2/3\n4185/4185 [==============================] - 1538s 367ms/step - loss: 0.0420 - auc_2: 0.9981 - val_loss: 0.2931 - val_auc_2: 0.9114\nEpoch 3/3\n4185/4185 [==============================] - 1537s 367ms/step - loss: 0.0369 - auc_2: 0.9985 - val_loss: 0.3070 - val_auc_2: 0.9039\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:43:06.760317Z","iopub.execute_input":"2023-05-12T09:43:06.760739Z","iopub.status.idle":"2023-05-12T09:43:52.591536Z","shell.execute_reply.started":"2023-05-12T09:43:06.760702Z","shell.execute_reply":"2023-05-12T09:43:52.590324Z"},"trusted":true},"execution_count":41,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0899 - auc_2: 0.9893\nEpoch 2/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0800 - auc_2: 0.9916\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:47:56.071510Z","iopub.execute_input":"2023-05-12T09:47:56.072708Z","iopub.status.idle":"2023-05-12T09:49:15.802261Z","shell.execute_reply.started":"2023-05-12T09:47:56.072664Z","shell.execute_reply":"2023-05-12T09:49:15.800711Z"},"trusted":true},"execution_count":42,"outputs":[{"name":"stderr","text":"2023-05-12 09:48:05.583905: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node 
AssignAddVariableOp.\n2023-05-12 09:48:05.992232: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 79s 118ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:49:24.580208Z","iopub.execute_input":"2023-05-12T09:49:24.580625Z","iopub.status.idle":"2023-05-12T09:50:44.681561Z","shell.execute_reply.started":"2023-05-12T09:49:24.580595Z","shell.execute_reply":"2023-05-12T09:50:44.680112Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). 
These functions will not be directly callable after loading.\n","output_type":"stream"},{"name":"stdout","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"},{"name":"stderr","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"}]},{"cell_type":"code","source":"import shutil\nshutil.make_archive(\"roberta-fine-tuned-2\",\"zip\",'/kaggle/working/roberta-fine-tuned-2')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:53:15.505782Z","iopub.execute_input":"2023-05-12T09:53:15.506262Z","iopub.status.idle":"2023-05-12T10:00:10.288432Z","shell.execute_reply.started":"2023-05-12T09:53:15.506226Z","shell.execute_reply":"2023-05-12T10:00:10.287215Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/roberta-fine-tuned-2.zip'"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2-best\", save_format='h5')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:06:24.426264Z","iopub.execute_input":"2023-05-12T10:06:24.426727Z","iopub.status.idle":"2023-05-12T10:06:40.506795Z","shell.execute_reply.started":"2023-05-12T10:06:24.426692Z","shell.execute_reply":"2023-05-12T10:06:40.505341Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"model = tf.keras.models.load_model('/kaggle/working/roberta-fine-tuned-2-best')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:07:36.737706Z","iopub.execute_input":"2023-05-12T10:07:36.738837Z","iopub.status.idle":"2023-05-12T10:07:59.902966Z","shell.execute_reply.started":"2023-05-12T10:07:36.738795Z","shell.execute_reply":"2023-05-12T10:07:59.901400Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"\"\"\"%%capture\n!pip install 
git+https://github.com/huggingface/huggingface_hub.git@main\n!sudo apt -qq install git-lfs\n!git config --global credential.helper store\"\"\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!huggingface-cli login --token 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","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:12:13.025974Z","iopub.execute_input":"2023-05-12T10:12:13.026917Z","iopub.status.idle":"2023-05-12T10:12:15.351277Z","shell.execute_reply.started":"2023-05-12T10:12:13.026877Z","shell.execute_reply":"2023-05-12T10:12:15.349659Z"},"trusted":true},"execution_count":55,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nToken will not been saved to git credential helper. 
Pass `add_to_git_credential=True` if you want to set the git credential as well.\nToken is valid.\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta-best')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import HfApi\napi = HfApi()\napi.upload_folder(\n folder_path=\"/kaggle/working/\",\n repo_id=\"shivansh-ka/Toxic-Comment-Classifier-Multi\",\n repo_type=\"space\",\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T06:59:23.928089Z","iopub.execute_input":"2023-05-12T06:59:23.928495Z","iopub.status.idle":"2023-05-12T06:59:56.375479Z","shell.execute_reply.started":"2023-05-12T06:59:23.928466Z","shell.execute_reply":"2023-05-12T06:59:56.374295Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\nconfig.json not found in HuggingFace Hub.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"84f3f3229b3e42668708162e27df3168"}},"metadata":{}}]},{"cell_type":"code","source":"preds = 
m.predict(test_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:06:50.246933Z","iopub.execute_input":"2023-05-12T07:06:50.247789Z","iopub.status.idle":"2023-05-12T07:29:11.940923Z","shell.execute_reply.started":"2023-05-12T07:06:50.247752Z","shell.execute_reply":"2023-05-12T07:29:11.939745Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"499/499 [==============================] - 1341s 3s/step\n","output_type":"stream"}]},{"cell_type":"code","source":"m.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:31:58.337639Z","iopub.execute_input":"2023-05-12T07:31:58.338344Z","iopub.status.idle":"2023-05-12T07:31:58.425154Z","shell.execute_reply.started":"2023-05-12T07:31:58.338300Z","shell.execute_reply":"2023-05-12T07:31:58.424117Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n roberta (Custom>TFXLMRobertaMa {'pooler_output': ( 559890432 ['input_ids[0][0]', \n inLayer) None, 1024), 'attention_mask[0][0]'] \n 'last_hidden_state \n ': (None, 192, 1024 \n )} \n \n dense (Dense) (None, 1024) 1049600 ['roberta[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 ['dense[0][0]'] \n \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"sub['toxic'] = 
preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:32:36.768119Z","iopub.execute_input":"2023-05-12T07:32:36.768542Z","iopub.status.idle":"2023-05-12T07:32:36.963761Z","shell.execute_reply.started":"2023-05-12T07:32:36.768512Z","shell.execute_reply":"2023-05-12T07:32:36.962584Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
experiment_notebooks/Transformer-mBert-Hidden-state.ipynb.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:47:53.058331Z","iopub.execute_input":"2023-05-12T11:47:53.058582Z","iopub.status.idle":"2023-05-12T11:48:11.528155Z","shell.execute_reply.started":"2023-05-12T11:47:53.058558Z","shell.execute_reply":"2023-05-12T11:48:11.527021Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D, Dropout\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel\n\nfrom huggingface_hub import notebook_login, push_to_hub_keras, from_pretrained_keras","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:49:51.308904Z","iopub.execute_input":"2023-05-12T11:49:51.309754Z","iopub.status.idle":"2023-05-12T11:50:50.942522Z","shell.execute_reply.started":"2023-05-12T11:49:51.309710Z","shell.execute_reply":"2023-05-12T11:50:50.941305Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 11:50:24.269871341 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 11:50:24.269909000 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 11:50:24.269913299 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 11:50:24.269916201 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 11:50:24.269918879 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 11:50:24.269921601 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 11:50:24.269925176 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 
11:50:24.269927811 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 11:50:24.269930386 14 config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 11:50:24.269932936 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 11:50:24.269935469 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 11:50:24.269938067 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 11:50:24.269940970 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 11:50:24.269952451 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 11:50:24.270176104 14 ev_epoll1_linux.cc:122] grpc epoll fd: 66\nD0512 11:50:24.270188925 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 11:50:24.270209220 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 11:50:24.270651874 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 11:50:24.270665013 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 11:50:24.270669110 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 11:50:24.270672580 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 11:50:24.270675869 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 11:50:24.270679086 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 11:50:24.270685803 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 11:50:24.270702631 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 11:50:24.270729400 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 11:50:24.270744660 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 11:50:24.270749029 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 11:50:24.270752476 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 11:50:24.270758870 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 11:50:24.270762669 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 11:50:24.270766303 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 11:50:24.270771070 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 11:50:24.273060617 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 11:50:24.288500755 14 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 11:50:24.296032187 14 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {created_time:\"2023-05-12T11:50:24.29601562+00:00\", grpc_status:2}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:50.944280Z","iopub.execute_input":"2023-05-12T11:50:50.944798Z","iopub.status.idle":"2023-05-12T11:51:00.695080Z","shell.execute_reply.started":"2023-05-12T11:50:50.944771Z","shell.execute_reply":"2023-05-12T11:51:00.693913Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 4\n MODEL = \"bert-base-multilingual-uncased\"\n BUFFER_SIZE = 1000\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 2e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:00.696282Z","iopub.execute_input":"2023-05-12T11:51:00.696560Z","iopub.status.idle":"2023-05-12T11:51:00.702270Z","shell.execute_reply.started":"2023-05-12T11:51:00.696536Z","shell.execute_reply":"2023-05-12T11:51:00.701120Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:00.704483Z","iopub.execute_input":"2023-05-12T11:51:00.704815Z","iopub.status.idle":"2023-05-12T11:51:27.504621Z","shell.execute_reply.started":"2023-05-12T11:51:00.704792Z","shell.execute_reply":"2023-05-12T11:51:27.503561Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, inplace=True)\nsub = test[['id']]\ntrain2.toxic = 
(train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:27.505989Z","iopub.execute_input":"2023-05-12T11:51:27.506326Z","iopub.status.idle":"2023-05-12T11:51:27.643971Z","shell.execute_reply.started":"2023-05-12T11:51:27.506301Z","shell.execute_reply":"2023-05-12T11:51:27.643077Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:27.645230Z","iopub.execute_input":"2023-05-12T11:51:27.645543Z","iopub.status.idle":"2023-05-12T11:51:27.973540Z","shell.execute_reply.started":"2023-05-12T11:51:27.645516Z","shell.execute_reply":"2023-05-12T11:51:27.972585Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:27.974748Z","iopub.execute_input":"2023-05-12T11:51:27.975079Z","iopub.status.idle":"2023-05-12T11:51:27.983728Z","shell.execute_reply.started":"2023-05-12T11:51:27.975052Z","shell.execute_reply":"2023-05-12T11:51:27.982961Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"(535775, 2)"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:27.984792Z","iopub.execute_input":"2023-05-12T11:51:27.985120Z","iopub.status.idle":"2023-05-12T11:51:27.995577Z","shell.execute_reply.started":"2023-05-12T11:51:27.985070Z","shell.execute_reply":"2023-05-12T11:51:27.994720Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = 
val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:27.996790Z","iopub.execute_input":"2023-05-12T11:51:27.997135Z","iopub.status.idle":"2023-05-12T11:51:29.490559Z","shell.execute_reply.started":"2023-05-12T11:51:27.997088Z","shell.execute_reply":"2023-05-12T11:51:29.489600Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:29.493564Z","iopub.execute_input":"2023-05-12T11:51:29.493937Z","iopub.status.idle":"2023-05-12T11:51:32.280356Z","shell.execute_reply.started":"2023-05-12T11:51:29.493911Z","shell.execute_reply":"2023-05-12T11:51:32.279438Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stdout","text":"56.28243572395129\n2321\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 
Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGdCAYAAAD+JxxnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+BklEQVR4nO3de3BU9f3/8VcSkw0BNuFiElICRFEhykVCDfutOlxCFsw4UqODymhEhIFv4jSkBZv+MNzawWK5WYJpqxA6SgU61VagIWuQUMsCEki5CaMWv7Ff2GAVWAiwWZLz+6OT82UJQhYWVjnPx8xO3fN5n7Of/byT8OrZc5IIwzAMAQAAWFBkuCcAAAAQLgQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWbeEewLfZs3NzTpy5Ig6duyoiIiIcE8HAAC0gWEYOnXqlFJSUhQZeflzPgShyzhy5IhSU1PDPQ0AAHAVvvjiC3Xv3v2yNQShy+jYsaOk/yyk3W4P6bH9fr8qKyuVnZ2t6OjokB4bbUMPwo8ehB89CD96EHper1epqanmv+OXQxC6jJaPw+x2+3UJQnFxcbLb7Xzhhwk9CD96EH70IPzowfXTlstauFgaAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABY1i3hnoDV3TNro3xNEUHv9/nLOddhNgAAWAtnhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGVdUxB6+eWXFRERocLCQnPbuXPnlJ+fry5duqhDhw7Kzc1VfX19wH51dXXKyclRXFycEhMTNW3aNJ0/fz6gZvPmzRo0aJBsNpt69+6t8vLyVq9fWlqqXr16KTY2VpmZmdqxY0fAeFvmAgAArOuqg9BHH32k3/zmN+rfv3/A9qlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7uqIHT69GmNGzdOv/vd79SpUydz+8mTJ/XGG29o4cKFGj58uDIyMrRixQpt3bpV27ZtkyRVVlbqwIEDevPNNzVw4ECNHj1ac+fOVWlpqRobGyVJZWVlSktL04IFC9S3b18VFBToscce06JFi8zXWrhwoSZOnKjx48crPT1dZWVliouL0/Lly9s8FwAAYG1XFYTy8/OVk5OjrKysgO01NTXy+/0B2/v06aMePXrI7XZLktxut/r166ekpCSzxul0yuv1av/+/WbNxcd2Op3mMRobG1VTUxNQExkZqaysLLOmLXMBAADWdkuwO7z99tvatWuXPvroo1ZjHo9HMTExSkhICNielJQkj8dj1lwYglrGW8YuV+P1enX27FkdP35cTU1Nl6w5ePBgm+dyMZ/PJ5/PZz73er2SJL/fL7/ff8l9rlbL8WyRxjXtj6vXsoasZfjQg/CjB+FHD0IvmLUMKgh98cUX+tGPfiSXy6XY2NigJ/ZtN2/ePM2ePbvV9srKSsXFxV2X15w7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtUEFoZqaGh07dkyDBg0ytzU1NWnLli1aunSpNm7cqMbGRp04cSLgTEx9fb2Sk5MlScnJya3u7mq5k+vCmovv7qqvr5fdble7du0UFRWlqKioS9ZceIwrzeVixcXFKioqMp97vV6lpqYqOztbdru9LUvUZn6/Xy6XSy/tjJSvOSLo/ffNcoZ0PlbU0oORI0cqOjo63NOxJHoQfvQg/OhB6LV8otMWQQWh
ESNGaO/evQHbxo8frz59+ujFF19UamqqoqOjVVVVpdzcXEnSoUOHVFdXJ4fDIUlyOBz6xS9+oWPHjikxMVHSf1Kw3W5Xenq6WXPxGQ+Xy2UeIyYmRhkZGaqqqtKYMWMkSc3NzaqqqlJBQYEkKSMj44pzuZjNZpPNZmu1PTo6+rp9cfqaI+RrCj4I8c0SOtezv2gbehB+9CD86EHoBLOOQQWhjh076p577gnY1r59e3Xp0sXcPmHCBBUVFalz586y2+164YUX5HA4NGTIEElSdna20tPT9fTTT2v+/PnyeDyaMWOG8vPzzRAyefJkLV26VNOnT9dzzz2nTZs2ac2aNVq/fr35ukVFRcrLy9PgwYN13333afHixWpoaND48eMlSfHx8VecCwAAsLagL5a+kkWLFikyMlK5ubny+XxyOp1atmyZOR4VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv/7OGjs2LH68ssvVVJSIo/Ho4EDB6qioiLgAuorzQUAAFjbNQehzZs3BzyPjY1VaWmpSktLv3Gfnj17XvFi36FDh2r37t2XrSkoKDA/CruUtswFAABYF39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQei1115T//79ZbfbZbfb5XA49Ne//tUcHzp0qCIiIgIekydPDjhGXV2dcnJyFBcXp8TERE2bNk3nz58PqNm8ebMGDRokm82m3r17q7y8vNVcSktL1atXL8XGxiozM1M7duwIGD937pzy8/PVpUsXdejQQbm5uaqvrw/m7QIAgJtcUEGoe/fuevnll1VTU6OdO3dq+PDheuSRR7R//36zZuLEiTp69Kj5mD9/vjnW1NSknJwcNTY2auvWrVq5cqXKy8tVUlJi1hw+fFg5OTkaNmyYamtrVVhYqOeff14bN240a1avXq2ioiLNnDlTu3bt0oABA+R0OnXs2DGzZurUqXrvvfe0du1aVVdX68iRI3r00UevapEAAMDNKagg9PDDD+uhhx7SHXfcoTvvvFO/+MUv1KFDB23bts2siYuLU3Jysvmw2+3mWGVlpQ4cOKA333xTAwcO1OjRozV37lyVlpaqsbFRklRWVqa0tDQtWLBAffv2VUFBgR577DEtWrTIPM7ChQs1ceJEjR8/Xunp6SorK1NcXJyWL18uSTp58qTeeOMNLVy4UMOHD1dGRoZWrFihrVu3BswVAABY2y1Xu2NTU5PWrl2rhoYGORwOc/tbb72lN998U8nJyXr44Yf10ksvKS4uTpLkdrvVr18/JSUlmfVOp1NTpkzR/v37de+998rtdisrKyvgtZxOpwoLCyVJjY2NqqmpUXFxsTkeGRmprKwsud1uSVJNTY38fn/Acfr06aMePXrI7XZryJAhl3xPPp9PPp/PfO71eiVJfr9ffr//apbpG7UczxZpXNP+uHota8hahg89CD96EH70IPSCWcugg9DevXvlcDh07tw5dejQQe+8847S09MlSU899ZR69uyplJQU7dmzRy+++KIOHTqkP/3pT5Ikj8cTEIIkmc89Hs9la7xer86ePavjx4+rqanpkjUHDx40jxETE6OEhIRWNS2vcynz5s3T7NmzW22vrKw0w1yozR3cfFX7bdiwIcQzsS6XyxXuKVgePQg/ehB+9CB0zpw50+baoIPQXXfdpdraWp08eVJ//OMflZeXp+rqaqWnp2vSpElmXb9+/dStWzeNGDFCn332mW6//fZgX+qGKy4uVlFRkfnc
6/UqNTVV2dnZAR/xhYLf75fL5dJLOyPla44Iev99s5whnY8VtfRg5MiRio6ODvd0LIkehB89CD96EHotn+i0RdBBKCYmRr1795YkZWRk6KOPPtKSJUv0m9/8plVtZmamJOnTTz/V7bffruTk5FZ3d7XcyZWcnGz+78V3d9XX18tut6tdu3aKiopSVFTUJWsuPEZjY6NOnDgRcFbowppLsdlsstlsrbZHR0dfty9OX3OEfE3BByG+WULnevYXbUMPwo8ehB89CJ1g1vGaf49Qc3NzwHU1F6qtrZUkdevWTZLkcDi0d+/egLu7XC6X7Ha7+fGaw+FQVVVVwHFcLpd5HVJMTIwyMjICapqbm1VVVWXWZGRkKDo6OqDm0KFDqqurC7ieCQAAWFtQZ4SKi4s1evRo9ejRQ6dOndKqVau0efNmbdy4UZ999plWrVqlhx56SF26dNGePXs0depUPfjgg+rfv78kKTs7W+np6Xr66ac1f/58eTwezZgxQ/n5+eaZmMmTJ2vp0qWaPn26nnvuOW3atElr1qzR+vXrzXkUFRUpLy9PgwcP1n333afFixeroaFB48ePlyTFx8drwoQJKioqUufOnWW32/XCCy/I4XB844XSAADAeoIKQseOHdMzzzyjo0ePKj4+Xv3799fGjRs1cuRIffHFF3r//ffNUJKamqrc3FzNmDHD3D8qKkrr1q3TlClT5HA41L59e+Xl5WnOnDlmTVpamtavX6+pU6dqyZIl6t69u15//XU5nf93TczYsWP15ZdfqqSkRB6PRwMHDlRFRUXABdSLFi1SZGSkcnNz5fP55HQ6tWzZsmtZKwAAcJMJKgi98cYb3ziWmpqq6urqKx6jZ8+eV7zjaejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyJvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6gg9Nprr6l///6y2+2y2+1yOBz661//ao6fO3dO+fn56tKlizp06KDc3FzV19cHHKOurk45OTmKi4tTYmKipk2bpvPnzwfUbN68WYMGDZLNZlPv3r1VXl7eai6lpaXq1auXYmNjlZmZqR07dgSMt2UuAADA2oIKQt27d9fLL7+smpoa7dy5U8OHD9cjjzyi/fv3S5KmTp2q9957T2vXrlV1dbWOHDmiRx991Ny/qalJOTk5amxs1NatW7Vy5UqVl5erpKTErDl8+LBycnI0bNgw1dbWqrCwUM8//7w2btxo1qxevVpFRUWaOXOmdu3apQEDBsjpdOrYsWNmzZXmAgAAEFQQevjhh/XQQw/pjjvu0J133qlf/OIX6tChg7Zt26aTJ0/qjTfe0MKFCzV8+HBlZGRoxYoV2rp1q7Zt2yZJqqys1IEDB/Tmm29q4MCBGj16tObOnavS0lI1NjZKksrKypSWlqYFCxaob9++Kigo0GOPPaZFixaZ81i4cKEmTpyo8ePHKz09XWVlZYqLi9Py5cslqU1zAQAAuOVqd2xqatLatWvV0NAgh8Ohmpoa+f1+ZWVlmTV9+vRRjx495Ha7NWTIELndbvXr109JSUlmjdPp1JQpU7R//37de++9crvdAcdoqSksLJQkNTY2qqamRsXFxeZ4ZGSksrKy5Ha7JalNc7kUn88nn89nPvd6vZIkv98vv99/lSt1aS3Hs0Ua17Q/rl7LGrKW4UMPwo8ehB89CL1g1jLoILR37145HA6dO3dOHTp00DvvvKP09HTV1tYqJiZGCQkJAfVJSUnyeDySJI/HExCCWsZbxi5X
4/V6dfbsWR0/flxNTU2XrDl48KB5jCvN5VLmzZun2bNnt9peWVmpuLi4b9zvWswd3HxV+23YsCHEM7Eul8sV7ilYHj0IP3oQfvQgdM6cOdPm2qCD0F133aXa2lqdPHlSf/zjH5WXl6fq6upgD/OtVFxcrKKiIvO51+tVamqqsrOzZbfbQ/pafr9fLpdLL+2MlK85Iuj9981yhnQ+VtTSg5EjRyo6Ojrc07EkehB+9CD86EHotXyi0xZBB6GYmBj17t1bkpSRkaGPPvpIS5Ys0dixY9XY2KgTJ04EnImpr69XcnKyJCk5ObnV3V0td3JdWHPx3V319fWy2+1q166doqKiFBUVdcmaC49xpblcis1mk81ma7U9Ojr6un1x+poj5GsKPgjxzRI617O/aBt6EH70IPzoQegEs47X/HuEmpub5fP5lJGRoejoaFVVVZljhw4dUl1dnRwOhyTJ4XBo7969AXd3uVwu2e12paenmzUXHqOlpuUYMTExysjICKhpbm5WVVWVWdOWuQAAAAR1Rqi4uFijR49Wjx49dOrUKa1atUqbN2/Wxo0bFR8frwkTJqioqEidO3eW3W7XCy+8IIfDYV6cnJ2drfT0dD399NOaP3++PB6PZsyYofz8fPNMzOTJk7V06VJNnz5dzz33nDZt2qQ1a9Zo/fr15jyKioqUl5enwYMH67777tPixYvV0NCg8ePHS1Kb5gIAABBUEDp27JieeeYZHT16VPHx8erfv782btyokSNHSpIWLVqkyMhI5ebmyufzyel0atmyZeb+UVFRWrdunaZMmSKHw6H27dsrLy9Pc+bMMWvS0tK0fv16TZ06VUuWLFH37t31+uuvy+n8v2tixo4dqy+//FIlJSXyeDwaOHCgKioqAi6gvtJcAAAAIgzDuLr7ty3A6/UqPj5eJ0+evC4XS2/YsEHTd0Rd1TVCn7+cE9L5WFFLDx566CE+lw8TehB+9CD86EHoBfPvN39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQWjevHn6/ve/r44dOyoxMVFjxozRoUOHAmqGDh2qiIiIgMfkyZMDaurq6pSTk6O4uDglJiZq2rRpOn/+fEDN5s2bNWjQINlsNvXu3Vvl5eWt5lNaWqpevXopNjZWmZmZ2rFjR8D4uXPnlJ+fry5duqhDhw7Kzc1VfX19MG8ZAADcxIIKQtXV1crPz9e2bdvkcrnk9/uVnZ2thoaGgLqJEyfq6NGj5mP+/PnmWFNTk3JyctTY2KitW7dq5cqVKi8vV0lJiVlz+PBh5eTkaNiwYaqtrVVhYaGef/55bdy40axZvXq1ioqKNHPmTO3atUsDBgyQ0+nUsWPHzJqpU6fqvffe09q1a1VdXa0jR47o0UcfDXqRAADAzemWYIorKioCnpeXlysxMVE1NTV68MEHze1xcXFKTk6+5DEqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP9XJkyf1xhtvaNWqVRo+fLgkacWKFerbt6+2bdumIUOGBPPWAQDATSioIHSxkydPSpI6d+4csP2tt97Sm2++qeTkZD388MN66aWXFBcXJ0lyu93q16+fkpKSzHqn06kpU6Zo//79uvfee+V2u5WVlRVwTKfTqcLCQklSY2OjampqVFxcbI5HRkYqKytLbrdbklRTUyO/3x9wnD59+qhHjx5yu92XDEI+
n08+n8987vV6JUl+v19+vz/o9bmcluPZIo1r2h9Xr2UNWcvwoQfhRw/Cjx6EXjBredVBqLm5WYWFhfrBD36ge+65x9z+1FNPqWfPnkpJSdGePXv04osv6tChQ/rTn/4kSfJ4PAEhSJL53OPxXLbG6/Xq7NmzOn78uJqami5Zc/DgQfMYMTExSkhIaFXT8joXmzdvnmbPnt1qe2VlpRnkQm3u4Oar2m/Dhg0hnol1uVyucE/B8uhB+NGD8KMHoXPmzJk21151EMrPz9e+ffv04YcfBmyfNGmS+d/9+vVTt27dNGLECH322We6/fbbr/blboji4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r37ZWszMzMlSZ9++qluv/12JScnt7q7q+VOrpbripKTk1vd3VVfXy+73a527dopKipKUVFRl6y58BiNjY06ceJEwFmhC2suZrPZZLPZWm2Pjo6+bl+cvuYI+ZqCD0J8s4TO9ewv2oYehB89CD96EDrBrGNQd40ZhqGCggK988472rRpk9LS0q64T21trSSpW7dukiSHw6G9e/cG3N3lcrlkt9uVnp5u1lRVVQUcx+VyyeFwSJJiYmKUkZERUNPc3KyqqiqzJiMjQ9HR0QE1hw4dUl1dnVkDAACsLagzQvn5+Vq1apX+/Oc/q2PHjua1NvHx8WrXrp0+++wzrVq1Sg899JC6dOmiPXv2aOrUqXrwwQfVv39/SVJ2drbS09P19NNPa/78+fJ4PJoxY4by8/PNszGTJ0/W0qVLNX36dD333HPatGmT1qxZo/Xr15tzKSoqUl5engYPHqz77rtPixcvVkNDg3kXWXx8vCZMmKCioiJ17txZdrtdL7zwghwOB3eMAQAASUEGoddee03Sf35p4oVWrFihZ599VjExMXr//ffNUJKamqrc3FzNmDHDrI2KitK6des0ZcoUORwOtW/fXnl5eZozZ45Zk5aWpvXr12vq1KlasmSJunfvrtdff928dV6Sxo4dqy+//FIlJSXyeDwaOHCgKioqAi6gXrRokSIjI5Wbmyufzyen06lly5YFtUAAAODmFVQQMozL3+qdmpqq6urqKx6nZ8+eV7zraejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyHvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6ggNG/ePH3/+99Xx44dlZiYqDFjxujQoUMBNefOnVN+fr66dOmiDh06KDc3V/X19QE1dXV1ysnJUVxcnBITEzVt2jSdP38+oGbz5s0aNGiQbDabevfurfLy8lbzKS0tVa9evRQbG6vMzEzt2LEj6LkAAADrCioIVVdXKz8/X9u2bZPL5ZLf71d2drYaGhrMmqlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7slmOKKioqA5+Xl5UpMTFRNTY0efPBBnTx5Um+88YZWrVql4cOHS5JWrFihvn37atu2bRoyZIgqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P6
9eu1fPly/fSnP23TXAAAgLUFFYQudvLkSUlS586dJUk1NTXy+/3Kysoya/r06aMePXrI7XZryJAhcrvd6tevn5KSkswap9OpKVOmaP/+/br33nvldrsDjtFSU1hYKElqbGxUTU2NiouLzfHIyEhlZWXJ7Xa3eS4X8/l88vl85nOv1ytJ8vv98vv9V7VG36TleLZI45r2x9VrWUPWMnzoQfjRg/CjB6EXzFpedRBqbm5WYWGhfvCDH+iee+6RJHk8HsXExCghISGgNikpSR6Px6y5MAS1jLeMXa7G6/Xq7NmzOn78uJqami5Zc/DgwTbP5WLz5s3T7NmzW22vrKxUXFzcNy3FNZk7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtVcdhPLz87Vv3z59+OGHV3uIb53i4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r27uT05OVmNjY06ceJEwJmY+vp6JScnmzUX393VcifXhTUX391VX18vu92udu3aKSoqSlFRUZesufAYV5rLxWw2m2w2W6vt0dHR1+2L09ccIV9T8EGIb5bQuZ79RdvQg/CjB+FHD0InmHUM6q4xwzBUUFCgd955R5s2bVJaWlrAeEZGhqKjo1VVVWVuO3TokOrq6uRwOCRJDodDe/fuDbi7y+VyyW63Kz093ay58BgtNS3HiImJUUZGRkBNc3OzqqqqzJq2zAUAAFhbUGeE8vPztWrVKv35z39Wx44dzWtt4uPj1a5dO8XHx2vChAkqKipS586dZbfb9cILL8jhcJgXJ2dnZys9PV1PP/205s+fL4/HoxkzZig/P988GzN58mQtXbpU06dP13PPPadNmzZpzZo1Wr9+vTmXoqIi5eXlafDgwbrvvvu0ePFiNTQ0mHeRtWUuAADA2oIKQq+99pokaejQoQHbV6xYoWeffVaStGjRIkVGRio3N1c+n09Op1PLli0za6OiorRu3TpNmTJFDodD7du3V15enubMmWPWpKWlaf369Zo6daqWLFmi7t276/XXXzdvnZeksWPH6ssvv1RJSYk8Ho8GDhyoioqKgAuorzQXAABgbUEFIcO48q3esbGxKi0tVWlp6TfW9OzZ84p3PQ0dOlS7d+++bE1BQYEKCgquaS4AAMC6+FtjAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsoIOQlu2bNHDDz+slJQURURE6N133w0Yf/bZZxURERHwGDVqVEDN119/rXHjxslutyshIUETJkzQ6dOnA2r27NmjBx54QLGxsUpNTdX8+fNbzWXt2rXq06ePYmNj1a9fP23YsCFg3DAMlZSUqFu3bmrXrp2ysrL0ySefBPuWAQDATSroINTQ0KABAwaotLT0G2tGjRqlo0ePmo8//OEPAePjxo3T/v375XK5tG7dOm3ZskWTJk0yx71er7Kzs9WzZ0/V1NTolVde0axZs/Tb3/7WrNm6dauefPJJTZgwQbt379aYMWM0ZswY7du3z6yZP3++Xn31VZWVlWn79u1q3769nE6nzp07F+zbBgAAN6Fbgt1h9OjRGj169GVrbDabkpOTLzn28ccfq6KiQh999JEGDx4sSfr1r3+thx56SL/61a+UkpKit956S42NjVq+fLliYmJ09913q7a2VgsXLjQD05IlSzRq1ChNmzZNkjR37ly5XC4tXbpUZWVlMgxD
ixcv1owZM/TII49Ikn7/+98rKSlJ7777rp544olg3zoAALjJBB2E2mLz5s1KTExUp06dNHz4cP385z9Xly5dJElut1sJCQlmCJKkrKwsRUZGavv27frhD38ot9utBx98UDExMWaN0+nUL3/5Sx0/flydOnWS2+1WUVFRwOs6nU7zo7rDhw/L4/EoKyvLHI+Pj1dmZqbcbvclg5DP55PP5zOfe71eSZLf75ff77/2hblAy/FskcY17Y+r17KGrGX40IPwowfhRw9CL5i1DHkQGjVqlB599FGlpaXps88+089+9jONHj1abrdbUVFR8ng8SkxMDJzELbeoc+fO8ng8kiSPx6O0tLSAmqSkJHOsU6dO8ng85rYLay48xoX7XarmYvPmzdPs2bNbba+srFRcXFxblyAocwc3X9V+F18PhavncrnCPQXLowfhRw/Cjx6EzpkzZ9pcG/IgdOGZln79+ql///66/fbbtXnzZo0YMSLULxdSxcXFAWeZvF6vUlNTlZ2dLbvdHtLX8vv9crlcemlnpHzNEUHvv2+WM6TzsaKWHowcOVLR0dHhno4l0YPwowfhRw9Cr+UTnba4Lh+NXei2225T165d9emnn2rEiBFKTk7WsWPHAmrOnz+vr7/+2ryuKDk5WfX19QE1Lc+vVHPheMu2bt26BdQMHDjwknO12Wyy2WyttkdHR1+3L05fc4R8TcEHIb5ZQud69hdtQw/Cjx6EHz0InWDW8br/HqF//etf+uqrr8ww4nA4dOLECdXU1Jg1mzZtUnNzszIzM82aLVu2BHzG53K5dNddd6lTp05mTVVVVcBruVwuORwOSVJaWpqSk5MDarxer7Zv327WAAAAaws6CJ0+fVq1tbWqra2V9J+Lkmtra1VXV6fTp09r2rRp2rZtmz7//HNVVVXpkUceUe/eveV0/uejnL59+2rUqFGaOHGiduzYob///e8qKCjQE088oZSUFEnSU089pZiYGE2YMEH79+/X6tWrtWTJkoCPrX70ox+poqJCCxYs0MGDBzVr1izt3LlTBQUFkqSIiAgVFhbq5z//uf7yl79o7969euaZZ5SSkqIxY8Zc47IBAICbQdAfje3cuVPDhg0zn7eEk7y8PL322mvas2ePVq5cqRMnTiglJUXZ2dmaO3duwEdOb731lgoKCjRixAhFRkYqNzdXr776qjkeHx+vyspK5efnKyMjQ127dlVJSUnA7xr6r//6L61atUozZszQz372M91xxx169913dc8995g106dPV0NDgyZNmqQTJ07o/vvvV0VFhWJjY4N92wAA4CYUdBAaOnSoDOObb/neuHHjFY/RuXNnrVq16rI1/fv319/+9rfL1jz++ON6/PHHv3E8IiJCc+bM0Zw5c644JwAAYD38rTEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZQQehLVu26OGHH1ZKSooiIiL07rvvBowbhqGSkhJ169ZN7dq1U1ZWlj755JOAmq+//lrjxo2T3W5XQkKCJkyYoNOnTwfU7NmzRw888IBiY2OVmpqq+fPnt5rL2rVr1adPH8XGxqpfv37asGFD0HMBAADWFXQQamho0IABA1RaWnrJ8fnz5+vVV19VWVmZtm/frvbt28vpdOrcuXNmzbhx47R//365XC6tW7dOW7Zs0aRJk8xxr9er7Oxs9ezZUzU1NXrllVc0a9Ys/fa3vzVrtm7dqieffFITJkzQ7t27NWbMGI0ZM0b79u0Lai4AAMC6bgl2h9GjR2v06NGXHDMMQ4sXL9aMGTP0yCOPSJJ+
//vfKykpSe+++66eeOIJffzxx6qoqNBHH32kwYMHS5J+/etf66GHHtKvfvUrpaSk6K233lJjY6OWL1+umJgY3X333aqtrdXChQvNwLRkyRKNGjVK06ZNkyTNnTtXLpdLS5cuVVlZWZvmAgAArC2k1wgdPnxYHo9HWVlZ5rb4+HhlZmbK7XZLktxutxISEswQJElZWVmKjIzU9u3bzZoHH3xQMTExZo3T6dShQ4d0/Phxs+bC12mpaXmdtswFAABYW9BnhC7H4/FIkpKSkgK2JyUlmWMej0eJiYmBk7jlFnXu3DmgJi0trdUxWsY6deokj8dzxde50lwu5vP55PP5zOder1eS5Pf75ff7L/fWg9ZyPFukcU374+q1rCFrGT70IPzoQfjRg9ALZi1DGoS+6+bNm6fZs2e32l5ZWam4uLjr8ppzBzdf1X4XXxiOq+dyucI9BcujB+FHD8KPHoTOmTNn2lwb0iCUnJwsSaqvr1e3bt3M7fX19Ro4cKBZc+zYsYD9zp8/r6+//trcPzk5WfX19QE1Lc+vVHPh+JXmcrHi4mIVFRWZz71er1JTU5WdnS273X7lBQiC3++Xy+XSSzsj5WuOCHr/fbOcIZ2PFbX0YOTIkYqOjg73dCyJHoQfPQg/ehB6LZ/otEVIg1BaWpqSk5NVVVVlhg2v16vt27drypQpkiSHw6ETJ06opqZGGRkZkqRNmzapublZmZmZZs3/+3//T36/3/yicLlcuuuuu9SpUyezpqqqSoWFhebru1wuORyONs/lYjabTTabrdX26Ojo6/bF6WuOkK8p+CDEN0voXM/+om3oQfjRg/CjB6ETzDoGfbH06dOnVVtbq9raWkn/uSi5trZWdXV1ioiIUGFhoX7+85/rL3/5i/bu3atnnnlGKSkpGjNmjCSpb9++GjVqlCZOnKgdO3bo73//uwoKCvTEE08oJSVFkvTUU08pJiZGEyZM0P79+7V69WotWbIk4GzNj370I1VUVGjBggU6ePCgZs2apZ07d6qgoECS2jQXAABgbUGfEdq5c6eGDRtmPm8JJ3l5eSovL9f06dPV0NCgSZMm6cSJE7r//vtVUVGh2NhYc5+33npLBQUFGjFihCIjI5Wbm6tXX33VHI+Pj1dlZaXy8/OVkZGhrl27qqSkJOB3Df3Xf/2XVq1apRkzZuhnP/uZ7rjjDr377ru65557zJq2zAUAAFhX0EFo6NChMoxvvtMpIiJCc+bM0Zw5c76xpnPnzlq1atVlX6d///7629/+dtmaxx9/XI8//vg1zQUAAFgXf2sMAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYVtB/awzfDr1+uv6q9/385ZwQzgQAgO8uzggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLCnkQmjVrliIiIgIeffr0McfPnTun/Px8denSRR06dFBubq7q6+sDjlFXV6ecnBzFxcUpMTFR06ZN0/nz5wNqNm/erEGDBslms6l3794qLy9vNZfS0lL16tVLsbGxyszM1I4dO0L9dgEAwHfYdTkjdPfdd+vo0aPm48MPPzTHpk6dqvfee09r165VdXW1jhw5okcffdQcb2pqUk5OjhobG7V161atXLlS5eXlKikpMWsOHz6snJwcDRs2TLW1tSosLNTzzz+vjRs3mjWrV69WUVGRZs6cqV27dmnAgAFyOp06duzY9XjLAADgO+i6BKFbbrlFycnJ5qNr166SpJMnT+qNN97QwoULNXz4cGVkZGjFihXaunWrtm3bJkmqrKzUgQMH9Oabb2rgwIEaPXq05s6dq9LSUjU2NkqSysrKlJaWpgUL
Fqhv374qKCjQY489pkWLFplzWLhwoSZOnKjx48crPT1dZWVliouL0/Lly6/HWwYAAN9Bt1yPg37yySdKSUlRbGysHA6H5s2bpx49eqimpkZ+v19ZWVlmbZ8+fdSjRw+53W4NGTJEbrdb/fr1U1JSklnjdDo1ZcoU7d+/X/fee6/cbnfAMVpqCgsLJUmNjY2qqalRcXGxOR4ZGamsrCy53e5vnLfP55PP5zOfe71eSZLf75ff77+mNblYy/FskUZIjxvMa1tdyzqwHuFDD8KPHoQfPQi9YNYy5EEoMzNT5eXluuuuu3T06FHNnj1bDzzwgPbt2yePx6OYmBglJCQE7JOUlCSPxyNJ8ng8ASGoZbxl7HI1Xq9XZ8+e1fHjx9XU1HTJmoMHD37j3OfNm6fZs2e32l5ZWam4uLi2LUCQ5g5uvi7HvZwNGzbc8Nf8NnO5XOGeguXRg/CjB+FHD0LnzJkzba4NeRAaPXq0+d/9+/dXZmamevbsqTVr1qhdu3ahfrmQKi4uVlFRkfnc6/UqNTVV2dnZstvtIX0tv98vl8ull3ZGytccEdJjX8m+Wc4b+nrfVi09GDlypKKjo8M9HUuiB+FHD8KPHoReyyc6bXFdPhq7UEJCgu688059+umnGjlypBobG3XixImAs0L19fVKTk6WJCUnJ7e6u6vlrrILay6+06y+vl52u13t2rVTVFSUoqKiLlnTcoxLsdlsstlsrbZHR0dfty9OX3OEfE03NgjxjRboevYXbUMPwo8ehB89CJ1g1vG6/x6h06dP67PPPlO3bt2UkZGh6OhoVVVVmeOHDh1SXV2dHA6HJMnhcGjv3r0Bd3e5XC7Z7Xalp6ebNRceo6Wm5RgxMTHKyMgIqGlublZVVZVZAwAAEPIg9JOf/ETV1dX6/PPPtXXrVv3whz9UVFSUnnzyScXHx2vChAkqKirSBx98oJqaGo0fP14Oh0NDhgyRJGVnZys9PV1PP/20/vGPf2jjxo2aMWOG8vPzzbM1kydP1j//+U9Nnz5dBw8e1LJly7RmzRpNnTrVnEdRUZF+97vfaeXKlfr44481ZcoUNTQ0aPz48aF+ywAA4Dsq5B+N/etf/9KTTz6pr776Srfeeqvuv/9+bdu2TbfeeqskadGiRYqMjFRubq58Pp+cTqeWLVtm7h8VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv+79mXs2LH68ssvVVJSIo/Ho4EDB6qioqLVBdQAAMC6Qh6E3n777cuOx8bGqrS0VKWlpd9Y07Nnzyve2TR06FDt3r37sjUFBQUqKCi4bA0AALAu/tYYAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwrFvCPQHceL1+uv6q9/385ZwQzgQAgPDijBAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsSwSh0tJS9erVS7GxscrMzNSOHTvCPSUAAPAtcEu4J3C9rV69WkVFRSorK1NmZqYWL14sp9OpQ4cOKTExMdzT+87p9dP1V73v5y/nhHAmAABcu5v+jNDChQs1ceJEjR8/Xunp6SorK1NcXJyWL18e7qkBAIAwu6nPCDU2NqqmpkbFxcXmtsjISGVlZcntdreq9/l88vl85vOTJ09Kkr7++mv5/f6Qzs3v9+vMmTO6xR+ppuaIkB7726r3T9aE5XW3F4+45PaWHnz11VeKjo6+wbOCRA++DehB+NGD0Dt16pQkyTCMK9be1EHo3//+t5qampSUlBSwPSkpSQcPHmxVP2/ePM2ePbvV9rS0tOs2R1x/XReEewYAgHA4deqU
4uPjL1tzUwehYBUXF6uoqMh83tzcrK+//lpdunRRRERoz9p4vV6lpqbqiy++kN1uD+mx0Tb0IPzoQfjRg/CjB6FnGIZOnTqllJSUK9be1EGoa9euioqKUn19fcD2+vp6JScnt6q32Wyy2WwB2xISEq7nFGW32/nCDzN6EH70IPzoQfjRg9C60pmgFjf1xdIxMTHKyMhQVVWVua25uVlVVVVyOBxhnBkAAPg2uKnPCElSUVGR8vLyNHjwYN13331avHixGhoaNH78+HBPDQAAhNlNH4TGjh2rL7/8UiUlJfJ4PBo4cKAqKipaXUB9o9lsNs2cObPVR3G4cehB+NGD8KMH4UcPwivCaMu9ZQAAADehm/oaIQAAgMshCAEAAMsiCAEAAMsiCAEAAMsiCIVBaWmpevXqpdjYWGVmZmrHjh3hntJNY9asWYqIiAh49OnTxxw/d+6c8vPz1aVLF3Xo0EG5ubmtfuFmXV2dcnJyFBcXp8TERE2bNk3nz5+/0W/lO2PLli16+OGHlZKSooiICL377rsB44ZhqKSkRN26dVO7du2UlZWlTz75JKDm66+/1rhx42S325WQkKAJEybo9OnTATV79uzRAw88oNjYWKWmpmr+/PnX+619Z1ypB88++2yr74tRo0YF1NCDazNv3jx9//vfV8eOHZWYmKgxY8bo0KFDATWh+vmzefNmDRo0SDabTb1791Z5efn1fns3NYLQDbZ69WoVFRVp5syZ2rVrlwYMGCCn06ljx46Fe2o3jbvvvltHjx41Hx9++KE5NnXqVL333ntau3atqqurdeTIET366KPmeFNTk3JyctTY2KitW7dq5cqVKi8vV0lJSTjeyndCQ0ODBgwYoNLS0kuOz58/X6+++qrKysq0fft2tW/fXk6nU+fOnTNrxo0bp/3798vlcmndunXasmWLJk2aZI57vV5lZ2erZ8+eqqmp0SuvvKJZs2bpt7/97XV/f98FV+qBJI0aNSrg++IPf/hDwDg9uDbV1dXKz8/Xtm3b5HK55Pf7lZ2drYaGBrMmFD9/Dh8+rJycHA0bNky1tbUqLCzU888/r40bN97Q93tTMXBD3XfffUZ+fr75vKmpyUhJSTHmzZsXxlndPGbOnGkMGDDgkmMnTpwwoqOjjbVr15rbPv74Y0OS4Xa7DcMwjA0bNhiRkZGGx+Mxa1577TXDbrcbPp/vus79ZiDJeOedd8znzc3NRnJysvHKK6+Y206cOGHYbDbjD3/4g2EYhnHgwAFDkvHRRx+ZNX/961+NiIgI43//938NwzCMZcuWGZ06dQrowYsvvmjcdddd1/kdffdc3APDMIy8vDzjkUce+cZ96EHoHTt2zJBkVFdXG4YRup8/06dPN+6+++6A1xo7dqzhdDqv91u6aXFG6AZqbGxUTU2NsrKyzG2RkZHKysqS2+0O48xuLp988olSUlJ02223ady4caqrq5Mk1dTUyO/3B6x/nz591KNHD3P93W63+vXrF/ALN51Op7xer/bv339j38hN4PDhw/J4PAFrHh8fr8zMzIA1T0hI0ODBg82arKwsRUZGavv27WbNgw8+qJiYGLPG6XTq0KFDOn78+A16N99tmzdvVmJiou666y5NmTJFX331lTlGD0Lv5MmTkqTOnTtLCt3PH7fbHXCMlhr+Dbl6BKEb6N///reamppa/VbrpKQkeTyeMM3q5pKZmany8nJVVFTotdde0+HDh/XAAw/o1KlT8ng8iomJafWHdC9cf4/Hc8n+tIwhOC1rdrmveY/Ho8TExIDxW265RZ07d6YvITJq1Cj9/ve/V1VVlX75y1+qurpao0ePVlNTkyR6EGrNzc0qLCzUD37wA91zzz2SFLKfP99U4/V6dfbs2evxdm56N/2f2IC1jB492vzv/v37KzMzUz179tSaNWvUrl27MM4MCJ8nnnjC/O9+/fqpf//+uv3227V582aNGDEijDO7OeXn52vfvn0B1yfi24szQjdQ165d
FRUV1eougfr6eiUnJ4dpVje3hIQE3Xnnnfr000+VnJysxsZGnThxIqDmwvVPTk6+ZH9axhCcljW73Nd8cnJyq5sFzp8/r6+//pq+XCe33Xabunbtqk8//VQSPQilgoICrVu3Th988IG6d+9ubg/Vz59vqrHb7fyfvatEELqBYmJilJGRoaqqKnNbc3Ozqqqq5HA4wjizm9fp06f12WefqVu3bsrIyFB0dHTA+h86dEh1dXXm+jscDu3duzfgHwWXyyW73a709PQbPv/vurS0NCUnJwesudfr1fbt2wPW/MSJE6qpqTFrNm3apObmZmVmZpo1W7Zskd/vN2tcLpfuuusuderU6Qa9m5vHv/71L3311Vfq1q2bJHoQCoZhqKCgQO+88442bdqktLS0gPFQ/fxxOBwBx2ip4d+QaxDuq7Wt5u233zZsNptRXl5uHDhwwJg0aZKRkJAQcJcArt6Pf/xjY/Pmzcbhw4eNv//970ZWVpbRtWtX49ixY4ZhGMbkyZONHj16GJs2bTJ27txpOBwOw+FwmPufP3/euOeee4zs7GyjtrbWqKioMG699VajuLg4XG/pW+/UqVPG7t27jd27dxuSjIULFxq7d+82/ud//scwDMN4+eWXjYSEBOPPf/6zsWfPHuORRx4x0tLSjLNnz5rHGDVqlHHvvfca27dvNz788EPjjjvuMJ588klz/MSJE0ZSUpLx9NNPG/v27TPefvttIy4uzvjNb35zw9/vt9HlenDq1CnjJz/5ieF2u43Dhw8b77//vjFo0CDjjjvuMM6dO2cegx5cmylTphjx8fHG5s2bjaNHj5qPM2fOmDWh+Pnzz3/+04iLizOmTZtmfPzxx0ZpaakRFRVlVFRU3ND3ezMhCIXBr3/9a6NHjx5GTEyMcd999xnbtm0L95RuGmPHjjW6detmxMTEGN/73veMsWPHGp9++qk5fvbsWeO///u/jU6dOhlxcXHGD3/4Q+Po0aMBx/j888+N0aNHG+3atTO6du1q/PjHPzb8fv+NfivfGR988IEhqdUjLy/PMIz/3EL/0ksvGUlJSYbNZjNGjBhhHDp0KOAYX331lfHkk08aHTp0MOx2uzF+/Hjj1KlTATX/+Mc/jPvvv9+w2WzG9773PePll1++UW/xW+9yPThz5oyRnZ1t3HrrrUZ0dLTRs2dPY+LEia3+zxc9uDaXWn9JxooVK8yaUP38+eCDD4yBAwcaMTExxm233RbwGghehGEYxo0+CwUAAPBtwDVCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsv4/2I1qxbjTQ/MAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"markdown","source":"### Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T11:51:32.281478Z","iopub.execute_input":"2023-05-12T11:51:32.281773Z","iopub.status.idle":"2023-05-12T11:51:33.112847Z","shell.execute_reply.started":"2023-05-12T11:51:32.281748Z","shell.execute_reply":"2023-05-12T11:51:33.111873Z"},"trusted":true},"execution_count":12,"outputs":[{"name":"stderr","text":"Downloading (…)okenizer_config.json: 
100%|██████████| 28.0/28.0 [00:00<00:00, 10.1kB/s]\nDownloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 382kB/s]\nDownloading (…)solve/main/vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 13.9MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 1.72M/1.72M [00:00<00:00, 97.0MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n return tokenizer(text_data.comment_text.values.tolist(), \n max_length=max_len, \n truncation=True, \n padding=\"max_length\",\n add_special_tokens=True,\n return_tensors=\"tf\",\n return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:33.113957Z","iopub.execute_input":"2023-05-12T11:51:33.114337Z","iopub.status.idle":"2023-05-12T11:51:33.120170Z","shell.execute_reply.started":"2023-05-12T11:51:33.114311Z","shell.execute_reply":"2023-05-12T11:51:33.119390Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:33.121147Z","iopub.execute_input":"2023-05-12T11:51:33.121426Z","iopub.status.idle":"2023-05-12T11:52:16.634801Z","shell.execute_reply.started":"2023-05-12T11:51:33.121402Z","shell.execute_reply":"2023-05-12T11:52:16.633568Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), train[\"toxic\"]))\n .repeat()\n .shuffle(Config.BUFFER_SIZE)\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = 
tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:16.636039Z","iopub.execute_input":"2023-05-12T11:52:16.636353Z","iopub.status.idle":"2023-05-12T11:52:16.668330Z","shell.execute_reply.started":"2023-05-12T11:52:16.636312Z","shell.execute_reply":"2023-05-12T11:52:16.667443Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformer, max_len=Config.MAX_LEN):\n input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n bert_layers = transformer.bert(input_ids, attention_mask=masks)[0]\n \n \"\"\"intermediate = Dense(1024, activation='relu')(bert_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\"\"\"\n \n out = GlobalMaxPool1D()(bert_layers)\n out = Dense(768, activation=\"relu\")(out)\n out = Dropout(0.1)(out)\n out = Dense(384, activation=\"relu\")(out)\n output = Dense(1, activation=\"sigmoid\")(out)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:16.669447Z","iopub.execute_input":"2023-05-12T11:52:16.669741Z","iopub.status.idle":"2023-05-12T11:52:16.679253Z","shell.execute_reply.started":"2023-05-12T11:52:16.669715Z","shell.execute_reply":"2023-05-12T11:52:16.678393Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformer = TFAutoModel.from_pretrained(Config.MODEL)\n model = 
model_builder(transformer=transformer)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:16.680310Z","iopub.execute_input":"2023-05-12T11:52:16.680659Z","iopub.status.idle":"2023-05-12T11:52:51.834986Z","shell.execute_reply.started":"2023-05-12T11:52:16.680636Z","shell.execute_reply":"2023-05-12T11:52:51.833917Z"},"trusted":true},"execution_count":17,"outputs":[{"name":"stderr","text":"Downloading tf_model.h5: 100%|██████████| 999M/999M [00:10<00:00, 99.5MB/s] \nSome layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']\n- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nAll the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:51.836335Z","iopub.execute_input":"2023-05-12T11:52:51.836733Z","iopub.status.idle":"2023-05-12T11:52:51.881506Z","shell.execute_reply.started":"2023-05-12T11:52:51.836705Z","shell.execute_reply":"2023-05-12T11:52:51.880546Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to 
\n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n bert (TFBertMainLayer) TFBaseModelOutputWi 167356416 ['input_ids[0][0]', \n thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 768), \n pooler_output=(Non \n e, 768), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n global_max_pooling1d (GlobalMa (None, 768) 0 ['bert[0][0]'] \n xPooling1D) \n \n dense (Dense) (None, 768) 590592 ['global_max_pooling1d[0][0]'] \n \n dropout_37 (Dropout) (None, 768) 0 ['dense[0][0]'] \n \n dense_1 (Dense) (None, 384) 295296 ['dropout_37[0][0]'] \n \n dense_2 (Dense) (None, 1) 385 ['dense_1[0][0]'] \n \n==================================================================================================\nTotal params: 168,242,689\nTrainable params: 168,242,689\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:51.882653Z","iopub.execute_input":"2023-05-12T11:52:51.883060Z","iopub.status.idle":"2023-05-12T12:13:12.713842Z","shell.execute_reply.started":"2023-05-12T11:52:51.883032Z","shell.execute_reply":"2023-05-12T12:13:12.712319Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Epoch 1/2\nWARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss. 
If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stderr","text":"WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0486 - auc: 0.9973","output_type":"stream"},{"name":"stderr","text":"2023-05-12 12:03:44.473468: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 12:03:44.751296: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 666s 136ms/step - loss: 0.0486 - auc: 0.9973 - val_loss: 0.6711 - val_auc: 0.7589\nEpoch 2/2\n4185/4185 [==============================] - 553s 132ms/step - loss: 0.0420 - auc: 0.9980 - val_loss: 0.6677 - val_auc: 0.7927\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:14:40.714960Z","iopub.execute_input":"2023-05-12T12:14:40.715901Z","iopub.status.idle":"2023-05-12T12:14:44.186043Z","shell.execute_reply.started":"2023-05-12T12:14:40.715867Z","shell.execute_reply":"2023-05-12T12:14:44.184917Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stdout","text":"63/63 [==============================] - 3s 39ms/step - loss: 0.1322 - auc: 0.9915\n","output_type":"stream"},{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"[0.1321573555469513, 0.9915268421173096]"},"metadata":{}}]},{"cell_type":"code","source":"val_steps_per_epoch 
= val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:13:29.464786Z","iopub.execute_input":"2023-05-12T12:13:29.465239Z","iopub.status.idle":"2023-05-12T12:14:32.640293Z","shell.execute_reply.started":"2023-05-12T12:13:29.465206Z","shell.execute_reply":"2023-05-12T12:14:32.639052Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 8s 131ms/step - loss: 0.3363 - auc: 0.8208\nEpoch 2/2\n62/62 [==============================] - 54s 131ms/step - loss: 0.1986 - auc: 0.9492\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:14:57.376394Z","iopub.execute_input":"2023-05-12T12:14:57.377409Z","iopub.status.idle":"2023-05-12T12:15:33.710506Z","shell.execute_reply.started":"2023-05-12T12:14:57.377371Z","shell.execute_reply":"2023-05-12T12:15:33.709250Z"},"trusted":true},"execution_count":22,"outputs":[{"name":"stderr","text":"2023-05-12 12:15:02.970518: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n2023-05-12 12:15:03.224869: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 36s 
49ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"mbert-fine-tuned-2-hiddenstates\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:44:30.231587Z","iopub.execute_input":"2023-05-12T10:44:30.231921Z"},"trusted":true},"execution_count":null,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). These functions will not be directly callable after loading.\n","output_type":"stream"}]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"!huggingface-cli login --token $HF_TOKEN","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"m.summary()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"preds = m.predict(test_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
experiment_notebooks/Transformer-mBert-Pooler-state.ipynb.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:47:51.413800Z","iopub.execute_input":"2023-05-12T11:47:51.414070Z","iopub.status.idle":"2023-05-12T11:48:18.602918Z","shell.execute_reply.started":"2023-05-12T11:47:51.414046Z","shell.execute_reply":"2023-05-12T11:48:18.601877Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:49:44.573340Z","iopub.execute_input":"2023-05-12T11:49:44.574079Z","iopub.status.idle":"2023-05-12T11:50:25.814142Z","shell.execute_reply.started":"2023-05-12T11:49:44.574046Z","shell.execute_reply":"2023-05-12T11:50:25.812749Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 11:50:18.442119879 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 11:50:18.442154551 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 11:50:18.442158337 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 11:50:18.442160973 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 11:50:18.442163321 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 11:50:18.442165969 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 11:50:18.442168796 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 11:50:18.442171109 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 11:50:18.442173402 14 
config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 11:50:18.442175638 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 11:50:18.442177867 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 11:50:18.442181062 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 11:50:18.442183630 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 11:50:18.442185959 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 11:50:18.442394344 14 ev_epoll1_linux.cc:122] grpc epoll fd: 62\nD0512 11:50:18.453257763 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 11:50:18.453301358 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 11:50:18.453762003 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 11:50:18.453774538 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 11:50:18.453779385 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 11:50:18.453782660 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 11:50:18.453786243 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 11:50:18.453789942 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 11:50:18.453797356 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 11:50:18.453818829 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 11:50:18.453851056 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 11:50:18.453873781 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 11:50:18.453877823 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 11:50:18.453881490 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 11:50:18.453888362 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 11:50:18.453892163 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 11:50:18.453896027 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 11:50:18.453901564 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 11:50:18.456269287 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 11:50:18.476859295 376 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 11:50:18.484409363 376 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {created_time:\"2023-05-12T11:50:18.484390999+00:00\", grpc_status:2}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:25.816399Z","iopub.execute_input":"2023-05-12T11:50:25.817031Z","iopub.status.idle":"2023-05-12T11:50:35.943243Z","shell.execute_reply.started":"2023-05-12T11:50:25.816998Z","shell.execute_reply":"2023-05-12T11:50:35.942201Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 3 #2\n MODEL = \"bert-base-multilingual-uncased\"\n BUFFER_SIZE = 2048\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 1e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.944622Z","iopub.execute_input":"2023-05-12T11:50:35.944945Z","iopub.status.idle":"2023-05-12T11:50:35.950932Z","shell.execute_reply.started":"2023-05-12T11:50:35.944916Z","shell.execute_reply":"2023-05-12T11:50:35.949929Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = 
pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.953167Z","iopub.execute_input":"2023-05-12T11:50:35.953494Z","iopub.status.idle":"2023-05-12T11:51:03.310955Z","shell.execute_reply.started":"2023-05-12T11:50:35.953467Z","shell.execute_reply":"2023-05-12T11:51:03.309809Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 
0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 59848 This is so cool. It's like, 'would you want yo... 0.000000 \\\n1 59849 Thank you!! This would make my life a lot less... 0.000000 \n2 59852 This is such an urgent design problem; kudos t... 0.000000 \n3 59855 Is this something I'll be able to install on m... 0.000000 \n4 59856 haha you guys are a bunch of losers. 0.893617 \n\n severe_toxicity obscene identity_attack insult threat asian atheist \n0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \\\n1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n\n ... article_id rating funny wow sad likes disagree \n0 ... 2006 rejected 0 0 0 0 0 \\\n1 ... 2006 rejected 0 0 0 0 0 \n2 ... 2006 rejected 0 0 0 0 0 \n3 ... 2006 rejected 0 0 0 0 0 \n4 ... 
2006 rejected 0 0 0 1 0 \n\n sexual_explicit identity_annotator_count toxicity_annotator_count \n0 0.0 0 4 \n1 0.0 0 4 \n2 0.0 0 4 \n3 0.0 0 4 \n4 0.0 4 47 \n\n[5 rows x 45 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxicity</th>\n <th>obscene</th>\n <th>identity_attack</th>\n <th>insult</th>\n <th>threat</th>\n <th>asian</th>\n <th>atheist</th>\n <th>...</th>\n <th>article_id</th>\n <th>rating</th>\n <th>funny</th>\n <th>wow</th>\n <th>sad</th>\n <th>likes</th>\n <th>disagree</th>\n <th>sexual_explicit</th>\n <th>identity_annotator_count</th>\n <th>toxicity_annotator_count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>59848</td>\n <td>This is so cool. It's like, 'would you want yo...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>59849</td>\n <td>Thank you!! 
This would make my life a lot less...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>59852</td>\n <td>This is such an urgent design problem; kudos t...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>59855</td>\n <td>Is this something I'll be able to install on m...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>59856</td>\n <td>haha you guys are a bunch of losers.</td>\n <td>0.893617</td>\n <td>0.021277</td>\n <td>0.0</td>\n <td>0.021277</td>\n <td>0.87234</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0.0</td>\n <td>4</td>\n <td>47</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 45 
columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text lang toxic\n0 0 Este usuario ni siquiera llega al rango de ... es 0\n1 1 Il testo di questa voce pare esser scopiazzato... it 0\n2 2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... es 1\n3 3 Bu maddenin alt başlığı olarak uluslararası i... tr 0\n4 4 Belçika nın şehirlerinin yanında ilçe ve belde... tr 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>es</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>it</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>es</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id content lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>content</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0 202165\n1 21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000 1333035\n0.166667 138501\n0.200000 113271\n0.300000 62195\n0.400000 52703\n ... 
\n0.037609 1\n0.971193 1\n0.988430 1\n0.008309 1\n0.967316 1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0 6770\n1 1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr 3000\nes 2500\nit 2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr 14000\npt 11012\nru 10948\nfr 10920\nit 8494\nes 8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, 
inplace=True)\nsub = test[['id']]\ntrain2.toxic = (train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.312161Z","iopub.execute_input":"2023-05-12T11:51:03.312475Z","iopub.status.idle":"2023-05-12T11:51:03.453706Z","shell.execute_reply.started":"2023-05-12T11:51:03.312450Z","shell.execute_reply":"2023-05-12T11:51:03.452741Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.454767Z","iopub.execute_input":"2023-05-12T11:51:03.455331Z","iopub.status.idle":"2023-05-12T11:51:03.481303Z","shell.execute_reply.started":"2023-05-12T11:51:03.455304Z","shell.execute_reply":"2023-05-12T11:51:03.480425Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"toxic\n0 1789968\n1 112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.482311Z","iopub.execute_input":"2023-05-12T11:51:03.482966Z","iopub.status.idle":"2023-05-12T11:51:03.827807Z","shell.execute_reply.started":"2023-05-12T11:51:03.482940Z","shell.execute_reply":"2023-05-12T11:51:03.826717Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.829068Z","iopub.execute_input":"2023-05-12T11:51:03.829375Z","iopub.status.idle":"2023-05-12T11:51:03.834997Z","shell.execute_reply.started":"2023-05-12T11:51:03.829350Z","shell.execute_reply":"2023-05-12T11:51:03.834118Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(535775, 
2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.835982Z","iopub.execute_input":"2023-05-12T11:51:03.836269Z","iopub.status.idle":"2023-05-12T11:51:03.854775Z","shell.execute_reply.started":"2023-05-12T11:51:03.836227Z","shell.execute_reply":"2023-05-12T11:51:03.853871Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Explanation\\nWhy the edits made under my usern... 0\n1 D'aww! He matches this background colour I'm s... 0\n2 Hey man, I'm really not trying to edit war. It... 0\n3 \"\\nMore\\nI can't make any real suggestions on ... 0\n4 You, sir, are my hero. Any chance you remember... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.858708Z","iopub.execute_input":"2023-05-12T11:51:03.859118Z","iopub.status.idle":"2023-05-12T11:51:03.866689Z","shell.execute_reply.started":"2023-05-12T11:51:03.859092Z","shell.execute_reply":"2023-05-12T11:51:03.865871Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Este usuario ni siquiera llega al rango de ... 0\n1 Il testo di questa voce pare esser scopiazzato... 0\n2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... 1\n3 Bu maddenin alt başlığı olarak uluslararası i... 0\n4 Belçika nın şehirlerinin yanında ilçe ve belde... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.867828Z","iopub.execute_input":"2023-05-12T11:51:03.868255Z","iopub.status.idle":"2023-05-12T11:51:03.881894Z","shell.execute_reply.started":"2023-05-12T11:51:03.868213Z","shell.execute_reply":"2023-05-12T11:51:03.881141Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" id comment_text lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.882947Z","iopub.execute_input":"2023-05-12T11:51:03.883338Z","iopub.status.idle":"2023-05-12T11:51:03.892723Z","shell.execute_reply.started":"2023-05-12T11:51:03.883311Z","shell.execute_reply":"2023-05-12T11:51:03.891955Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.893692Z","iopub.execute_input":"2023-05-12T11:51:03.894038Z","iopub.status.idle":"2023-05-12T11:51:05.368808Z","shell.execute_reply.started":"2023-05-12T11:51:03.894014Z","shell.execute_reply":"2023-05-12T11:51:05.367736Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 
30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:05.369914Z","iopub.execute_input":"2023-05-12T11:51:05.370196Z","iopub.status.idle":"2023-05-12T11:51:08.102915Z","shell.execute_reply.started":"2023-05-12T11:51:05.370173Z","shell.execute_reply":"2023-05-12T11:51:08.101871Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"56.28243572395129\n2321\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGdCAYAAAD+JxxnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+BklEQVR4nO3de3BU9f3/8VcSkw0BNuFiElICRFEhykVCDfutOlxCFsw4UqODymhEhIFv4jSkBZv+MNzawWK5WYJpqxA6SgU61VagIWuQUMsCEki5CaMWv7Ff2GAVWAiwWZLz+6OT82UJQhYWVjnPx8xO3fN5n7Of/byT8OrZc5IIwzAMAQAAWFBkuCcAAAAQLgQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWbeEewLfZs3NzTpy5Ig6duyoiIiIcE8HAAC0gWEYOnXqlFJSUhQZeflzPgShyzhy5IhSU1PDPQ0AAHAVvvjiC3Xv3v2yNQShy+jYsaOk/yyk3W4P6bH9fr8qKyuVnZ2t6OjokB4bbUMPwo8ehB89CD96EHper1epqanmv+OXQxC6jJaPw+x2+3UJQnFxcbLb7Xzhhwk9CD96EH70IPzowfXTlstauFgaAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABY1i3hnoDV3TNro3xNEUHv9/nLOddhNgAAWAtnhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGVdUxB6+eWXFRERocLCQnPbuXPnlJ+fry5duqhDhw7Kzc1VfX19wH51dXXKyclRXFycEhMTNW3aNJ0/fz6gZvPmzRo0aJBsNpt69+6t8vLyVq9fWlqqXr16KTY2VpmZmdqxY0fAeFvmAgAArOuqg9BHH32k3/zmN+rfv3/A9qlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7uqIHT69GmNGzdOv/vd79SpUydz+8mTJ/XGG29o4cKFGj58uDIyMrRixQpt3bpV27ZtkyRVVlbqwIEDevPNNzVw4ECNHj1ac+fOVWlpqRobGyVJZWVlSktL04IFC9S3b18VFBToscce06JFi8zXWrhwoSZOnKjx48crPT1dZWVliouL0/Lly9s8FwAAYG1XFYTy8/OVk5OjrKysgO01NTXy+/0B2/v06aMePXrI7XZLktxut/r166e
kpCSzxul0yuv1av/+/WbNxcd2Op3mMRobG1VTUxNQExkZqaysLLOmLXMBAADWdkuwO7z99tvatWuXPvroo1ZjHo9HMTExSkhICNielJQkj8dj1lwYglrGW8YuV+P1enX27FkdP35cTU1Nl6w5ePBgm+dyMZ/PJ5/PZz73er2SJL/fL7/ff8l9rlbL8WyRxjXtj6vXsoasZfjQg/CjB+FHD0IvmLUMKgh98cUX+tGPfiSXy6XY2NigJ/ZtN2/ePM2ePbvV9srKSsXFxV2X15w7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtUEFoZqaGh07dkyDBg0ytzU1NWnLli1aunSpNm7cqMbGRp04cSLgTEx9fb2Sk5MlScnJya3u7mq5k+vCmovv7qqvr5fdble7du0UFRWlqKioS9ZceIwrzeVixcXFKioqMp97vV6lpqYqOztbdru9LUvUZn6/Xy6XSy/tjJSvOSLo/ffNcoZ0PlbU0oORI0cqOjo63NOxJHoQfvQg/OhB6LV8otMWQQWhESNGaO/evQHbxo8frz59+ujFF19UamqqoqOjVVVVpdzcXEnSoUOHVFdXJ4fDIUlyOBz6xS9+oWPHjikxMVHSf1Kw3W5Xenq6WXPxGQ+Xy2UeIyYmRhkZGaqqqtKYMWMkSc3NzaqqqlJBQYEkKSMj44pzuZjNZpPNZmu1PTo6+rp9cfqaI+RrCj4I8c0SOtezv2gbehB+9CD86EHoBLOOQQWhjh076p577gnY1r59e3Xp0sXcPmHCBBUVFalz586y2+164YUX5HA4NGTIEElSdna20tPT9fTTT2v+/PnyeDyaMWOG8vPzzRAyefJkLV26VNOnT9dzzz2nTZs2ac2aNVq/fr35ukVFRcrLy9PgwYN13333afHixWpoaND48eMlSfHx8VecCwAAsLagL5a+kkWLFikyMlK5ubny+XxyOp1atmyZOR4VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv/7OGjs2LH68ssvVVJSIo/Ho4EDB6qioiLgAuorzQUAAFjbNQehzZs3BzyPjY1VaWmpSktLv3Gfnj17XvFi36FDh2r37t2XrSkoKDA/CruUtswFAABYF39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQei1115T//79ZbfbZbfb5XA49Ne//tUcHzp0qCIiIgIekydPDjhGXV2dcnJyFBcXp8TERE2bNk3nz58PqNm8ebMGDRokm82m3r17q7y8vNVcSktL1atXL8XGxiozM1M7duwIGD937pzy8/PVpUsXdejQQbm5uaqvrw/m7QIAgJtcUEGoe/fuevnll1VTU6OdO3dq+PDheuSRR7R//36zZuLEiTp69Kj5mD9/vjnW1NSknJwcNTY2auvWrVq5cqXKy8tVUlJi1hw+fFg5OTkaNmyYamtrVVhYqOeff14bN240a1avXq2ioiLNnDlTu3bt0oABA+R0OnXs2DGzZurUqXrvvfe0du1aVVdX68iRI3r00UevapEAAMDNKagg9PDDD+uhhx7SHXfcoTvvvFO/+MUv1KFDB23bts2siYuLU3Jysvmw2+3mWGVlpQ4cOKA333xTAwcO1OjRozV37lyVlpaqsbFRklRWVqa0tDQtWLBAffv2VUFBgR577DEtWrTIPM7ChQs1ceJEjR8/Xunp6SorK1NcXJyWL18uSTp58qTeeOMNLVy4UMOHD1dGRoZWrFihrVu3Bsw
VAABY2y1Xu2NTU5PWrl2rhoYGORwOc/tbb72lN998U8nJyXr44Yf10ksvKS4uTpLkdrvVr18/JSUlmfVOp1NTpkzR/v37de+998rtdisrKyvgtZxOpwoLCyVJjY2NqqmpUXFxsTkeGRmprKwsud1uSVJNTY38fn/Acfr06aMePXrI7XZryJAhl3xPPp9PPp/PfO71eiVJfr9ffr//apbpG7UczxZpXNP+uHota8hahg89CD96EH70IPSCWcugg9DevXvlcDh07tw5dejQQe+8847S09MlSU899ZR69uyplJQU7dmzRy+++KIOHTqkP/3pT5Ikj8cTEIIkmc89Hs9la7xer86ePavjx4+rqanpkjUHDx40jxETE6OEhIRWNS2vcynz5s3T7NmzW22vrKw0w1yozR3cfFX7bdiwIcQzsS6XyxXuKVgePQg/ehB+9CB0zpw50+baoIPQXXfdpdraWp08eVJ//OMflZeXp+rqaqWnp2vSpElmXb9+/dStWzeNGDFCn332mW6//fZgX+qGKy4uVlFRkfnc6/UqNTVV2dnZAR/xhYLf75fL5dJLOyPla44Iev99s5whnY8VtfRg5MiRio6ODvd0LIkehB89CD96EHotn+i0RdBBKCYmRr1795YkZWRk6KOPPtKSJUv0m9/8plVtZmamJOnTTz/V7bffruTk5FZ3d7XcyZWcnGz+78V3d9XX18tut6tdu3aKiopSVFTUJWsuPEZjY6NOnDgRcFbowppLsdlsstlsrbZHR0dfty9OX3OEfE3BByG+WULnevYXbUMPwo8ehB89CJ1g1vGaf49Qc3NzwHU1F6qtrZUkdevWTZLkcDi0d+/egLu7XC6X7Ha7+fGaw+FQVVVVwHFcLpd5HVJMTIwyMjICapqbm1VVVWXWZGRkKDo6OqDm0KFDqqurC7ieCQAAWFtQZ4SKi4s1evRo9ejRQ6dOndKqVau0efNmbdy4UZ999plWrVqlhx56SF26dNGePXs0depUPfjgg+rfv78kKTs7W+np6Xr66ac1f/58eTwezZgxQ/n5+eaZmMmTJ2vp0qWaPn26nnvuOW3atElr1qzR+vXrzXkUFRUpLy9PgwcP1n333afFixeroaFB48ePlyTFx8drwoQJKioqUufOnWW32/XCCy/I4XB844XSAADAeoIKQseOHdMzzzyjo0ePKj4+Xv3799fGjRs1cuRIffHFF3r//ffNUJKamqrc3FzNmDHD3D8qKkrr1q3TlClT5HA41L59e+Xl5WnOnDlmTVpamtavX6+pU6dqyZIl6t69u15//XU5nf93TczYsWP15ZdfqqSkRB6PRwMHDlRFRUXABdSLFi1SZGSkcnNz5fP55HQ6tWzZsmtZKwAAcJMJKgi98cYb3ziWmpqq6urqKx6jZ8+eV7zjaejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyJvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6gg9Nprr6l///6y2+2y2+1yOBz661//ao6fO3dO+fn56tKlizp06KDc3FzV19cHHKOurk45OTmKi4tTYmKipk2bpvPnzwfUbN68WYMGDZLNZlPv3r1VXl7eai6lpaXq1auXYmNjlZmZqR07dgSMt2UuAADA2oIKQt27d9fLL7+smpoa7dy5U8OHD9cjjzyi/fv3S5KmTp2q9957T2vXrlV1dbWOHDmiRx991Ny/qalJOTk5amxs1NatW7Vy5UqVl5erpKTErDl8+LBycnI0bNgw1dbWqrC
wUM8//7w2btxo1qxevVpFRUWaOXOmdu3apQEDBsjpdOrYsWNmzZXmAgAAEFQQevjhh/XQQw/pjjvu0J133qlf/OIX6tChg7Zt26aTJ0/qjTfe0MKFCzV8+HBlZGRoxYoV2rp1q7Zt2yZJqqys1IEDB/Tmm29q4MCBGj16tObOnavS0lI1NjZKksrKypSWlqYFCxaob9++Kigo0GOPPaZFixaZ81i4cKEmTpyo8ePHKz09XWVlZYqLi9Py5cslqU1zAQAAuOVqd2xqatLatWvV0NAgh8Ohmpoa+f1+ZWVlmTV9+vRRjx495Ha7NWTIELndbvXr109JSUlmjdPp1JQpU7R//37de++9crvdAcdoqSksLJQkNTY2qqamRsXFxeZ4ZGSksrKy5Ha7JalNc7kUn88nn89nPvd6vZIkv98vv99/lSt1aS3Hs0Ua17Q/rl7LGrKW4UMPwo8ehB89CL1g1jLoILR37145HA6dO3dOHTp00DvvvKP09HTV1tYqJiZGCQkJAfVJSUnyeDySJI/HExCCWsZbxi5X4/V6dfbsWR0/flxNTU2XrDl48KB5jCvN5VLmzZun2bNnt9peWVmpuLi4b9zvWswd3HxV+23YsCHEM7Eul8sV7ilYHj0IP3oQfvQgdM6cOdPm2qCD0F133aXa2lqdPHlSf/zjH5WXl6fq6upgD/OtVFxcrKKiIvO51+tVamqqsrOzZbfbQ/pafr9fLpdLL+2MlK85Iuj9981yhnQ+VtTSg5EjRyo6Ojrc07EkehB+9CD86EHotXyi0xZBB6GYmBj17t1bkpSRkaGPPvpIS5Ys0dixY9XY2KgTJ04EnImpr69XcnKyJCk5ObnV3V0td3JdWHPx3V319fWy2+1q166doqKiFBUVdcmaC49xpblcis1mk81ma7U9Ojr6un1x+poj5GsKPgjxzRI617O/aBt6EH70IPzoQegEs47X/HuEmpub5fP5lJGRoejoaFVVVZljhw4dUl1dnRwOhyTJ4XBo7969AXd3uVwu2e12paenmzUXHqOlpuUYMTExysjICKhpbm5WVVWVWdOWuQAAAAR1Rqi4uFijR49Wjx49dOrUKa1atUqbN2/Wxo0bFR8frwkTJqioqEidO3eW3W7XCy+8IIfDYV6cnJ2drfT0dD399NOaP3++PB6PZsyYofz8fPNMzOTJk7V06VJNnz5dzz33nDZt2qQ1a9Zo/fr15jyKioqUl5enwYMH67777tPixYvV0NCg8ePHS1Kb5gIAABBUEDp27JieeeYZHT16VPHx8erfv782btyokSNHSpIWLVqkyMhI5ebmyufzyel0atmyZeb+UVFRWrdunaZMmSKHw6H27dsrLy9Pc+bMMWvS0tK0fv16TZ06VUuWLFH37t31+uuvy+n8v2tixo4dqy+//FIlJSXyeDwaOHCgKioqAi6gvtJcAAAAIgzDuLr7ty3A6/UqPj5eJ0+evC4XS2/YsEHTd0Rd1TVCn7+cE9L5WFFLDx566CE+lw8TehB+9CD86EHoBfPvN39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQWjevHn6/ve/r44dOyoxMVFjxozRoUOHAmqGDh2qiIiIgMfkyZMDaurq6pSTk6O4uDglJiZq2rRpOn/+fEDN5s2bNWjQINlsNvXu3Vvl5eWt5lNaWqpevXopNjZWmZmZ2rFjR8D4uXPnlJ+fry5duqhDhw7Kzc1VfX19MG8ZAADcxIIKQtXV1crPz9e2bdvkcrnk9/uVnZ2thoaGgLqJEyf
q6NGj5mP+/PnmWFNTk3JyctTY2KitW7dq5cqVKi8vV0lJiVlz+PBh5eTkaNiwYaqtrVVhYaGef/55bdy40axZvXq1ioqKNHPmTO3atUsDBgyQ0+nUsWPHzJqpU6fqvffe09q1a1VdXa0jR47o0UcfDXqRAADAzemWYIorKioCnpeXlysxMVE1NTV68MEHze1xcXFKTk6+5DEqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP9XJkyf1xhtvaNWqVRo+fLgkacWKFerbt6+2bdumIUOGBPPWAQDATSioIHSxkydPSpI6d+4csP2tt97Sm2++qeTkZD388MN66aWXFBcXJ0lyu93q16+fkpKSzHqn06kpU6Zo//79uvfee+V2u5WVlRVwTKfTqcLCQklSY2OjampqVFxcbI5HRkYqKytLbrdbklRTUyO/3x9wnD59+qhHjx5yu92XDEI+n08+n8987vV6JUl+v19+vz/o9bmcluPZIo1r2h9Xr2UNWcvwoQfhRw/Cjx6EXjBredVBqLm5WYWFhfrBD36ge+65x9z+1FNPqWfPnkpJSdGePXv04osv6tChQ/rTn/4kSfJ4PAEhSJL53OPxXLbG6/Xq7NmzOn78uJqami5Zc/DgQfMYMTExSkhIaFXT8joXmzdvnmbPnt1qe2VlpRnkQm3u4Oar2m/Dhg0hnol1uVyucE/B8uhB+NGD8KMHoXPmzJk21151EMrPz9e+ffv04YcfBmyfNGmS+d/9+vVTt27dNGLECH322We6/fbbr/blboji4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r37ZWszMzMlSZ9++qluv/12JScnt7q7q+VOrpbripKTk1vd3VVfXy+73a527dopKipKUVFRl6y58BiNjY06ceJEwFmhC2suZrPZZLPZWm2Pjo6+bl+cvuYI+ZqCD0J8s4TO9ewv2oYehB89CD96EDrBrGNQd40ZhqGCggK988472rRpk9LS0q64T21trSSpW7dukiSHw6G9e/cG3N3lcrlkt9uVnp5u1lRVVQUcx+VyyeFwSJJiYmKUkZERUNPc3KyqqiqzJiMjQ9HR0QE1hw4dUl1dnVkDAACsLagzQvn5+Vq1apX+/Oc/q2PHjua1NvHx8WrXrp0+++wzrVq1Sg899JC6dOmiPXv2aOrUqXrwwQfVv39/SVJ2drbS09P19NNPa/78+fJ4PJoxY4by8/PNszGTJ0/W0qVLNX36dD333HPatGmT1qxZo/Xr15tzKSoqUl5engYPHqz77rtPixcvVkNDg3kXWXx8vCZMmKCioiJ17txZdrtdL7zwghwOB3eMAQAASUEGoddee03Sf35p4oVWrFihZ599VjExMXr//ffNUJKamqrc3FzNmDHDrI2KitK6des0ZcoUORwOtW/fXnl5eZozZ45Zk5aWpvXr12vq1KlasmSJunfvrtdff928dV6Sxo4dqy+//FIlJSXyeDwaOHCgKioqAi6gXrRokSIjI5Wbmyufzyen06lly5YFtUAAAODmFVQQMozL3+qdmpqq6urqKx6nZ8+eV7zraejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyHvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyA
EAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6ggNG/ePH3/+99Xx44dlZiYqDFjxujQoUMBNefOnVN+fr66dOmiDh06KDc3V/X19QE1dXV1ysnJUVxcnBITEzVt2jSdP38+oGbz5s0aNGiQbDabevfurfLy8lbzKS0tVa9evRQbG6vMzEzt2LEj6LkAAADrCioIVVdXKz8/X9u2bZPL5ZLf71d2drYaGhrMmqlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7slmOKKioqA5+Xl5UpMTFRNTY0efPBBnTx5Um+88YZWrVql4cOHS5JWrFihvn37atu2bRoyZIgqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP23TXAAAgLUFFYQudvLkSUlS586dJUk1NTXy+/3Kysoya/r06aMePXrI7XZryJAhcrvd6tevn5KSkswap9OpKVOmaP/+/br33nvldrsDjtFSU1hYKElqbGxUTU2NiouLzfHIyEhlZWXJ7Xa3eS4X8/l88vl85nOv1ytJ8vv98vv9V7VG36TleLZI45r2x9VrWUPWMnzoQfjRg/CjB6EXzFpedRBqbm5WYWGhfvCDH+iee+6RJHk8HsXExCghISGgNikpSR6Px6y5MAS1jLeMXa7G6/Xq7NmzOn78uJqami5Zc/DgwTbP5WLz5s3T7NmzW22vrKxUXFzcNy3FNZk7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtVcdhPLz87Vv3z59+OGHV3uIb53i4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r27uT05OVmNjY06ceJEwJmY+vp6JScnmzUX393VcifXhTUX391VX18vu92udu3aKSoqSlFRUZesufAYV5rLxWw2m2w2W6vt0dHR1+2L09ccIV9T8EGIb5bQuZ79RdvQg/CjB+FHD0InmHUM6q4xwzBUUFCgd955R5s2bVJaWlrAeEZGhqKjo1VVVWVuO3TokOrq6uRwOCRJDodDe/fuDbi7y+VyyW63Kz093ay58BgtNS3HiImJUUZGRkBNc3OzqqqqzJq2zAUAAFhbUGeE8vPztWrVKv35z39Wx44dzWtt4uPj1a5dO8XHx2vChAkqKipS586dZbfb9cILL8jhcJgXJ2dnZys9PV1PP/205s+fL4/HoxkzZig/P988GzN58mQtXbpU06dP13PPPadNmzZpzZo1Wr9+vTmXoqIi5eXlafDgwbrvvvu0ePFiNTQ0mHeRtWUuAADA2oIKQq+99pokaejQoQHbV6xYoWeffVaStGjRIkVGRio3N1c+n09Op1PLli0za6OiorRu3TpNmTJFDodD7du3V15enubMmWPWpKWlaf369Zo6daqWLFmi7t276/XXXzdvnZeksWPH6ssvv1RJSYk8Ho8GDhyoioqKgAuorzQXAABgbUEFIcO48q3esbGxKi0tVWlp6TfW9OzZ84p3PQ0dOlS7d+++bE1BQYEKCgquaS4AAMC6+FtjAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAAD
AsghCAADAsghCAADAsghCAADAsghCAADAsoIOQlu2bNHDDz+slJQURURE6N133w0Yf/bZZxURERHwGDVqVEDN119/rXHjxslutyshIUETJkzQ6dOnA2r27NmjBx54QLGxsUpNTdX8+fNbzWXt2rXq06ePYmNj1a9fP23YsCFg3DAMlZSUqFu3bmrXrp2ysrL0ySefBPuWAQDATSroINTQ0KABAwaotLT0G2tGjRqlo0ePmo8//OEPAePjxo3T/v375XK5tG7dOm3ZskWTJk0yx71er7Kzs9WzZ0/V1NTolVde0axZs/Tb3/7WrNm6dauefPJJTZgwQbt379aYMWM0ZswY7du3z6yZP3++Xn31VZWVlWn79u1q3769nE6nzp07F+zbBgAAN6Fbgt1h9OjRGj169GVrbDabkpOTLzn28ccfq6KiQh999JEGDx4sSfr1r3+thx56SL/61a+UkpKit956S42NjVq+fLliYmJ09913q7a2VgsXLjQD05IlSzRq1ChNmzZNkjR37ly5XC4tXbpUZWVlMgxDixcv1owZM/TII49Ikn7/+98rKSlJ7777rp544olg3zoAALjJBB2E2mLz5s1KTExUp06dNHz4cP385z9Xly5dJElut1sJCQlmCJKkrKwsRUZGavv27frhD38ot9utBx98UDExMWaN0+nUL3/5Sx0/flydOnWS2+1WUVFRwOs6nU7zo7rDhw/L4/EoKyvLHI+Pj1dmZqbcbvclg5DP55PP5zOfe71eSZLf75ff77/2hblAy/FskcY17Y+r17KGrGX40IPwowfhRw9CL5i1DHkQGjVqlB599FGlpaXps88+089+9jONHj1abrdbUVFR8ng8SkxMDJzELbeoc+fO8ng8kiSPx6O0tLSAmqSkJHOsU6dO8ng85rYLay48xoX7XarmYvPmzdPs2bNbba+srFRcXFxblyAocwc3X9V+F18PhavncrnCPQXLowfhRw/Cjx6EzpkzZ9pcG/IgdOGZln79+ql///66/fbbtXnzZo0YMSLULxdSxcXFAWeZvF6vUlNTlZ2dLbvdHtLX8vv9crlcemlnpHzNEUHvv2+WM6TzsaKWHowcOVLR0dHhno4l0YPwowfhRw9Cr+UTnba4Lh+NXei2225T165d9emnn2rEiBFKTk7WsWPHAmrOnz+vr7/+2ryuKDk5WfX19QE1Lc+vVHPheMu2bt26BdQMHDjwknO12Wyy2WyttkdHR1+3L05fc4R8TcEHIb5ZQud69hdtQw/Cjx6EHz0InWDW8br/HqF//etf+uqrr8ww4nA4dOLECdXU1Jg1mzZtUnNzszIzM82aLVu2BHzG53K5dNddd6lTp05mTVVVVcBruVwuORwOSVJaWpqSk5MDarxer7Zv327WAAAAaws6CJ0+fVq1tbWqra2V9J+Lkmtra1VXV6fTp09r2rRp2rZtmz7//HNVVVXpkUceUe/eveV0/uejnL59+2rUqFGaOHGiduzYob///e8qKCjQE088oZSUFEnSU089pZiYGE2YMEH79+/X6tWrtWTJkoCPrX70ox+poqJCCxYs0MGDBzVr1izt3LlTBQUFkqSIiAgVFhbq5z//uf7yl79o7969euaZZ5SSkqIxY8Zc47IBAICbQdAfje3cuVPDhg0zn7eEk7y8PL322mvas2ePVq5cqRMnTiglJUXZ2dmaO3duwEdOb731lgoKCjRixAhFRkYqNzdXr776qjkeHx+vyspK5efnKyMjQ127dlVJSUnA7xr6r//6L61atUozZszQz372M91xxx169913dc8995g106dPV0NDgyZNmqQTJ07o/vvvV0VFhWJjY4N92wAA4CYUdBAaOnSoDOObb/neuHHjFY/RuXNnrVq16rI1/fv319/+9rfL1jz++ON6/PHHv3E8IiJCc+bM0Zw5c644JwAAYD38rTEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGB
ZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZQQehLVu26OGHH1ZKSooiIiL07rvvBowbhqGSkhJ169ZN7dq1U1ZWlj755JOAmq+//lrjxo2T3W5XQkKCJkyYoNOnTwfU7NmzRw888IBiY2OVmpqq+fPnt5rL2rVr1adPH8XGxqpfv37asGFD0HMBAADWFXQQamho0IABA1RaWnrJ8fnz5+vVV19VWVmZtm/frvbt28vpdOrcuXNmzbhx47R//365XC6tW7dOW7Zs0aRJk8xxr9er7Oxs9ezZUzU1NXrllVc0a9Ys/fa3vzVrtm7dqieffFITJkzQ7t27NWbMGI0ZM0b79u0Lai4AAMC6bgl2h9GjR2v06NGXHDMMQ4sXL9aMGTP0yCOPSJJ+//vfKykpSe+++66eeOIJffzxx6qoqNBHH32kwYMHS5J+/etf66GHHtKvfvUrpaSk6K233lJjY6OWL1+umJgY3X333aqtrdXChQvNwLRkyRKNGjVK06ZNkyTNnTtXLpdLS5cuVVlZWZvmAgAArC2k1wgdPnxYHo9HWVlZ5rb4+HhlZmbK7XZLktxutxISEswQJElZWVmKjIzU9u3bzZoHH3xQMTExZo3T6dShQ4d0/Phxs+bC12mpaXmdtswFAABYW9BnhC7H4/FIkpKSkgK2JyUlmWMej0eJiYmBk7jlFnXu3DmgJi0trdUxWsY6deokj8dzxde50lwu5vP55PP5zOder1eS5Pf75ff7L/fWg9ZyPFukcU374+q1rCFrGT70IPzoQfjRg9ALZi1DGoS+6+bNm6fZs2e32l5ZWam4uLjr8ppzBzdf1X4XXxiOq+dyucI9BcujB+FHD8KPHoTOmTNn2lwb0iCUnJwsSaqvr1e3bt3M7fX19Ro4cKBZc+zYsYD9zp8/r6+//trcPzk5WfX19QE1Lc+vVHPh+JXmcrHi4mIVFRWZz71er1JTU5WdnS273X7lBQiC3++Xy+XSSzsj5WuOCHr/fbOcIZ2PFbX0YOTIkYqOjg73dCyJHoQfPQg/ehB6LZ/otEVIg1BaWpqSk5NVVVVlhg2v16vt27drypQpkiSHw6ETJ06opqZGGRkZkqRNmzapublZmZmZZs3/+3//T36/3/yicLlcuuuuu9SpUyezpqqqSoWFhebru1wuORyONs/lYjabTTabrdX26Ojo6/bF6WuOkK8p+CDEN0voXM/+om3oQfjRg/CjB6ETzDoGfbH06dOnVVtbq9raWkn/uSi5trZWdXV1ioiIUGFhoX7+85/rL3/5i/bu3atnnnlGKSkpGjNmjCSpb9++GjVqlCZOnKgdO3bo73//uwoKCvTEE08oJSVFkvTUU08pJiZGEyZM0P79+7V69WotWbIk4GzNj370I1VUVGjBggU6ePCgZs2apZ07d6qgoECS2jQXAABgbUGfEdq5c6eGDRtmPm8JJ3l5eSovL9f06dPV0NCgSZMm6cSJE7r//vtVUVGh2NhYc5+33npLBQUFGjFihCIjI5Wbm6tXX33VHI+Pj1dlZaXy8/OVkZGhrl27qqSkJOB3Df3Xf/2XVq1apRkzZuhnP/uZ7rjjDr377ru65557zJq2zAUAAFhX0EFo6NChMoxvvtMpIiJCc+bM0Zw5c76xpnPnzlq1atVlX6d///7629/+dtmaxx9/XI8//vg1zQUAAFgXf2sMAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYVtB/awzfDr1+uv6q9/385ZwQzgQAgO8uzggBAADLIggBAADLIggBAADLIggBAADLIggBAAD
LIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLCnkQmjVrliIiIgIeffr0McfPnTun/Px8denSRR06dFBubq7q6+sDjlFXV6ecnBzFxcUpMTFR06ZN0/nz5wNqNm/erEGDBslms6l3794qLy9vNZfS0lL16tVLsbGxyszM1I4dO0L9dgEAwHfYdTkjdPfdd+vo0aPm48MPPzTHpk6dqvfee09r165VdXW1jhw5okcffdQcb2pqUk5OjhobG7V161atXLlS5eXlKikpMWsOHz6snJwcDRs2TLW1tSosLNTzzz+vjRs3mjWrV69WUVGRZs6cqV27dmnAgAFyOp06duzY9XjLAADgO+i6BKFbbrlFycnJ5qNr166SpJMnT+qNN97QwoULNXz4cGVkZGjFihXaunWrtm3bJkmqrKzUgQMH9Oabb2rgwIEaPXq05s6dq9LSUjU2NkqSysrKlJaWpgULFqhv374qKCjQY489pkWLFplzWLhwoSZOnKjx48crPT1dZWVliouL0/Lly6/HWwYAAN9Bt1yPg37yySdKSUlRbGysHA6H5s2bpx49eqimpkZ+v19ZWVlmbZ8+fdSjRw+53W4NGTJEbrdb/fr1U1JSklnjdDo1ZcoU7d+/X/fee6/cbnfAMVpqCgsLJUmNjY2qqalRcXGxOR4ZGamsrCy53e5vnLfP55PP5zOfe71eSZLf75ff77+mNblYy/FskUZIjxvMa1tdyzqwHuFDD8KPHoQfPQi9YNYy5EEoMzNT5eXluuuuu3T06FHNnj1bDzzwgPbt2yePx6OYmBglJCQE7JOUlCSPxyNJ8ng8ASGoZbxl7HI1Xq9XZ8+e1fHjx9XU1HTJmoMHD37j3OfNm6fZs2e32l5ZWam4uLi2LUCQ5g5uvi7HvZwNGzbc8Nf8NnO5XOGeguXRg/CjB+FHD0LnzJkzba4NeRAaPXq0+d/9+/dXZmamevbsqTVr1qhdu3ahfrmQKi4uVlFRkfnc6/UqNTVV2dnZstvtIX0tv98vl8ull3ZGytccEdJjX8m+Wc4b+nrfVi09GDlypKKjo8M9HUuiB+FHD8KPHoReyyc6bXFdPhq7UEJCgu688059+umnGjlypBobG3XixImAs0L19fVKTk6WJCUnJ7e6u6vlrrILay6+06y+vl52u13t2rVTVFSUoqKiLlnTcoxLsdlsstlsrbZHR0dfty9OX3OEfE03NgjxjRboevYXbUMPwo8ehB89CJ1g1vG6/x6h06dP67PPPlO3bt2UkZGh6OhoVVVVmeOHDh1SXV2dHA6HJMnhcGjv3r0Bd3e5XC7Z7Xalp6ebNRceo6Wm5RgxMTHKyMgIqGlublZVVZVZAwAAEPIg9JOf/ETV1dX6/PPPtXXrVv3whz9UVFSUnnzyScXHx2vChAkqKirSBx98oJqaGo0fP14Oh0NDhgyRJGVnZys9PV1PP/20/vGPf2jjxo2aMWOG8vPzzbM1kydP1j//+U9Nnz5dBw8e1LJly7RmzRpNnTrVnEdRUZF+97vfaeXKlfr44481ZcoUNTQ0aPz48aF+ywAA4Dsq5B+N/etf/9KTTz6pr776Srfeeqvuv/9+bdu2TbfeeqskadGiRYqMjFRubq58Pp+cTqeWLVtm7h8VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv+79mXs2LH68ssvVVJSIo/Ho4EDB6qioqLVBdQAAMC6Qh6E3n777cuOx8bGqrS0VKWlpd9Y07Nnzyve2TR06FDt3r37sjUFBQUqKCi4bA0AALAu/tYYAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLII
QAACwrFvCPQHceL1+uv6q9/385ZwQzgQAgPDijBAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsSwSh0tJS9erVS7GxscrMzNSOHTvCPSUAAPAtcEu4J3C9rV69WkVFRSorK1NmZqYWL14sp9OpQ4cOKTExMdzT+87p9dP1V73v5y/nhHAmAABcu5v+jNDChQs1ceJEjR8/Xunp6SorK1NcXJyWL18e7qkBAIAwu6nPCDU2NqqmpkbFxcXmtsjISGVlZcntdreq9/l88vl85vOTJ09Kkr7++mv5/f6Qzs3v9+vMmTO6xR+ppuaIkB7726r3T9aE5XW3F4+45PaWHnz11VeKjo6+wbOCRA++DehB+NGD0Dt16pQkyTCMK9be1EHo3//+t5qampSUlBSwPSkpSQcPHmxVP2/ePM2ePbvV9rS0tOs2R1x/XReEewYAgHA4deqU4uPjL1tzUwehYBUXF6uoqMh83tzcrK+//lpdunRRRERoz9p4vV6lpqbqiy++kN1uD+mx0Tb0IPzoQfjRg/CjB6FnGIZOnTqllJSUK9be1EGoa9euioqKUn19fcD2+vp6JScnt6q32Wyy2WwB2xISEq7nFGW32/nCDzN6EH70IPzoQfjRg9C60pmgFjf1xdIxMTHKyMhQVVWVua25uVlVVVVyOBxhnBkAAPg2uKnPCElSUVGR8vLyNHjwYN13331avHixGhoaNH78+HBPDQAAhNlNH4TGjh2rL7/8UiUlJfJ4PBo4cKAqKipaXUB9o9lsNs2cObPVR3G4cehB+NGD8KMH4UcPwivCaMu9ZQAAADehm/oaIQAAgMshCAEAAMsiCAEAAMsiCAEAAMsiCIVBaWmpevXqpdjYWGVmZmrHjh3hntJNY9asWYqIiAh49OnTxxw/d+6c8vPz1aVLF3Xo0EG5ubmtfuFmXV2dcnJyFBcXp8TERE2bNk3nz5+/0W/lO2PLli16+OGHlZKSooiICL377rsB44ZhqKSkRN26dVO7du2UlZWlTz75JKDm66+/1rhx42S325WQkKAJEybo9OnTATV79uzRAw88oNjYWKWmpmr+/PnX+619Z1ypB88++2yr74tRo0YF1NCDazNv3jx9//vfV8eOHZWYmKgxY8bo0KFDATWh+vmzefNmDRo0SDabTb1791Z5efn1fns3NYLQDbZ69WoVFRVp5syZ2rVrlwYMGCCn06ljx46Fe2o3jbvvvltHjx41Hx9++KE5NnXqVL333ntau3atqqurdeTIET366KPmeFNTk3JyctTY2KitW7dq5cqVKi8vV0lJSTjeyndCQ0ODBgwYoNLS0kuOz58/X6+++qrKysq0fft2tW/fXk6nU+fOnTNrxo0bp/3798vlcmndunXasmWLJk2aZI57vV5lZ2erZ8+eqqmp0SuvvKJZs2bpt7/97XV/f98FV+qBJI0aNSrg++IPf/hDwDg9uDbV1dXKz8/Xtm3b5HK55Pf7lZ2drYaGBrMmFD9/Dh8+rJycHA0bNky1tbUqLCzU888/r40bN97Q93tTMXBD3XfffUZ+fr75vKmpyUhJSTHmzZsXxlndPGbOnGkMGDDgkmMnTpwwoqOjjbVr15rbPv74Y0OS4Xa7DcMwjA0bNhiRkZGGx+Mxa1577TXDbrcbPp/vus79ZiDJeOedd8znzc3NRnJysvHKK6+Y206cOGHYbDbjD3/4g2EYhnHgwAFDkvHRRx+ZNX/961+NiIgI43//938NwzCMZcuWGZ06dQrowYsvvmjcdddd1/kdffdc3APDMIy8vDzjkUce+cZ96EHoHTt2zJBkVFdXG4YRup8/06dPN+6+++6A1xo7dqzhdDqv91u6aXFG6AZqbGxUTU2NsrKyzG2RkZHKysqS2+0O48xuLp988olSUlJ0222
3ady4caqrq5Mk1dTUyO/3B6x/nz591KNHD3P93W63+vXrF/ALN51Op7xer/bv339j38hN4PDhw/J4PAFrHh8fr8zMzIA1T0hI0ODBg82arKwsRUZGavv27WbNgw8+qJiYGLPG6XTq0KFDOn78+A16N99tmzdvVmJiou666y5NmTJFX331lTlGD0Lv5MmTkqTOnTtLCt3PH7fbHXCMlhr+Dbl6BKEb6N///reamppa/VbrpKQkeTyeMM3q5pKZmany8nJVVFTotdde0+HDh/XAAw/o1KlT8ng8iomJafWHdC9cf4/Hc8n+tIwhOC1rdrmveY/Ho8TExIDxW265RZ07d6YvITJq1Cj9/ve/V1VVlX75y1+qurpao0ePVlNTkyR6EGrNzc0qLCzUD37wA91zzz2SFLKfP99U4/V6dfbs2evxdm56N/2f2IC1jB492vzv/v37KzMzUz179tSaNWvUrl27MM4MCJ8nnnjC/O9+/fqpf//+uv3227V582aNGDEijDO7OeXn52vfvn0B1yfi24szQjdQ165dFRUV1eougfr6eiUnJ4dpVje3hIQE3Xnnnfr000+VnJysxsZGnThxIqDmwvVPTk6+ZH9axhCcljW73Nd8cnJyq5sFzp8/r6+//pq+XCe33Xabunbtqk8//VQSPQilgoICrVu3Th988IG6d+9ubg/Vz59vqrHb7fyfvatEELqBYmJilJGRoaqqKnNbc3Ozqqqq5HA4wjizm9fp06f12WefqVu3bsrIyFB0dHTA+h86dEh1dXXm+jscDu3duzfgHwWXyyW73a709PQbPv/vurS0NCUnJwesudfr1fbt2wPW/MSJE6qpqTFrNm3apObmZmVmZpo1W7Zskd/vN2tcLpfuuusuderU6Qa9m5vHv/71L3311Vfq1q2bJHoQCoZhqKCgQO+88442bdqktLS0gPFQ/fxxOBwBx2ip4d+QaxDuq7Wt5u233zZsNptRXl5uHDhwwJg0aZKRkJAQcJcArt6Pf/xjY/Pmzcbhw4eNv//970ZWVpbRtWtX49ixY4ZhGMbkyZONHj16GJs2bTJ27txpOBwOw+FwmPufP3/euOeee4zs7GyjtrbWqKioMG699VajuLg4XG/pW+/UqVPG7t27jd27dxuSjIULFxq7d+82/ud//scwDMN4+eWXjYSEBOPPf/6zsWfPHuORRx4x0tLSjLNnz5rHGDVqlHHvvfca27dvNz788EPjjjvuMJ588klz/MSJE0ZSUpLx9NNPG/v27TPefvttIy4uzvjNb35zw9/vt9HlenDq1CnjJz/5ieF2u43Dhw8b77//vjFo0CDjjjvuMM6dO2cegx5cmylTphjx8fHG5s2bjaNHj5qPM2fOmDWh+Pnzz3/+04iLizOmTZtmfPzxx0ZpaakRFRVlVFRU3ND3ezMhCIXBr3/9a6NHjx5GTEyMcd999xnbtm0L95RuGmPHjjW6detmxMTEGN/73veMsWPHGp9++qk5fvbsWeO///u/jU6dOhlxcXHGD3/4Q+Po0aMBx/j888+N0aNHG+3atTO6du1q/PjHPzb8fv+NfivfGR988IEhqdUjLy/PMIz/3EL/0ksvGUlJSYbNZjNGjBhhHDp0KOAYX331lfHkk08aHTp0MOx2uzF+/Hjj1KlTATX/+Mc/jPvvv9+w2WzG9773PePll1++UW/xW+9yPThz5oyRnZ1t3HrrrUZ0dLTRs2dPY+LEia3+zxc9uDaXWn9JxooVK8yaUP38+eCDD4yBAwcaMTExxm233RbwGghehGEYxo0+CwUAAPBtwDVCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsv4/2I1qxbjTQ/MAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"markdown","source":"### 
Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T11:51:08.104083Z","iopub.execute_input":"2023-05-12T11:51:08.104505Z","iopub.status.idle":"2023-05-12T11:51:08.889863Z","shell.execute_reply.started":"2023-05-12T11:51:08.104477Z","shell.execute_reply":"2023-05-12T11:51:08.888809Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stderr","text":"Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.6kB/s]\nDownloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 383kB/s]\nDownloading (…)solve/main/vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 14.0MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 1.72M/1.72M [00:00<00:00, 90.1MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n return tokenizer(text_data.comment_text.values.tolist(), \n max_length=max_len, \n truncation=True, \n padding=\"max_length\",\n add_special_tokens=True,\n return_tensors=\"tf\",\n return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:08.890965Z","iopub.execute_input":"2023-05-12T11:51:08.891271Z","iopub.status.idle":"2023-05-12T11:51:08.896368Z","shell.execute_reply.started":"2023-05-12T11:51:08.891227Z","shell.execute_reply":"2023-05-12T11:51:08.895589Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = 
test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:08.897392Z","iopub.execute_input":"2023-05-12T11:51:08.897682Z","iopub.status.idle":"2023-05-12T11:51:49.986967Z","shell.execute_reply.started":"2023-05-12T11:51:08.897657Z","shell.execute_reply":"2023-05-12T11:51:49.985562Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), train[\"toxic\"]))\n .repeat()\n .shuffle(Config.BUFFER_SIZE)\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:49.988465Z","iopub.execute_input":"2023-05-12T11:51:49.988785Z","iopub.status.idle":"2023-05-12T11:51:50.023604Z","shell.execute_reply.started":"2023-05-12T11:51:49.988758Z","shell.execute_reply":"2023-05-12T11:51:50.022512Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformers_layers, max_len=Config.MAX_LEN):\n input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n bert_layers = transformers_layers.bert(input_ids, attention_mask=masks)[1]\n intermediate = Dense(1024, activation='relu')(bert_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return 
model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:50.024852Z","iopub.execute_input":"2023-05-12T11:51:50.025144Z","iopub.status.idle":"2023-05-12T11:51:50.033876Z","shell.execute_reply.started":"2023-05-12T11:51:50.025120Z","shell.execute_reply":"2023-05-12T11:51:50.032937Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformers_layers = TFAutoModel.from_pretrained(Config.MODEL)\n model = model_builder(transformers_layers=transformers_layers)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:50.034973Z","iopub.execute_input":"2023-05-12T11:51:50.035277Z","iopub.status.idle":"2023-05-12T11:52:52.408470Z","shell.execute_reply.started":"2023-05-12T11:51:50.035228Z","shell.execute_reply":"2023-05-12T11:52:52.407200Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stderr","text":"Downloading tf_model.h5: 100%|██████████| 999M/999M [00:17<00:00, 58.2MB/s] \nSome layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']\n- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nAll the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:52.409793Z","iopub.execute_input":"2023-05-12T11:52:52.410082Z","iopub.status.idle":"2023-05-12T11:52:52.449580Z","shell.execute_reply.started":"2023-05-12T11:52:52.410057Z","shell.execute_reply":"2023-05-12T11:52:52.448645Z"},"trusted":true},"execution_count":22,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n bert (TFBertMainLayer) TFBaseModelOutputWi 167356416 ['input_ids[0][0]', \n thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 768), \n pooler_output=(Non \n e, 768), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n dense (Dense) (None, 1024) 787456 ['bert[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 ['dense[0][0]'] \n \n==================================================================================================\nTotal params: 168,144,897\nTrainable params: 168,144,897\nNon-trainable params: 
0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:52.450624Z","iopub.execute_input":"2023-05-12T11:52:52.450887Z","iopub.status.idle":"2023-05-12T12:22:32.594613Z","shell.execute_reply.started":"2023-05-12T11:52:52.450864Z","shell.execute_reply":"2023-05-12T12:22:32.593393Z"},"trusted":true},"execution_count":23,"outputs":[{"name":"stdout","text":"Epoch 1/3\n","output_type":"stream"},{"name":"stderr","text":"2023-05-12 11:53:33.890629: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.\n2023-05-12 11:53:34.988485: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"2998/4185 [====================>.........] 
- ETA: 2:36 - loss: 0.0710 - auc: 0.9958","output_type":"stream"},{"text":"IOPub message rate exceeded.\nThe notebook server will temporarily stop sending output\nto the client in order to avoid crashing it.\nTo change this limit, set the config variable\n`--NotebookApp.iopub_msg_rate_limit`.\n\nCurrent values:\nNotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\nNotebookApp.rate_limit_window=3.0 (secs)\n\n","name":"stderr","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0512 - auc: 0.9971","output_type":"stream"},{"name":"stderr","text":"2023-05-12 12:03:47.008155: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 12:03:47.287188: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 669s 137ms/step - loss: 0.0512 - auc: 0.9971 - val_loss: 0.3909 - val_auc: 0.8110\nEpoch 2/3\n4072/4185 [============================>.] 
- ETA: 14s - loss: 0.0427 - auc: 0.9980","output_type":"stream"},{"text":"IOPub message rate exceeded.\nThe notebook server will temporarily stop sending output\nto the client in order to avoid crashing it.\nTo change this limit, set the config variable\n`--NotebookApp.iopub_msg_rate_limit`.\n\nCurrent values:\nNotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\nNotebookApp.rate_limit_window=3.0 (secs)\n\n","name":"stderr","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 555s 133ms/step - loss: 0.0358 - auc: 0.9986 - val_loss: 0.3950 - val_auc: 0.8189\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:24:25.248856Z","iopub.execute_input":"2023-05-12T12:24:25.249821Z","iopub.status.idle":"2023-05-12T12:25:30.559794Z","shell.execute_reply.started":"2023-05-12T12:24:25.249786Z","shell.execute_reply":"2023-05-12T12:25:30.558520Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 8s 131ms/step - loss: 0.2848 - auc: 0.8819\nEpoch 2/2\n62/62 [==============================] - 56s 132ms/step - loss: 0.1757 - auc: 0.9617\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = 
preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:26:42.649251Z","iopub.execute_input":"2023-05-12T12:26:42.649715Z","iopub.status.idle":"2023-05-12T12:27:19.803541Z","shell.execute_reply.started":"2023-05-12T12:26:42.649683Z","shell.execute_reply":"2023-05-12T12:27:19.802272Z"},"trusted":true},"execution_count":25,"outputs":[{"name":"stderr","text":"2023-05-12 12:26:48.382290: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n2023-05-12 12:26:48.652698: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 37s 49ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"mbert-fine-tuned-1-pooler\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:49:24.580208Z","iopub.execute_input":"2023-05-12T09:49:24.580625Z","iopub.status.idle":"2023-05-12T09:50:44.681561Z","shell.execute_reply.started":"2023-05-12T09:49:24.580595Z","shell.execute_reply":"2023-05-12T09:50:44.680112Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). 
These functions will not be directly callable after loading.\n","output_type":"stream"},{"name":"stdout","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"},{"name":"stderr","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"}]},{"cell_type":"code","source":"import shutil\nshutil.make_archive(\"roberta-fine-tuned-2\",\"zip\",'/kaggle/working/roberta-fine-tuned-2')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:53:15.505782Z","iopub.execute_input":"2023-05-12T09:53:15.506262Z","iopub.status.idle":"2023-05-12T10:00:10.288432Z","shell.execute_reply.started":"2023-05-12T09:53:15.506226Z","shell.execute_reply":"2023-05-12T10:00:10.287215Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/roberta-fine-tuned-2.zip'"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2-best\", save_format='h5')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:06:24.426264Z","iopub.execute_input":"2023-05-12T10:06:24.426727Z","iopub.status.idle":"2023-05-12T10:06:40.506795Z","shell.execute_reply.started":"2023-05-12T10:06:24.426692Z","shell.execute_reply":"2023-05-12T10:06:40.505341Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"model = tf.keras.models.load_model('/kaggle/working/roberta-fine-tuned-2-best')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:07:36.737706Z","iopub.execute_input":"2023-05-12T10:07:36.738837Z","iopub.status.idle":"2023-05-12T10:07:59.902966Z","shell.execute_reply.started":"2023-05-12T10:07:36.738795Z","shell.execute_reply":"2023-05-12T10:07:59.901400Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"!huggingface-cli login --token 
$HF_TOKEN","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:12:13.025974Z","iopub.execute_input":"2023-05-12T10:12:13.026917Z","iopub.status.idle":"2023-05-12T10:12:15.351277Z","shell.execute_reply.started":"2023-05-12T10:12:13.026877Z","shell.execute_reply":"2023-05-12T10:12:15.349659Z"},"trusted":true},"execution_count":55,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nToken will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\nToken is valid.\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta-best')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import HfApi\napi = HfApi()\napi.upload_folder(\n folder_path=\"/kaggle/working/\",\n repo_id=\"shivansh-ka/Toxic-Comment-Classifier-Multi\",\n repo_type=\"space\",\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = 
from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T06:59:23.928089Z","iopub.execute_input":"2023-05-12T06:59:23.928495Z","iopub.status.idle":"2023-05-12T06:59:56.375479Z","shell.execute_reply.started":"2023-05-12T06:59:23.928466Z","shell.execute_reply":"2023-05-12T06:59:56.374295Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\nconfig.json not found in HuggingFace Hub.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"84f3f3229b3e42668708162e27df3168"}},"metadata":{}}]},{"cell_type":"code","source":"preds = m.predict(test_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:06:50.246933Z","iopub.execute_input":"2023-05-12T07:06:50.247789Z","iopub.status.idle":"2023-05-12T07:29:11.940923Z","shell.execute_reply.started":"2023-05-12T07:06:50.247752Z","shell.execute_reply":"2023-05-12T07:29:11.939745Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"499/499 [==============================] - 1341s 3s/step\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}