Spaces:

shivansh-ka
/

Toxic-Comment-Classifier

Sleeping

App Files Files Community

shivansh-ka committed on May 12, 2023

Commit

0734f8e

•

1 Parent(s): 3e80ddd

experiment notebook added

Browse files

Files changed (10) hide show

eda/eda.ipynb +0 -0
experiment_notebooks/Experiment 1.ipynb +334 -0
experiment_notebooks/Experiment 2.ipynb +398 -0
experiment_notebooks/Experiment 3.ipynb +399 -0
experiment_notebooks/Experiment 4.ipynb +1475 -0
experiment_notebooks/Experiment 5.ipynb +1003 -0
experiment_notebooks/Transformer-Roberta-Hidden-state.ipynb.ipynb +1 -0
experiment_notebooks/Transformer-Roberta-Pooler-state.ipynb +1 -0
experiment_notebooks/Transformer-mBert-Hidden-state.ipynb.ipynb +1 -0
experiment_notebooks/Transformer-mBert-Pooler-state.ipynb.ipynb +1 -0

eda/eda.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

experiment_notebooks/Experiment 1.ipynb ADDED Viewed

	@@ -0,0 +1,334 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c30c254",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import tensorflow as tf\n",
+    "#import tensorflow_gpu\n",
+    "import urllib\n",
+    "from tensorflow.keras.layers import TextVectorization\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
+    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy\n",
+    "from sklearn.metrics import roc_auc_score, f1_score\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import re\n",
+    "import string\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('wordnet2022')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2487874b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tf_tpu_or_gpu(device: str='gpu'):\n",
+    "    \"\"\"Configure the requested accelerator and return a tf.distribute strategy.\n",
+    "\n",
+    "    The strategy is also bound to the notebook-level name `tpu_strategy`:\n",
+    "    the model cell enters `tpu_strategy.scope()`, but that name used to be a\n",
+    "    local of this function and was lost on return (NameError downstream).\n",
+    "\n",
+    "    Args:\n",
+    "        device: 'gpu' or 'tpu' (case-insensitive).\n",
+    "\n",
+    "    Returns:\n",
+    "        The TPUStrategy for 'tpu'; TensorFlow's current default strategy for 'gpu'.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if `device` is neither 'gpu' nor 'tpu'.\n",
+    "    \"\"\"\n",
+    "    global tpu_strategy\n",
+    "    device = device.lower()\n",
+    "\n",
+    "    if device == 'gpu':\n",
+    "        print(\"Setting up GPU.....\")\n",
+    "        device_name = tf.test.gpu_device_name()\n",
+    "        if \"GPU\" in device_name:\n",
+    "            print('Found GPU at: {}'.format(device_name))\n",
+    "        else:\n",
+    "            # Previously this fell through and also printed \"Found GPU at: \".\n",
+    "            print(\"GPU device not found\")\n",
+    "        config = tf.compat.v1.ConfigProto()\n",
+    "        config.gpu_options.allow_growth = True\n",
+    "        sess = tf.compat.v1.Session(config=config)\n",
+    "        tf.compat.v1.keras.backend.set_session(sess)\n",
+    "        print(config)\n",
+    "        # Default strategy, so `with tpu_strategy.scope():` also works without a TPU.\n",
+    "        tpu_strategy = tf.distribute.get_strategy()\n",
+    "\n",
+    "    elif device == 'tpu':\n",
+    "        print(\"Setting up TPU.....\")\n",
+    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "        print('Running on TPU ', tpu.master())\n",
+    "        tf.config.experimental_connect_to_cluster(tpu)\n",
+    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
+    "\n",
+    "    else:\n",
+    "        # ValueError is still an Exception, so existing handlers keep working.\n",
+    "        raise ValueError(\"Wrong device parameter passed: {!r}\".format(device))\n",
+    "\n",
+    "    return tpu_strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fb1df02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_tpu_or_gpu(device='tpu')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3377596d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Config:\n",
+    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
+    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
+    "    VOCAB_SIZE = 200000\n",
+    "    OUTPUT_DIM = 1800\n",
+    "    BUFFER_SIZE = 160000\n",
+    "    BATCH_SIZE = 16*8\n",
+    "    EPOCHS = 10\n",
+    "    BASE_LOG_DIR = \"log_dir\"\n",
+    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ca4db64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded\n",
+    "\n",
+    "# Download once, then read back the file that was just written. The read used a\n",
+    "# hard-coded /kaggle/working path, which only exists on Kaggle.\n",
+    "if not os.path.exists(Config.FILE_NAME):\n",
+    "    urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
+    "data = pd.read_csv(Config.FILE_NAME)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f687273",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = data['comment_text']\n",
+    "y = data[data.columns[2:]].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "403cbd7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e012a53e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6db618c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
+    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
+    "                               output_mode='int')\n",
+    "vectorizer.adapt(X.values)\n",
+    "vectorized_text = vectorizer(X.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5b25ecc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shuffle ONCE with a fixed order. With the default reshuffle_each_iteration=True\n",
+    "# the order is re-drawn on every pass, so a take()/skip() split of this dataset\n",
+    "# mixes training and validation rows between epochs, and two separate passes\n",
+    "# over a split (one for inputs, one for labels) come back misaligned.\n",
+    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
+    "dataset = dataset.cache()\n",
+    "dataset = dataset.shuffle(Config.BUFFER_SIZE, seed=42, reshuffle_each_iteration=False)\n",
+    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a60be072",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = dataset.take(int(len(dataset)*0.8))\n",
+    "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
+    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d4c3d18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def callbacks(base_dir=\".\"):\n",
+    "    \"\"\"Build the callbacks passed to `model.fit`.\n",
+    "\n",
+    "    Args:\n",
+    "        base_dir: Root directory under which `Config.CHECKPOINT_DIR` is created.\n",
+    "            It was previously accepted but ignored; the default \".\" keeps the\n",
+    "            checkpoint in the same place as before.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with an EarlyStopping callback (val_loss, patience 2) and a\n",
+    "        ModelCheckpoint callback created with save_best_only=True.\n",
+    "    \"\"\"\n",
+    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
+    "\n",
+    "    ckpt_dir = os.path.join(base_dir, Config.CHECKPOINT_DIR)\n",
+    "    # Create only the parent directory; the checkpoint path itself is written by Keras.\n",
+    "    os.makedirs(ckpt_dir, exist_ok=True)\n",
+    "    ckpt_file = os.path.join(ckpt_dir, \"model\")\n",
+    "\n",
+    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+    "      filepath = ckpt_file,\n",
+    "      save_best_only = True)\n",
+    "\n",
+    "    return [early_stopping, ckpt_cb]\n",
+    "\n",
+    "callbacks_list = callbacks()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cf70d04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_model():\n",
+    "    \"\"\"Build the uncompiled baseline classifier.\n",
+    "\n",
+    "    Embedding -> bidirectional LSTM(64) -> three ReLU dense layers -> a\n",
+    "    6-unit sigmoid output layer.\n",
+    "\n",
+    "    Returns:\n",
+    "        An uncompiled `Sequential` model.\n",
+    "    \"\"\"\n",
+    "    LAYERS = [\n",
+    "              # A doubled comma after this entry made the whole cell a SyntaxError.\n",
+    "              Embedding(Config.VOCAB_SIZE+1, 32),\n",
+    "              Bidirectional(LSTM(64, activation='tanh')),\n",
+    "              Dense(128, activation='relu'),\n",
+    "              Dense(256, activation='relu'),\n",
+    "              Dense(128, activation='relu'),\n",
+    "              Dense(6, activation='sigmoid')]\n",
+    "\n",
+    "    model = Sequential(LAYERS)\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26a56966",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with tpu_strategy.scope():\n",
+    "    model = create_model()\n",
+    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
+    "                  loss=tf.keras.losses.binary_crossentropy,\n",
+    "                  # `AUC` was never imported; reach it through `tf` and pass a list.\n",
+    "                  metrics=[tf.keras.metrics.AUC(multi_label=True, num_labels=6)])\n",
+    "    # The model has no Input layer, so build it explicitly before summary().\n",
+    "    model.build(input_shape=(None, Config.OUTPUT_DIM))\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "891727f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history = model.fit(train, \n",
+    "                    epochs=Config.EPOCHS,\n",
+    "                    steps_per_epoch=len(train),\n",
+    "                    validation_data=val,\n",
+    "                    callbacks=callbacks_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "533cd762",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
+    "    \"\"\"Score `model` on `pred_data` against the multi-label targets `y_true`.\n",
+    "\n",
+    "    Precision, recall and F1 use predictions thresholded at 0.5; ROC-AUC uses\n",
+    "    the raw scores. All four metrics are macro-averaged.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Object exposing `predict(pred_data)`.\n",
+    "        pred_data: Model inputs, passed to `model.predict` unchanged.\n",
+    "        y_true: Ground-truth label matrix aligned with `pred_data`.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple `(precision, recall, f1, auc)`.\n",
+    "    \"\"\"\n",
+    "    # Used below but never imported by the notebook's import cell.\n",
+    "    from sklearn.metrics import precision_score, recall_score\n",
+    "\n",
+    "    y_pred = model.predict(pred_data)\n",
+    "    y_pred_labels = (y_pred>0.5).astype(int)\n",
+    "\n",
+    "    # No try/except here: the old handler printed the error and then crashed\n",
+    "    # with UnboundLocalError on the metric names below, hiding the real cause.\n",
+    "    precision = precision_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    recall = recall_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    f1 = f1_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
+    "\n",
+    "    print(f\"Precision: {precision}\\n\"\n",
+    "          f\"Recall: {recall}\\n\"\n",
+    "          f\"F1-Score: {f1}\\n\"\n",
+    "          f\"ROC-AUC-Score: {auc}\")\n",
+    "    return (precision, recall, f1, auc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2f19754",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(\"baseline_model_1.h5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "314be9bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Walk the split once so inputs and labels come from the same pass; two\n",
+    "# separate passes over a shuffled dataset can return rows in different orders.\n",
+    "train_batches = list(train)\n",
+    "x_train = np.concatenate([features for features, _ in train_batches])\n",
+    "y_train = np.concatenate([labels for _, labels in train_batches])\n",
+    "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec45f5ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Walk the split once so inputs and labels come from the same pass; two\n",
+    "# separate passes over a shuffled dataset can return rows in different orders.\n",
+    "val_batches = list(val)\n",
+    "x_val = np.concatenate([features for features, _ in val_batches])\n",
+    "y_val = np.concatenate([labels for _, labels in val_batches])\n",
+    "# Stored under its own name; this used to overwrite `result_train`.\n",
+    "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

experiment_notebooks/Experiment 2.ipynb ADDED Viewed

	@@ -0,0 +1,398 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c30c254",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import tensorflow as tf\n",
+    "#import tensorflow_gpu\n",
+    "import urllib\n",
+    "from tensorflow.keras.layers import TextVectorization\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
+    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy\n",
+    "from sklearn.metrics import roc_auc_score, f1_score\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import re\n",
+    "import string\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('wordnet2022')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2487874b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tf_tpu_or_gpu(device: str='gpu'):\n",
+    "    \"\"\"Configure the requested accelerator and return a tf.distribute strategy.\n",
+    "\n",
+    "    The strategy is also bound to the notebook-level name `tpu_strategy`:\n",
+    "    the model cell enters `tpu_strategy.scope()`, but that name used to be a\n",
+    "    local of this function and was lost on return (NameError downstream).\n",
+    "\n",
+    "    Args:\n",
+    "        device: 'gpu' or 'tpu' (case-insensitive).\n",
+    "\n",
+    "    Returns:\n",
+    "        The TPUStrategy for 'tpu'; TensorFlow's current default strategy for 'gpu'.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if `device` is neither 'gpu' nor 'tpu'.\n",
+    "    \"\"\"\n",
+    "    global tpu_strategy\n",
+    "    device = device.lower()\n",
+    "\n",
+    "    if device == 'gpu':\n",
+    "        print(\"Setting up GPU.....\")\n",
+    "        device_name = tf.test.gpu_device_name()\n",
+    "        if \"GPU\" in device_name:\n",
+    "            print('Found GPU at: {}'.format(device_name))\n",
+    "        else:\n",
+    "            # Previously this fell through and also printed \"Found GPU at: \".\n",
+    "            print(\"GPU device not found\")\n",
+    "        config = tf.compat.v1.ConfigProto()\n",
+    "        config.gpu_options.allow_growth = True\n",
+    "        sess = tf.compat.v1.Session(config=config)\n",
+    "        tf.compat.v1.keras.backend.set_session(sess)\n",
+    "        print(config)\n",
+    "        # Default strategy, so `with tpu_strategy.scope():` also works without a TPU.\n",
+    "        tpu_strategy = tf.distribute.get_strategy()\n",
+    "\n",
+    "    elif device == 'tpu':\n",
+    "        print(\"Setting up TPU.....\")\n",
+    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "        print('Running on TPU ', tpu.master())\n",
+    "        tf.config.experimental_connect_to_cluster(tpu)\n",
+    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
+    "\n",
+    "    else:\n",
+    "        # ValueError is still an Exception, so existing handlers keep working.\n",
+    "        raise ValueError(\"Wrong device parameter passed: {!r}\".format(device))\n",
+    "\n",
+    "    return tpu_strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fb1df02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_tpu_or_gpu(device='tpu')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3377596d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Config:\n",
+    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
+    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
+    "    VOCAB_SIZE = 200000\n",
+    "    OUTPUT_DIM = 1800\n",
+    "    BUFFER_SIZE = 160000\n",
+    "    BATCH_SIZE = 16*8\n",
+    "    EPOCHS = 10\n",
+    "    BASE_LOG_DIR = \"log_dir\"\n",
+    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ca4db64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded\n",
+    "\n",
+    "# Download once, then read back the file that was just written. The read used a\n",
+    "# hard-coded /kaggle/working path, which only exists on Kaggle.\n",
+    "if not os.path.exists(Config.FILE_NAME):\n",
+    "    urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
+    "data = pd.read_csv(Config.FILE_NAME)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f687273",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = data['comment_text']\n",
+    "y = data[data.columns[2:]].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "403cbd7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e012a53e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d383e72a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Text_Cleaner:\n",
+    "    \"\"\"Normalise a pandas Series of raw comment strings.\n",
+    "\n",
+    "    `clean_text` applies, in order: newline removal + lowercasing,\n",
+    "    punctuation stripping, English stop-word removal and WordNet\n",
+    "    lemmatisation, followed by a final whitespace trim.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Translation table that deletes every character in string.punctuation.\n",
+    "    # Built once instead of once per row.\n",
+    "    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        \"\"\"Store the Series to clean and load the NLTK stop words and lemmatizer.\"\"\"\n",
+    "        self.data = data\n",
+    "        base_words = stopwords.words('english')\n",
+    "        # A set gives O(1) membership tests. Punctuation is removed before the\n",
+    "        # stop-word filter runs, so the punctuation-free spelling of each stop\n",
+    "        # word is included too; otherwise any entry containing an apostrophe\n",
+    "        # could never match the already-stripped text.\n",
+    "        self.STOPWORDS = set(base_words) | {w.translate(self._PUNCT_TABLE) for w in base_words}\n",
+    "        self.wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "    def new_line_code(self, x:str)->str:\n",
+    "        \"\"\"Replace newlines with spaces, trim the ends and lowercase.\"\"\"\n",
+    "        return x.replace(\"\\n\", ' ').strip().lower()\n",
+    "\n",
+    "    def remove_punctuations(self, x:str)->str:\n",
+    "        \"\"\"Delete every character listed in string.punctuation.\"\"\"\n",
+    "        return x.translate(self._PUNCT_TABLE)\n",
+    "\n",
+    "    def remove_stopwords(self, x:str)->str:\n",
+    "        \"\"\"Drop whitespace-separated tokens found in the stop-word set.\"\"\"\n",
+    "        return ' '.join(word for word in x.split() if word not in self.STOPWORDS)\n",
+    "\n",
+    "    def lemmatization(self, x:str)->str:\n",
+    "        \"\"\"Lemmatize each whitespace-separated token with WordNet.\"\"\"\n",
+    "        return ' '.join(self.wordnet.lemmatize(word) for word in x.split())\n",
+    "\n",
+    "    def clean_text(self):\n",
+    "        \"\"\"Run the full pipeline over `self.data`; store and return the result.\"\"\"\n",
+    "        steps = (self.new_line_code,\n",
+    "                 self.remove_punctuations,\n",
+    "                 self.remove_stopwords,\n",
+    "                 self.lemmatization)\n",
+    "        cleaned = self.data\n",
+    "        for step in steps:\n",
+    "            cleaned = cleaned.apply(step)\n",
+    "        self.data = cleaned.apply(lambda x: x.strip())\n",
+    "        return self.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b121fd12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = Text_Cleaner(X).clean_text()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81c860cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5b374af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
+    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
+    "                               output_mode='int')\n",
+    "vectorizer.adapt(X.values)\n",
+    "# A stray trailing backtick on the next line made this cell a SyntaxError.\n",
+    "vectorized_text = vectorizer(X.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5b25ecc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shuffle ONCE with a fixed order. With the default reshuffle_each_iteration=True\n",
+    "# the order is re-drawn on every pass, so a take()/skip() split of this dataset\n",
+    "# mixes training and validation rows between epochs, and two separate passes\n",
+    "# over a split (one for inputs, one for labels) come back misaligned.\n",
+    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
+    "dataset = dataset.cache()\n",
+    "dataset = dataset.shuffle(Config.BUFFER_SIZE, seed=42, reshuffle_each_iteration=False)\n",
+    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a60be072",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = dataset.take(int(len(dataset)*0.8))\n",
+    "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
+    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d4c3d18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def callbacks(base_dir=\".\"):\n",
+    "    \"\"\"Build the callbacks passed to `model.fit`.\n",
+    "\n",
+    "    Args:\n",
+    "        base_dir: Root directory under which `Config.CHECKPOINT_DIR` is created.\n",
+    "            It was previously accepted but ignored; the default \".\" keeps the\n",
+    "            checkpoint in the same place as before.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with an EarlyStopping callback (val_loss, patience 2) and a\n",
+    "        ModelCheckpoint callback created with save_best_only=True.\n",
+    "    \"\"\"\n",
+    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
+    "\n",
+    "    ckpt_dir = os.path.join(base_dir, Config.CHECKPOINT_DIR)\n",
+    "    # Create only the parent directory; the checkpoint path itself is written by Keras.\n",
+    "    os.makedirs(ckpt_dir, exist_ok=True)\n",
+    "    ckpt_file = os.path.join(ckpt_dir, \"model\")\n",
+    "\n",
+    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+    "      filepath = ckpt_file,\n",
+    "      save_best_only = True)\n",
+    "\n",
+    "    return [early_stopping, ckpt_cb]\n",
+    "\n",
+    "callbacks_list = callbacks()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cf70d04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_model():\n",
+    "    \"\"\"Build the uncompiled baseline classifier.\n",
+    "\n",
+    "    Embedding -> bidirectional LSTM(64) -> three ReLU dense layers -> a\n",
+    "    6-unit sigmoid output layer.\n",
+    "\n",
+    "    Returns:\n",
+    "        An uncompiled `Sequential` model.\n",
+    "    \"\"\"\n",
+    "    LAYERS = [\n",
+    "              # A doubled comma after this entry made the whole cell a SyntaxError.\n",
+    "              Embedding(Config.VOCAB_SIZE+1, 32),\n",
+    "              Bidirectional(LSTM(64, activation='tanh')),\n",
+    "              Dense(128, activation='relu'),\n",
+    "              Dense(256, activation='relu'),\n",
+    "              Dense(128, activation='relu'),\n",
+    "              Dense(6, activation='sigmoid')]\n",
+    "\n",
+    "    model = Sequential(LAYERS)\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26a56966",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with tpu_strategy.scope():\n",
+    "    model = create_model()\n",
+    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
+    "                  loss=tf.keras.losses.binary_crossentropy,\n",
+    "                  # `AUC` was never imported; reach it through `tf` and pass a list.\n",
+    "                  metrics=[tf.keras.metrics.AUC(multi_label=True, num_labels=6)])\n",
+    "    # The model has no Input layer, so build it explicitly before summary().\n",
+    "    model.build(input_shape=(None, Config.OUTPUT_DIM))\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "891727f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history = model.fit(train, \n",
+    "                    epochs=Config.EPOCHS,\n",
+    "                    steps_per_epoch=len(train),\n",
+    "                    validation_data=val,\n",
+    "                    callbacks=callbacks_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "533cd762",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
+    "    \"\"\"Score `model` on `pred_data` against the multi-label targets `y_true`.\n",
+    "\n",
+    "    Precision, recall and F1 use predictions thresholded at 0.5; ROC-AUC uses\n",
+    "    the raw scores. All four metrics are macro-averaged.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Object exposing `predict(pred_data)`.\n",
+    "        pred_data: Model inputs, passed to `model.predict` unchanged.\n",
+    "        y_true: Ground-truth label matrix aligned with `pred_data`.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple `(precision, recall, f1, auc)`.\n",
+    "    \"\"\"\n",
+    "    # Used below but never imported by the notebook's import cell.\n",
+    "    from sklearn.metrics import precision_score, recall_score\n",
+    "\n",
+    "    y_pred = model.predict(pred_data)\n",
+    "    y_pred_labels = (y_pred>0.5).astype(int)\n",
+    "\n",
+    "    # No try/except here: the old handler printed the error and then crashed\n",
+    "    # with UnboundLocalError on the metric names below, hiding the real cause.\n",
+    "    precision = precision_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    recall = recall_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    f1 = f1_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
+    "\n",
+    "    print(f\"Precision: {precision}\\n\"\n",
+    "          f\"Recall: {recall}\\n\"\n",
+    "          f\"F1-Score: {f1}\\n\"\n",
+    "          f\"ROC-AUC-Score: {auc}\")\n",
+    "    return (precision, recall, f1, auc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2f19754",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(\"model_2.h5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "314be9bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Walk the split once so inputs and labels come from the same pass; two\n",
+    "# separate passes over a shuffled dataset can return rows in different orders.\n",
+    "train_batches = list(train)\n",
+    "x_train = np.concatenate([features for features, _ in train_batches])\n",
+    "y_train = np.concatenate([labels for _, labels in train_batches])\n",
+    "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec45f5ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Walk the split once so inputs and labels come from the same pass; two\n",
+    "# separate passes over a shuffled dataset can return rows in different orders.\n",
+    "val_batches = list(val)\n",
+    "x_val = np.concatenate([features for features, _ in val_batches])\n",
+    "y_val = np.concatenate([labels for _, labels in val_batches])\n",
+    "# Stored under its own name; this used to overwrite `result_train`.\n",
+    "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

experiment_notebooks/Experiment 3.ipynb ADDED Viewed

	@@ -0,0 +1,399 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c30c254",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import tensorflow as tf\n",
+    "#import tensorflow_gpu\n",
+    "import urllib\n",
+    "from tensorflow.keras.layers import TextVectorization\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
+    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy\n",
+    "from sklearn.metrics import roc_auc_score, f1_score\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import re\n",
+    "import string\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('wordnet2022')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2487874b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tf_tpu_or_gpu(device: str='gpu'):\n",
+    "    \"\"\"Configure the requested accelerator and return a tf.distribute strategy.\n",
+    "\n",
+    "    The strategy is also bound to the notebook-level name `tpu_strategy`:\n",
+    "    the model cell enters `tpu_strategy.scope()`, but that name used to be a\n",
+    "    local of this function and was lost on return (NameError downstream).\n",
+    "\n",
+    "    Args:\n",
+    "        device: 'gpu' or 'tpu' (case-insensitive).\n",
+    "\n",
+    "    Returns:\n",
+    "        The TPUStrategy for 'tpu'; TensorFlow's current default strategy for 'gpu'.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if `device` is neither 'gpu' nor 'tpu'.\n",
+    "    \"\"\"\n",
+    "    global tpu_strategy\n",
+    "    device = device.lower()\n",
+    "\n",
+    "    if device == 'gpu':\n",
+    "        print(\"Setting up GPU.....\")\n",
+    "        device_name = tf.test.gpu_device_name()\n",
+    "        if \"GPU\" in device_name:\n",
+    "            print('Found GPU at: {}'.format(device_name))\n",
+    "        else:\n",
+    "            # Previously this fell through and also printed \"Found GPU at: \".\n",
+    "            print(\"GPU device not found\")\n",
+    "        config = tf.compat.v1.ConfigProto()\n",
+    "        config.gpu_options.allow_growth = True\n",
+    "        sess = tf.compat.v1.Session(config=config)\n",
+    "        tf.compat.v1.keras.backend.set_session(sess)\n",
+    "        print(config)\n",
+    "        # Default strategy, so `with tpu_strategy.scope():` also works without a TPU.\n",
+    "        tpu_strategy = tf.distribute.get_strategy()\n",
+    "\n",
+    "    elif device == 'tpu':\n",
+    "        print(\"Setting up TPU.....\")\n",
+    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "        print('Running on TPU ', tpu.master())\n",
+    "        tf.config.experimental_connect_to_cluster(tpu)\n",
+    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
+    "\n",
+    "    else:\n",
+    "        # ValueError is still an Exception, so existing handlers keep working.\n",
+    "        raise ValueError(\"Wrong device parameter passed: {!r}\".format(device))\n",
+    "\n",
+    "    return tpu_strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fb1df02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_tpu_or_gpu(device='tpu')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3377596d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Config:\n",
+    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
+    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
+    "    VOCAB_SIZE = 200000\n",
+    "    OUTPUT_DIM = 1800\n",
+    "    BUFFER_SIZE = 160000\n",
+    "    BATCH_SIZE = 16*8\n",
+    "    EPOCHS = 10\n",
+    "    BASE_LOG_DIR = \"log_dir\"\n",
+    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ca4db64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded\n",
+    "\n",
+    "# Download once, then read back the file that was just written. The read used a\n",
+    "# hard-coded /kaggle/working path, which only exists on Kaggle.\n",
+    "if not os.path.exists(Config.FILE_NAME):\n",
+    "    urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
+    "data = pd.read_csv(Config.FILE_NAME)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f687273",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = data['comment_text']\n",
+    "y = data[data.columns[2:]].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "403cbd7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e012a53e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d383e72a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Text_Cleaner:\n",
+    "    \"\"\"Normalise a pandas Series of raw comment strings.\n",
+    "\n",
+    "    `clean_text` applies, in order: newline removal + lowercasing,\n",
+    "    punctuation stripping, English stop-word removal and WordNet\n",
+    "    lemmatisation, followed by a final whitespace trim.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Translation table that deletes every character in string.punctuation.\n",
+    "    # Built once instead of once per row.\n",
+    "    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        \"\"\"Store the Series to clean and load the NLTK stop words and lemmatizer.\"\"\"\n",
+    "        self.data = data\n",
+    "        base_words = stopwords.words('english')\n",
+    "        # A set gives O(1) membership tests. Punctuation is removed before the\n",
+    "        # stop-word filter runs, so the punctuation-free spelling of each stop\n",
+    "        # word is included too; otherwise any entry containing an apostrophe\n",
+    "        # could never match the already-stripped text.\n",
+    "        self.STOPWORDS = set(base_words) | {w.translate(self._PUNCT_TABLE) for w in base_words}\n",
+    "        self.wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "    def new_line_code(self, x:str)->str:\n",
+    "        \"\"\"Replace newlines with spaces, trim the ends and lowercase.\"\"\"\n",
+    "        return x.replace(\"\\n\", ' ').strip().lower()\n",
+    "\n",
+    "    def remove_punctuations(self, x:str)->str:\n",
+    "        \"\"\"Delete every character listed in string.punctuation.\"\"\"\n",
+    "        return x.translate(self._PUNCT_TABLE)\n",
+    "\n",
+    "    def remove_stopwords(self, x:str)->str:\n",
+    "        \"\"\"Drop whitespace-separated tokens found in the stop-word set.\"\"\"\n",
+    "        return ' '.join(word for word in x.split() if word not in self.STOPWORDS)\n",
+    "\n",
+    "    def lemmatization(self, x:str)->str:\n",
+    "        \"\"\"Lemmatize each whitespace-separated token with WordNet.\"\"\"\n",
+    "        return ' '.join(self.wordnet.lemmatize(word) for word in x.split())\n",
+    "\n",
+    "    def clean_text(self):\n",
+    "        \"\"\"Run the full pipeline over `self.data`; store and return the result.\"\"\"\n",
+    "        steps = (self.new_line_code,\n",
+    "                 self.remove_punctuations,\n",
+    "                 self.remove_stopwords,\n",
+    "                 self.lemmatization)\n",
+    "        cleaned = self.data\n",
+    "        for step in steps:\n",
+    "            cleaned = cleaned.apply(step)\n",
+    "        self.data = cleaned.apply(lambda x: x.strip())\n",
+    "        return self.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b121fd12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = Text_Cleaner(X).clean_text()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81c860cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5b374af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
+    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
+    "                               output_mode='int')\n",
+    "vectorizer.adapt(X.values)\n",
+    "# A stray trailing backtick on the next line made this cell a SyntaxError.\n",
+    "vectorized_text = vectorizer(X.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5b25ecc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shuffle ONCE with a fixed order. With the default reshuffle_each_iteration=True\n",
+    "# the order is re-drawn on every pass, so a take()/skip() split of this dataset\n",
+    "# mixes training and validation rows between epochs, and two separate passes\n",
+    "# over a split (one for inputs, one for labels) come back misaligned.\n",
+    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
+    "dataset = dataset.cache()\n",
+    "dataset = dataset.shuffle(Config.BUFFER_SIZE, seed=42, reshuffle_each_iteration=False)\n",
+    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a60be072",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = dataset.take(int(len(dataset)*0.8))\n",
+    "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
+    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d4c3d18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def callbacks(base_dir=\".\"):\n",
+    "    \"\"\"Build the callbacks passed to `model.fit`.\n",
+    "\n",
+    "    Args:\n",
+    "        base_dir: Root directory under which `Config.CHECKPOINT_DIR` is created.\n",
+    "            It was previously accepted but ignored; the default \".\" keeps the\n",
+    "            checkpoint in the same place as before.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with an EarlyStopping callback (val_loss, patience 2) and a\n",
+    "        ModelCheckpoint callback created with save_best_only=True.\n",
+    "    \"\"\"\n",
+    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
+    "\n",
+    "    ckpt_dir = os.path.join(base_dir, Config.CHECKPOINT_DIR)\n",
+    "    # Create only the parent directory; the checkpoint path itself is written by Keras.\n",
+    "    os.makedirs(ckpt_dir, exist_ok=True)\n",
+    "    ckpt_file = os.path.join(ckpt_dir, \"model\")\n",
+    "\n",
+    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+    "      filepath = ckpt_file,\n",
+    "      save_best_only = True)\n",
+    "\n",
+    "    return [early_stopping, ckpt_cb]\n",
+    "\n",
+    "callbacks_list = callbacks()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cf70d04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_model():\n",
+    "    \"\"\"Assemble the uncompiled stacked bidirectional-LSTM classifier.\n",
+    "\n",
+    "    Embedding -> BiLSTM(64, sequences) -> BiLSTM(32) -> three ReLU dense\n",
+    "    layers -> a 6-unit sigmoid output layer.\n",
+    "    \"\"\"\n",
+    "    network = Sequential()\n",
+    "    network.add(Embedding(Config.VOCAB_SIZE+1, 32))\n",
+    "    # The first recurrent layer returns the full sequence to feed the second one.\n",
+    "    network.add(Bidirectional(LSTM(64, return_sequences=True, activation='tanh')))\n",
+    "    network.add(Bidirectional(LSTM(32)))\n",
+    "    for width in (128, 256, 128):\n",
+    "        network.add(Dense(width, activation='relu'))\n",
+    "    network.add(Dense(6, activation='sigmoid'))\n",
+    "    return network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26a56966",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model creation and compilation happen under the distribution strategy's scope.\n",
+    "with tpu_strategy.scope():\n",
+    "    model = create_model()\n",
+    "    # The Sequential stack declares no input shape, so build it explicitly (batch size and\n",
+    "    # sequence length left unspecified); `summary()` raises ValueError on an unbuilt model.\n",
+    "    model.build(input_shape=(None, None))\n",
+    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
+    "                  loss=tf.keras.losses.binary_crossentropy,\n",
+    "                  metrics=[AUC(multi_label=True, num_labels=6)])\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "891727f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history = model.fit(train, \n",
+    "                    epochs=Config.EPOCHS,\n",
+    "                    steps_per_epoch=len(train),\n",
+    "                    validation_data=val,\n",
+    "                    callbacks=callbacks_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "533cd762",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
+    "    \"\"\"Score `model` on `pred_data` against `y_true` and print the results.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Object exposing `predict(pred_data)`.\n",
+    "        pred_data: Inputs forwarded unchanged to `model.predict` (callers in this\n",
+    "            notebook pass a NumPy array despite the `pd.Series` annotation).\n",
+    "        y_true: Ground-truth label indicators, same shape as the predictions.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: `(precision, recall, f1, auc)`, each macro-averaged.\n",
+    "\n",
+    "    Raises:\n",
+    "        Any exception raised by `model.predict` or the scikit-learn metrics. The earlier\n",
+    "        version printed the error and then failed on unbound locals, hiding the real cause.\n",
+    "    \"\"\"\n",
+    "    # Local import so the function does not depend on which names the import cell pulled in.\n",
+    "    from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score\n",
+    "\n",
+    "    y_pred = model.predict(pred_data)\n",
+    "    # Threshold once at 0.5 for the label-based metrics; AUC uses the raw scores.\n",
+    "    y_pred_labels = (y_pred > 0.5).astype(int)\n",
+    "\n",
+    "    precision = precision_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    recall = recall_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    f1 = f1_score(y_true, y_pred_labels, average=\"macro\")\n",
+    "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
+    "\n",
+    "    print(f\"Precision: {precision}\\n\"\n",
+    "          f\"Recall: {recall}\\n\"\n",
+    "          f\"F1-Score: {f1}\\n\"\n",
+    "          f\"ROC-AUC-Score: {auc}\")\n",
+    "    return (precision, recall, f1, auc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2f19754",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(\"model_3.h5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "314be9bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Materialise the batches in ONE pass so features and labels stay row-aligned: iterating\n",
+    "# `train` twice can produce two different orders when the pipeline reshuffles per iteration.\n",
+    "train_batches = list(train.as_numpy_iterator())\n",
+    "x_train = np.concatenate([x for x, _ in train_batches])\n",
+    "y_train = np.concatenate([y for _, y in train_batches])\n",
+    "del train_batches  # the concatenated arrays are all that is needed from here on\n",
+    "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec45f5ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single pass over `val` keeps x/y aligned (two passes may be shuffled differently).\n",
+    "val_batches = list(val.as_numpy_iterator())\n",
+    "x_val = np.concatenate([x for x, _ in val_batches])\n",
+    "y_val = np.concatenate([y for _, y in val_batches])\n",
+    "del val_batches\n",
+    "# Stored separately so the validation scores no longer overwrite `result_train`.\n",
+    "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

experiment_notebooks/Experiment 4.ipynb ADDED Viewed

	@@ -0,0 +1,1475 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install nltk scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:13:06.118200Z",
+     "iopub.status.busy": "2023-05-03T14:13:06.117322Z",
+     "iopub.status.idle": "2023-05-03T14:13:36.869507Z",
+     "shell.execute_reply": "2023-05-03T14:13:36.868619Z",
+     "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "#import seaborn as sns\n",
+    "import tensorflow as tf\n",
+    "#import tensorflow_gpu\n",
+    "import urllib\n",
+    "from tensorflow.keras.layers import TextVectorization\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
+    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
+    "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import re\n",
+    "import string\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('wordnet2022')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tf_tpu_or_gpu(device: str='gpu'):\n",
+    "    \"\"\"Configure TensorFlow for the requested accelerator and return a strategy.\n",
+    "\n",
+    "    Args:\n",
+    "        device: Either \"gpu\" or \"tpu\" (case-insensitive).\n",
+    "\n",
+    "    Returns:\n",
+    "        For \"tpu\": the `tf.distribute.TPUStrategy` created here. It used to be built and\n",
+    "        then discarded, leaving the caller nothing to open a `.scope()` on.\n",
+    "        For \"gpu\": the current default strategy from `tf.distribute.get_strategy()`,\n",
+    "        so both branches can be used the same way.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `device` is neither \"gpu\" nor \"tpu\".\n",
+    "    \"\"\"\n",
+    "    device_kind = device.lower()\n",
+    "\n",
+    "    if device_kind == 'gpu':\n",
+    "        print(\"Setting up GPU.....\")\n",
+    "        device_name = tf.test.gpu_device_name()\n",
+    "        # Report exactly one of the two outcomes (previously \"Found GPU at:\" always printed).\n",
+    "        if \"GPU\" not in device_name:\n",
+    "            print(\"GPU device not found\")\n",
+    "        else:\n",
+    "            print('Found GPU at: {}'.format(device_name))\n",
+    "\n",
+    "        # Let the session allocate GPU memory on demand.\n",
+    "        config = tf.compat.v1.ConfigProto()\n",
+    "        config.gpu_options.allow_growth = True\n",
+    "        sess = tf.compat.v1.Session(config=config)\n",
+    "        tf.compat.v1.keras.backend.set_session(sess)\n",
+    "\n",
+    "        print(config)\n",
+    "        return tf.distribute.get_strategy()\n",
+    "\n",
+    "    if device_kind == 'tpu':\n",
+    "        print(\"Setting up TPU.....\")\n",
+    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "        print('Running on TPU ', tpu.master())\n",
+    "        tf.config.experimental_connect_to_cluster(tpu)\n",
+    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
+    "        return tpu_strategy\n",
+    "\n",
+    "    # ValueError is still an Exception, so existing `except Exception` handlers keep working.\n",
+    "    raise ValueError(f\"Wrong Device Parameter Passed: {device!r} (expected 'gpu' or 'tpu')\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_tpu_or_gpu(device='tpu')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:10.072253Z",
+     "iopub.status.busy": "2023-05-03T14:16:10.071138Z",
+     "iopub.status.idle": "2023-05-03T14:16:19.830833Z",
+     "shell.execute_reply": "2023-05-03T14:16:19.829780Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:10.072215Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on TPU  \n",
+      "INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\n",
+      "INFO:tensorflow:Initializing the TPU system: local\n",
+      "INFO:tensorflow:Finished initializing TPU system.\n",
+      "INFO:tensorflow:Found TPU system:\n",
+      "INFO:tensorflow:*** Num TPU Cores: 8\n",
+      "INFO:tensorflow:*** Num TPU Workers: 1\n",
+      "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n",
+      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n",
+      "REPLICAS:  8\n"
+     ]
+    }
+   ],
+   "source": [
+    "tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "print('Running on TPU ', tpu.master())\n",
+    "tf.config.experimental_connect_to_cluster(tpu)\n",
+    "tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "device_name = tf.test.gpu_device_name()\n",
+    "if \"GPU\" not in device_name:\n",
+    "    print(\"GPU device not found\")\n",
+    "print('Found GPU at: {}'.format(device_name))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "config = tf.compat.v1.ConfigProto() \n",
+    "config.gpu_options.allow_growth = True \n",
+    "sess = tf.compat.v1.Session(config=config) \n",
+    "tf.compat.v1.keras.backend.set_session(sess)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:24.940878Z",
+     "iopub.status.busy": "2023-05-03T14:16:24.940140Z",
+     "iopub.status.idle": "2023-05-03T14:16:24.946837Z",
+     "shell.execute_reply": "2023-05-03T14:16:24.945707Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Config:\n",
+    "    \"\"\"Notebook-wide settings: data source, vectorizer sizes, training and log paths.\"\"\"\n",
+    "\n",
+    "    # Where the training CSV is fetched from, and the local name it is saved under.\n",
+    "    URL = \"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
+    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
+    "\n",
+    "    # Text vectorization: vocabulary cap and output sequence length.\n",
+    "    VOCAB_SIZE = 200_000\n",
+    "    OUTPUT_DIM = 1_800\n",
+    "\n",
+    "    # tf.data shuffle buffer, batch size and number of training epochs.\n",
+    "    BUFFER_SIZE = 160_000\n",
+    "    BATCH_SIZE = 16 * 8\n",
+    "    EPOCHS = 10\n",
+    "\n",
+    "    # Logging / checkpoint locations.\n",
+    "    BASE_LOG_DIR = \"log_dir\"\n",
+    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, \"models\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:29.171506Z",
+     "iopub.status.busy": "2023-05-03T14:16:29.170711Z",
+     "iopub.status.idle": "2023-05-03T14:16:30.613189Z",
+     "shell.execute_reply": "2023-05-03T14:16:30.612012Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>comment_text</th>\n",
+       "      <th>toxic</th>\n",
+       "      <th>severe_toxic</th>\n",
+       "      <th>obscene</th>\n",
+       "      <th>threat</th>\n",
+       "      <th>insult</th>\n",
+       "      <th>identity_hate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000997932d777bf</td>\n",
+       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>000103f0d9cfb60f</td>\n",
+       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>000113f07ec002fd</td>\n",
+       "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0001b41b1c6bb37e</td>\n",
+       "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0001d958c54c6e35</td>\n",
+       "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 id                                       comment_text  toxic   \n",
+       "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0  \\\n",
+       "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n",
+       "2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n",
+       "3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n",
+       "4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n",
+       "\n",
+       "   severe_toxic  obscene  threat  insult  identity_hate  \n",
+       "0             0        0       0       0              0  \n",
+       "1             0        0       0       0              0  \n",
+       "2             0        0       0       0              0  \n",
+       "3             0        0       0       0              0  \n",
+       "4             0        0       0       0              0  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded\n",
+    "\n",
+    "# Download to Config.FILE_NAME and read back from that same path, instead of a hard-coded\n",
+    "# /kaggle/working location that only matches when that is the working directory.\n",
+    "urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
+    "data = pd.read_csv(Config.FILE_NAME)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:37.492444Z",
+     "iopub.status.busy": "2023-05-03T14:16:37.491342Z",
+     "iopub.status.idle": "2023-05-03T14:16:37.533400Z",
+     "shell.execute_reply": "2023-05-03T14:16:37.532235Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 159571 entries, 0 to 159570\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column         Non-Null Count   Dtype \n",
+      "---  ------         --------------   ----- \n",
+      " 0   id             159571 non-null  object\n",
+      " 1   comment_text   159571 non-null  object\n",
+      " 2   toxic          159571 non-null  int64 \n",
+      " 3   severe_toxic   159571 non-null  int64 \n",
+      " 4   obscene        159571 non-null  int64 \n",
+      " 5   threat         159571 non-null  int64 \n",
+      " 6   insult         159571 non-null  int64 \n",
+      " 7   identity_hate  159571 non-null  int64 \n",
+      "dtypes: int64(6), object(2)\n",
+      "memory usage: 9.7+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:41.586932Z",
+     "iopub.status.busy": "2023-05-03T14:16:41.585997Z",
+     "iopub.status.idle": "2023-05-03T14:16:41.618902Z",
+     "shell.execute_reply": "2023-05-03T14:16:41.617979Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "id               0\n",
+       "comment_text     0\n",
+       "toxic            0\n",
+       "severe_toxic     0\n",
+       "obscene          0\n",
+       "threat           0\n",
+       "insult           0\n",
+       "identity_hate    0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:44.561198Z",
+     "iopub.status.busy": "2023-05-03T14:16:44.560414Z",
+     "iopub.status.idle": "2023-05-03T14:16:44.586487Z",
+     "shell.execute_reply": "2023-05-03T14:16:44.585582Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>toxic</th>\n",
+       "      <td>144277</td>\n",
+       "      <td>15294</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>severe_toxic</th>\n",
+       "      <td>157976</td>\n",
+       "      <td>1595</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>obscene</th>\n",
+       "      <td>151122</td>\n",
+       "      <td>8449</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>threat</th>\n",
+       "      <td>159093</td>\n",
+       "      <td>478</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>insult</th>\n",
+       "      <td>151694</td>\n",
+       "      <td>7877</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>identity_hate</th>\n",
+       "      <td>158166</td>\n",
+       "      <td>1405</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    0      1\n",
+       "toxic          144277  15294\n",
+       "severe_toxic   157976   1595\n",
+       "obscene        151122   8449\n",
+       "threat         159093    478\n",
+       "insult         151694   7877\n",
+       "identity_hate  158166   1405"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:51.639830Z",
+     "iopub.status.busy": "2023-05-03T14:16:51.639059Z",
+     "iopub.status.idle": "2023-05-03T14:16:51.658065Z",
+     "shell.execute_reply": "2023-05-03T14:16:51.657049Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "toxic value count\n",
+      "--------------------\n",
+      "0: 144277 | 90.42 %\n",
+      "1: 15294 | 9.58 %\n",
+      "\n",
+      "severe_toxic value count\n",
+      "--------------------\n",
+      "0: 157976 | 99.0 %\n",
+      "1: 1595 | 1.0 %\n",
+      "\n",
+      "obscene value count\n",
+      "--------------------\n",
+      "0: 151122 | 94.71 %\n",
+      "1: 8449 | 5.29 %\n",
+      "\n",
+      "threat value count\n",
+      "--------------------\n",
+      "0: 159093 | 99.7 %\n",
+      "1: 478 | 0.3 %\n",
+      "\n",
+      "insult value count\n",
+      "--------------------\n",
+      "0: 151694 | 95.06 %\n",
+      "1: 7877 | 4.94 %\n",
+      "\n",
+      "identity_hate value count\n",
+      "--------------------\n",
+      "0: 158166 | 99.12 %\n",
+      "1: 1405 | 0.88 %\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the 0/1 count and percentage for every non-text column.\n",
+    "n_rows = data.shape[0]\n",
+    "for column in data.columns:\n",
+    "    if data[column].dtype == 'O':\n",
+    "        continue  # object-dtype columns are skipped\n",
+    "    counts = data[column].value_counts()\n",
+    "    negatives, positives = counts[0], counts[1]\n",
+    "    print(f\"{column} value count\\n{'--'*10}\")\n",
+    "    print(f\"0: {negatives} | {round((negatives/n_rows)*100,2)} %\\n\"\n",
+    "          f\"1: {positives} | {round((positives/n_rows)*100,2)} %\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data[\"text_len\"] = data[\"comment_text\"].apply(lambda x: len(x.split()))\n",
+    "data[data[\"text_len\"]==data[\"text_len\"].max()]['comment_text']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:58.642154Z",
+     "iopub.status.busy": "2023-05-03T14:16:58.641279Z",
+     "iopub.status.idle": "2023-05-03T14:16:58.648851Z",
+     "shell.execute_reply": "2023-05-03T14:16:58.647773Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "X = data['comment_text']\n",
+    "# Select the six label columns by name: slicing `data.columns[2:]` also picks up any helper\n",
+    "# column added to `data` beforehand (e.g. `text_len`), giving `y` a 7th, non-label column.\n",
+    "LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
+    "y = data[LABEL_COLUMNS].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:02.919383Z",
+     "iopub.status.busy": "2023-05-03T14:17:02.918865Z",
+     "iopub.status.idle": "2023-05-03T14:17:02.927191Z",
+     "shell.execute_reply": "2023-05-03T14:17:02.926293Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         Explanation\\nWhy the edits made under my usern...\n",
+       "1         D'aww! He matches this background colour I'm s...\n",
+       "2         Hey man, I'm really not trying to edit war. It...\n",
+       "3         \"\\nMore\\nI can't make any real suggestions on ...\n",
+       "4         You, sir, are my hero. Any chance you remember...\n",
+       "                                ...                        \n",
+       "159566    \":::::And for the second time of asking, when ...\n",
+       "159567    You should be ashamed of yourself \\n\\nThat is ...\n",
+       "159568    Spitzer \\n\\nUmm, theres no actual article for ...\n",
+       "159569    And it looks like it was actually you who put ...\n",
+       "159570    \"\\nAnd ... I really don't think you understand...\n",
+       "Name: comment_text, Length: 159571, dtype: object"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:08.246451Z",
+     "iopub.status.busy": "2023-05-03T14:17:08.245491Z",
+     "iopub.status.idle": "2023-05-03T14:17:08.252604Z",
+     "shell.execute_reply": "2023-05-03T14:17:08.251608Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       ...,\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0]])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Text Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:25.208007Z",
+     "iopub.status.busy": "2023-05-03T14:17:25.207157Z",
+     "iopub.status.idle": "2023-05-03T14:17:25.220446Z",
+     "shell.execute_reply": "2023-05-03T14:17:25.219390Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Text_Cleaner:\n",
+    "    \"\"\"Normalise comment text: flatten newlines, lowercase, strip punctuation,\n",
+    "    drop English stopwords and lemmatize.\n",
+    "\n",
+    "    Args:\n",
+    "        data: Collection of strings exposing `.apply` (the notebook passes a pandas Series).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Table deleting every `string.punctuation` character; built once instead of per call.\n",
+    "    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "        # A set makes the per-word membership test O(1) rather than a scan of the whole list.\n",
+    "        self.STOPWORDS = set(stopwords.words('english'))\n",
+    "        self.wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "    def new_line_code(self, x:str)->str:\n",
+    "        \"\"\"Replace newlines with spaces, trim surrounding whitespace and lowercase.\"\"\"\n",
+    "        return re.sub(\"\\n\", ' ', x).strip().lower()\n",
+    "\n",
+    "    def remove_punctuations(self, x:str)->str:\n",
+    "        \"\"\"Delete every character listed in `string.punctuation`.\"\"\"\n",
+    "        return x.translate(self._PUNCT_TABLE)\n",
+    "\n",
+    "    def remove_stopwords(self, x:str)->str:\n",
+    "        \"\"\"Drop whitespace-separated tokens found in `self.STOPWORDS`; rejoin with single spaces.\"\"\"\n",
+    "        return ' '.join(word for word in x.split() if word not in self.STOPWORDS)\n",
+    "\n",
+    "    def lemmatization(self, x:str)->str:\n",
+    "        \"\"\"Lemmatize each whitespace-separated token with the WordNet lemmatizer.\"\"\"\n",
+    "        return ' '.join(self.wordnet.lemmatize(word) for word in x.split())\n",
+    "\n",
+    "    def clean_text(self):\n",
+    "        \"\"\"Run all cleaning steps over `self.data`, store the result back and return it.\"\"\"\n",
+    "        # NOTE(review): punctuation is stripped before stopword removal, so a contraction\n",
+    "        # such as \"don't\" becomes \"dont\" and may no longer match the stopword list;\n",
+    "        # confirm this order is intended before changing it (it alters the vocabulary).\n",
+    "        steps = (self.new_line_code,\n",
+    "                 self.remove_punctuations,\n",
+    "                 self.remove_stopwords,\n",
+    "                 self.lemmatization,\n",
+    "                 lambda x: x.strip())\n",
+    "        for step in steps:\n",
+    "            self.data = self.data.apply(step)\n",
+    "        return self.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:28.812213Z",
+     "iopub.status.busy": "2023-05-03T14:17:28.811115Z",
+     "iopub.status.idle": "2023-05-03T14:18:45.134664Z",
+     "shell.execute_reply": "2023-05-03T14:18:45.133093Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "X = Text_Cleaner(X).clean_text()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:19:08.971107Z",
+     "iopub.status.busy": "2023-05-03T14:19:08.969951Z",
+     "iopub.status.idle": "2023-05-03T14:19:08.979371Z",
+     "shell.execute_reply": "2023-05-03T14:19:08.978320Z",
+     "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         explanation edits made username hardcore metal...\n",
+       "1         daww match background colour im seemingly stuc...\n",
+       "2         hey man im really trying edit war guy constant...\n",
+       "3         cant make real suggestion improvement wondered...\n",
+       "4                       sir hero chance remember page thats\n",
+       "                                ...                        \n",
+       "159566    second time asking view completely contradicts...\n",
+       "159567       ashamed horrible thing put talk page 128611993\n",
+       "159568    spitzer umm there actual article prostitution ...\n",
+       "159569    look like actually put speedy first version de...\n",
+       "159570    really dont think understand came idea bad rig...\n",
+       "Name: comment_text, Length: 159571, dtype: object"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Model Building"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
+    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
+    "                               output_mode='int')\n",
+    "vectorizer.adapt(X.values)\n",
+    "vectorized_text = vectorizer(X.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:24.692312Z",
+     "iopub.status.busy": "2023-05-03T14:42:24.691267Z",
+     "iopub.status.idle": "2023-05-03T14:42:24.709520Z",
+     "shell.execute_reply": "2023-05-03T14:42:24.708295Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
+    "dataset = dataset.cache()\n",
+    "# Shuffle once and keep that order on every pass: the train/val split below is positional\n",
+    "# (take/skip), so per-iteration reshuffling would move validation examples into training.\n",
+    "# Trade-off: batch order is no longer re-randomised between epochs.\n",
+    "dataset = dataset.shuffle(Config.BUFFER_SIZE, reshuffle_each_iteration=False)\n",
+    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:27.187117Z",
+     "iopub.status.busy": "2023-05-03T14:42:27.185929Z",
+     "iopub.status.idle": "2023-05-03T14:42:27.196570Z",
+     "shell.execute_reply": "2023-05-03T14:42:27.195443Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Positional split by batch count: the first 80% of batches train, the remainder validates.\n",
+    "# NOTE: take/skip only gives disjoint splits if `dataset` yields the same order on every\n",
+    "# iteration; a shuffle that reshuffles each iteration mixes the two splits across epochs.\n",
+    "n_train_batches = int(len(dataset) * 0.8)\n",
+    "train = dataset.take(n_train_batches)\n",
+    "# No second `take`: int(n*0.8) + int(n*0.2) can be n-1, which silently dropped the last batch.\n",
+    "val = dataset.skip(n_train_batches)\n",
+    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:41:54.920944Z",
+     "iopub.status.busy": "2023-05-03T14:41:54.920085Z",
+     "iopub.status.idle": "2023-05-03T14:41:54.928526Z",
+     "shell.execute_reply": "2023-05-03T14:41:54.927502Z",
+     "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def create_model():\n",
+    "    \"\"\"Assemble the uncompiled bidirectional-LSTM classifier with dropout.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `Sequential` model whose last layer has 6 sigmoid units.\n",
+    "    \"\"\"\n",
+    "    model = Sequential()\n",
+    "    model.add(Embedding(Config.VOCAB_SIZE+1, 32))\n",
+    "    # First recurrent layer keeps the full sequence so the second one can consume it.\n",
+    "    model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))\n",
+    "    model.add(Bidirectional(LSTM(32)))\n",
+    "    # Fully connected head; dropout follows the first two dense layers only.\n",
+    "    model.add(Dense(128, activation='relu'))\n",
+    "    model.add(Dropout(0.1))\n",
+    "    model.add(Dense(256, activation='relu'))\n",
+    "    model.add(Dropout(0.1))\n",
+    "    model.add(Dense(128, activation='relu'))\n",
+    "    model.add(Dense(6, activation='sigmoid'))\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:41:41.900942Z",
+     "iopub.status.busy": "2023-05-03T14:41:41.900504Z",
+     "iopub.status.idle": "2023-05-03T14:41:41.908480Z",
+     "shell.execute_reply": "2023-05-03T14:41:41.907187Z",
+     "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def callbacks(base_dir=\".\"):\n",
+    "    \"\"\"Build the Keras callbacks used during training.\n",
+    "\n",
+    "    Args:\n",
+    "        base_dir: Directory under which Config.CHECKPOINT_DIR is created.\n",
+    "            The default \".\" resolves to the same path the notebook always used.\n",
+    "\n",
+    "    Returns:\n",
+    "        [EarlyStopping, ModelCheckpoint]:\n",
+    "        - EarlyStopping watches val_loss with a patience of 2 epochs.\n",
+    "        - ModelCheckpoint writes to <base_dir>/<CHECKPOINT_DIR>/model and, with\n",
+    "          save_best_only=True, only overwrites it when the monitored value improves.\n",
+    "    \"\"\"\n",
+    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
+    "\n",
+    "    # normpath drops a leading \"./\" so the default location is unchanged.\n",
+    "    ckpt_file = os.path.normpath(os.path.join(base_dir, Config.CHECKPOINT_DIR, \"model\"))\n",
+    "    os.makedirs(ckpt_file, exist_ok=True)\n",
+    "\n",
+    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+    "      filepath = ckpt_file,\n",
+    "      save_best_only = True)\n",
+    "\n",
+    "    return [early_stopping, ckpt_cb]\n",
+    "\n",
+    "callbacks_list = callbacks()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:07.719948Z",
+     "iopub.status.busy": "2023-05-03T14:42:07.719137Z",
+     "iopub.status.idle": "2023-05-03T14:42:09.288990Z",
+     "shell.execute_reply": "2023-05-03T14:42:09.287682Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "with tpu_strategy.scope():\n",
+    "    model = create_model()\n",
+    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
+    "                  loss=tf.keras.losses.binary_crossentropy,\n",
+    "                  metrics=AUC(multi_label=True, num_labels=6))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:34.084064Z",
+     "iopub.status.busy": "2023-05-03T14:42:34.083255Z",
+     "iopub.status.idle": "2023-05-03T14:42:34.110375Z",
+     "shell.execute_reply": "2023-05-03T14:42:34.109380Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential_2\"\n",
+      "_________________________________________________________________\n",
+      " Layer (type)                Output Shape              Param #   \n",
+      "=================================================================\n",
+      " embedding_2 (Embedding)     (None, None, 32)          6400032   \n",
+      "                                                                 \n",
+      " bidirectional_4 (Bidirectio  (None, None, 128)        49664     \n",
+      " nal)                                                            \n",
+      "                                                                 \n",
+      " bidirectional_5 (Bidirectio  (None, 64)               41216     \n",
+      " nal)                                                            \n",
+      "                                                                 \n",
+      " dense_8 (Dense)             (None, 128)               8320      \n",
+      "                                                                 \n",
+      " dropout_4 (Dropout)         (None, 128)               0         \n",
+      "                                                                 \n",
+      " dense_9 (Dense)             (None, 256)               33024     \n",
+      "                                                                 \n",
+      " dropout_5 (Dropout)         (None, 256)               0         \n",
+      "                                                                 \n",
+      " dense_10 (Dense)            (None, 128)               32896     \n",
+      "                                                                 \n",
+      " dense_11 (Dense)            (None, 6)                 774       \n",
+      "                                                                 \n",
+      "=================================================================\n",
+      "Total params: 6,565,926\n",
+      "Trainable params: 6,565,926\n",
+      "Non-trainable params: 0\n",
+      "_________________________________________________________________\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:20:19.051437Z",
+     "iopub.status.busy": "2023-05-03T14:20:19.050592Z",
+     "iopub.status.idle": "2023-05-03T14:20:19.057744Z",
+     "shell.execute_reply": "2023-05-03T14:20:19.056746Z",
+     "shell.execute_reply.started": "2023-05-03T14:20:19.051377Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "997"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:42.306143Z",
+     "iopub.status.busy": "2023-05-03T14:42:42.305188Z",
+     "iopub.status.idle": "2023-05-03T18:36:14.400588Z",
+     "shell.execute_reply": "2023-05-03T18:36:14.399250Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-03 14:42:57.854226: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n",
+      "2023-05-03 14:42:58.165317: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - ETA: 0s - loss: 0.1688 - auc_2: 0.5909"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-03 15:05:36.690047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
+      "2023-05-03 15:05:36.851778: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1425s 1s/step - loss: 0.1688 - auc_2: 0.5909 - val_loss: 0.0750 - val_auc_2: 0.9196\n",
+      "Epoch 2/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0640 - auc_2: 0.9400"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1395s 1s/step - loss: 0.0640 - auc_2: 0.9400 - val_loss: 0.0548 - val_auc_2: 0.9532\n",
+      "Epoch 3/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0524 - auc_2: 0.9594"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0524 - auc_2: 0.9594 - val_loss: 0.0484 - val_auc_2: 0.9597\n",
+      "Epoch 4/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0466 - auc_2: 0.9672"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0466 - auc_2: 0.9672 - val_loss: 0.0426 - val_auc_2: 0.9729\n",
+      "Epoch 5/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0440 - auc_2: 0.9715"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1395s 1s/step - loss: 0.0440 - auc_2: 0.9715 - val_loss: 0.0406 - val_auc_2: 0.9761\n",
+      "Epoch 6/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0416 - auc_2: 0.9725"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0416 - auc_2: 0.9725 - val_loss: 0.0382 - val_auc_2: 0.9787\n",
+      "Epoch 7/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0394 - auc_2: 0.9762"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0394 - auc_2: 0.9762 - val_loss: 0.0359 - val_auc_2: 0.9819\n",
+      "Epoch 8/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0379 - auc_2: 0.9773"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0379 - auc_2: 0.9773 - val_loss: 0.0346 - val_auc_2: 0.9821\n",
+      "Epoch 9/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0367 - auc_2: 0.9776"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1396s 1s/step - loss: 0.0367 - auc_2: 0.9776 - val_loss: 0.0336 - val_auc_2: 0.9827\n",
+      "Epoch 10/10\n",
+      "997/997 [==============================] - ETA: 0s - loss: 0.0357 - auc_2: 0.9782"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 1395s 1s/step - loss: 0.0357 - auc_2: 0.9782 - val_loss: 0.0328 - val_auc_2: 0.9819\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train on the training batches and validate after every epoch; callbacks_list supplies\n",
+    "# the early-stopping and checkpoint callbacks. The returned History is kept in `history`.\n",
+    "history = model.fit(\n",
+    "    x=train,\n",
+    "    validation_data=val,\n",
+    "    epochs=Config.EPOCHS,\n",
+    "    steps_per_epoch=len(train),\n",
+    "    callbacks=callbacks_list,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:36:42.693133Z",
+     "iopub.status.busy": "2023-05-03T18:36:42.692246Z",
+     "iopub.status.idle": "2023-05-03T18:36:42.702544Z",
+     "shell.execute_reply": "2023-05-03T18:36:42.701196Z",
+     "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, vectorizer: TextVectorization, pred_data: pd.Series, y_true, threshold: float = 0.5):\n",
+    "    \"\"\"Score a trained model with macro-averaged precision, recall, F1 and ROC-AUC.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Object exposing predict(); its output is compared against y_true.\n",
+    "        vectorizer: Unused. Kept so existing keyword calls keep working; pred_data is\n",
+    "            passed to model.predict() as-is, so it must already be vectorized.\n",
+    "        pred_data: Model input, forwarded unchanged to model.predict().\n",
+    "        y_true: Ground-truth label indicators, aligned row-for-row with pred_data.\n",
+    "        threshold: Scores strictly above this value count as a positive prediction.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple (precision, recall, f1, auc).\n",
+    "\n",
+    "    Raises:\n",
+    "        Any exception raised by the scikit-learn metric functions. Previously such an\n",
+    "        error was printed and then hidden behind an UnboundLocalError from the report\n",
+    "        below; it now propagates directly.\n",
+    "    \"\"\"\n",
+    "    y_pred = model.predict(pred_data)\n",
+    "    # Threshold once and reuse the hard labels for all three label-based metrics.\n",
+    "    y_label = (y_pred > threshold).astype(int)\n",
+    "\n",
+    "    precision = precision_score(y_true, y_label, average=\"macro\")\n",
+    "    recall = recall_score(y_true, y_label, average=\"macro\")\n",
+    "    f1 = f1_score(y_true, y_label, average=\"macro\")\n",
+    "    # ROC-AUC is computed from the raw scores, not the thresholded labels.\n",
+    "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
+    "\n",
+    "    print(f\"Precision: {precision}\\n\"\n",
+    "          f\"Recall: {recall}\\n\"\n",
+    "          f\"F1-Score: {f1}\\n\"\n",
+    "          f\"ROC-AUC-Score: {auc}\")\n",
+    "    return (precision, recall, f1, auc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loss and AUC on the validation batches. This cell used to evaluate `test`, but the\n",
+    "# test split is commented out in the split cell, so that name is never defined.\n",
+    "model.evaluate(val)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:36:28.884733Z",
+     "iopub.status.busy": "2023-05-03T18:36:28.883953Z",
+     "iopub.status.idle": "2023-05-03T18:36:29.233282Z",
+     "shell.execute_reply": "2023-05-03T18:36:29.231964Z",
+     "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model.save(\"model_4.h5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:51:24.530412Z",
+     "iopub.status.busy": "2023-05-03T18:51:24.529307Z",
+     "iopub.status.idle": "2023-05-03T19:20:36.675080Z",
+     "shell.execute_reply": "2023-05-03T19:20:36.673739Z",
+     "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3988/3988 [==============================] - 1747s 438ms/step\n",
+      "Precision: 0.034067329786671804\n",
+      "Recall: 0.03396435372259718\n",
+      "F1-Score: 0.03375883387877523\n",
+      "ROC-AUC-Score: 0.4963643308231378\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Materialise the training batches in ONE pass so features and labels stay paired.\n",
+    "# Iterating a shuffled dataset twice (once for x, once for y) can return the batches in\n",
+    "# a different order each time, which scrambles the pairing and drives the scores to\n",
+    "# chance level.\n",
+    "train_batches = list(train.as_numpy_iterator())\n",
+    "x_train = np.concatenate([x for x, _ in train_batches])\n",
+    "y_train = np.concatenate([y for _, y in train_batches])\n",
+    "result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_train, y_true=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:49:02.718178Z",
+     "iopub.status.busy": "2023-05-03T18:49:02.717234Z",
+     "iopub.status.idle": "2023-05-03T18:49:50.438077Z",
+     "shell.execute_reply": "2023-05-03T18:49:50.436458Z",
+     "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "996/996 [==============================] - 43s 42ms/step\n",
+      "Precision: 0.03615509646190422\n",
+      "Recall: 0.03674059129986899\n",
+      "F1-Score: 0.03625622443975915\n",
+      "ROC-AUC-Score: 0.4868083116383068\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Materialise the validation batches in ONE pass so features and labels stay paired;\n",
+    "# two separate passes over a shuffled dataset may not return the same order.\n",
+    "val_batches = list(val.as_numpy_iterator())\n",
+    "x_val = np.concatenate([x for x, _ in val_batches])\n",
+    "y_val = np.concatenate([y for _, y in val_batches])\n",
+    "# Stored under its own name so the training-set result is not overwritten.\n",
+    "result_val=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_val, y_true=y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

experiment_notebooks/Experiment 5.ipynb ADDED Viewed

	@@ -0,0 +1,1003 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install nltk scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:13:06.118200Z",
+     "iopub.status.busy": "2023-05-03T14:13:06.117322Z",
+     "iopub.status.idle": "2023-05-03T14:13:36.869507Z",
+     "shell.execute_reply": "2023-05-03T14:13:36.868619Z",
+     "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "#import seaborn as sns\n",
+    "import tensorflow as tf\n",
+    "#import tensorflow_gpu\n",
+    "import urllib\n",
+    "from tensorflow.keras.layers import TextVectorization\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
+    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
+    "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import re\n",
+    "import string\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('wordnet2022')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tf_tpu_or_gpu(device: str='gpu'):\n",
+    "    \"\"\"Configure TensorFlow for a GPU or TPU runtime and return a distribution strategy.\n",
+    "\n",
+    "    Args:\n",
+    "        device: 'gpu' or 'tpu' (case-insensitive).\n",
+    "\n",
+    "    Returns:\n",
+    "        For 'tpu': the tf.distribute.TPUStrategy built for the resolved TPU cluster.\n",
+    "        For 'gpu': the current default strategy (tf.distribute.get_strategy()), so\n",
+    "        callers can write `with strategy.scope():` on either kind of runtime.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if device is neither 'gpu' nor 'tpu'.\n",
+    "    \"\"\"\n",
+    "    device = device.lower()\n",
+    "\n",
+    "    if device == 'gpu':\n",
+    "        print(\"Setting up GPU.....\")\n",
+    "        device_name = tf.test.gpu_device_name()\n",
+    "        if \"GPU\" not in device_name:\n",
+    "            # Stop here instead of also reporting a GPU that was just found to be missing.\n",
+    "            print(\"GPU device not found\")\n",
+    "            return tf.distribute.get_strategy()\n",
+    "        print('Found GPU at: {}'.format(device_name))\n",
+    "\n",
+    "        # Register a session whose config has GPU allow_growth enabled.\n",
+    "        config = tf.compat.v1.ConfigProto()\n",
+    "        config.gpu_options.allow_growth = True\n",
+    "        sess = tf.compat.v1.Session(config=config)\n",
+    "        tf.compat.v1.keras.backend.set_session(sess)\n",
+    "\n",
+    "        print(config)\n",
+    "        return tf.distribute.get_strategy()\n",
+    "\n",
+    "    if device == 'tpu':\n",
+    "        print(\"Setting up TPU.....\")\n",
+    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
+    "        print('Running on TPU ', tpu.master())\n",
+    "        tf.config.experimental_connect_to_cluster(tpu)\n",
+    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
+    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
+    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
+    "        # Hand the strategy back; as a local variable it was unreachable for callers.\n",
+    "        return tpu_strategy\n",
+    "\n",
+    "    raise ValueError(f\"Wrong Device Parameter Passed: {device!r} (expected 'gpu' or 'tpu')\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_tpu_or_gpu(device='gpu')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "device_name = tf.test.gpu_device_name()\n",
+    "if \"GPU\" not in device_name:\n",
+    "    print(\"GPU device not found\")\n",
+    "print('Found GPU at: {}'.format(device_name))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "config = tf.compat.v1.ConfigProto() \n",
+    "config.gpu_options.allow_growth = True \n",
+    "sess = tf.compat.v1.Session(config=config) \n",
+    "tf.compat.v1.keras.backend.set_session(sess)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:24.940878Z",
+     "iopub.status.busy": "2023-05-03T14:16:24.940140Z",
+     "iopub.status.idle": "2023-05-03T14:16:24.946837Z",
+     "shell.execute_reply": "2023-05-03T14:16:24.945707Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Config:\n",
+    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
+    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
+    "    VOCAB_SIZE = 10000\n",
+    "    OUTPUT_DIM = 100\n",
+    "    BUFFER_SIZE = 10000\n",
+    "    BATCH_SIZE = 64\n",
+    "    EPOCHS = 10\n",
+    "    BASE_LOG_DIR = \"log_dir\"\n",
+    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:29.171506Z",
+     "iopub.status.busy": "2023-05-03T14:16:29.170711Z",
+     "iopub.status.idle": "2023-05-03T14:16:30.613189Z",
+     "shell.execute_reply": "2023-05-03T14:16:30.612012Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>comment_text</th>\n",
+       "      <th>toxic</th>\n",
+       "      <th>severe_toxic</th>\n",
+       "      <th>obscene</th>\n",
+       "      <th>threat</th>\n",
+       "      <th>insult</th>\n",
+       "      <th>identity_hate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000997932d777bf</td>\n",
+       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>000103f0d9cfb60f</td>\n",
+       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>000113f07ec002fd</td>\n",
+       "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0001b41b1c6bb37e</td>\n",
+       "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0001d958c54c6e35</td>\n",
+       "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 id                                       comment_text  toxic   \n",
+       "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0  \\\n",
+       "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n",
+       "2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n",
+       "3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n",
+       "4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n",
+       "\n",
+       "   severe_toxic  obscene  threat  insult  identity_hate  \n",
+       "0             0        0       0       0              0  \n",
+       "1             0        0       0       0              0  \n",
+       "2             0        0       0       0              0  \n",
+       "3             0        0       0       0              0  \n",
+       "4             0        0       0       0              0  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Download the CSV, then read back exactly the file urlretrieve reports having written,\n",
+    "# so the cell does not depend on a hard-coded working-directory path.\n",
+    "csv_path, _ = urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
+    "data = pd.read_csv(csv_path)\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:37.492444Z",
+     "iopub.status.busy": "2023-05-03T14:16:37.491342Z",
+     "iopub.status.idle": "2023-05-03T14:16:37.533400Z",
+     "shell.execute_reply": "2023-05-03T14:16:37.532235Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 159571 entries, 0 to 159570\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column         Non-Null Count   Dtype \n",
+      "---  ------         --------------   ----- \n",
+      " 0   id             159571 non-null  object\n",
+      " 1   comment_text   159571 non-null  object\n",
+      " 2   toxic          159571 non-null  int64 \n",
+      " 3   severe_toxic   159571 non-null  int64 \n",
+      " 4   obscene        159571 non-null  int64 \n",
+      " 5   threat         159571 non-null  int64 \n",
+      " 6   insult         159571 non-null  int64 \n",
+      " 7   identity_hate  159571 non-null  int64 \n",
+      "dtypes: int64(6), object(2)\n",
+      "memory usage: 9.7+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:41.586932Z",
+     "iopub.status.busy": "2023-05-03T14:16:41.585997Z",
+     "iopub.status.idle": "2023-05-03T14:16:41.618902Z",
+     "shell.execute_reply": "2023-05-03T14:16:41.617979Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "id               0\n",
+       "comment_text     0\n",
+       "toxic            0\n",
+       "severe_toxic     0\n",
+       "obscene          0\n",
+       "threat           0\n",
+       "insult           0\n",
+       "identity_hate    0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:44.561198Z",
+     "iopub.status.busy": "2023-05-03T14:16:44.560414Z",
+     "iopub.status.idle": "2023-05-03T14:16:44.586487Z",
+     "shell.execute_reply": "2023-05-03T14:16:44.585582Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>toxic</th>\n",
+       "      <td>144277</td>\n",
+       "      <td>15294</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>severe_toxic</th>\n",
+       "      <td>157976</td>\n",
+       "      <td>1595</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>obscene</th>\n",
+       "      <td>151122</td>\n",
+       "      <td>8449</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>threat</th>\n",
+       "      <td>159093</td>\n",
+       "      <td>478</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>insult</th>\n",
+       "      <td>151694</td>\n",
+       "      <td>7877</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>identity_hate</th>\n",
+       "      <td>158166</td>\n",
+       "      <td>1405</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    0      1\n",
+       "toxic          144277  15294\n",
+       "severe_toxic   157976   1595\n",
+       "obscene        151122   8449\n",
+       "threat         159093    478\n",
+       "insult         151694   7877\n",
+       "identity_hate  158166   1405"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:51.639830Z",
+     "iopub.status.busy": "2023-05-03T14:16:51.639059Z",
+     "iopub.status.idle": "2023-05-03T14:16:51.658065Z",
+     "shell.execute_reply": "2023-05-03T14:16:51.657049Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "toxic value count\n",
+      "--------------------\n",
+      "0: 144277 | 90.42 %\n",
+      "1: 15294 | 9.58 %\n",
+      "\n",
+      "severe_toxic value count\n",
+      "--------------------\n",
+      "0: 157976 | 99.0 %\n",
+      "1: 1595 | 1.0 %\n",
+      "\n",
+      "obscene value count\n",
+      "--------------------\n",
+      "0: 151122 | 94.71 %\n",
+      "1: 8449 | 5.29 %\n",
+      "\n",
+      "threat value count\n",
+      "--------------------\n",
+      "0: 159093 | 99.7 %\n",
+      "1: 478 | 0.3 %\n",
+      "\n",
+      "insult value count\n",
+      "--------------------\n",
+      "0: 151694 | 95.06 %\n",
+      "1: 7877 | 4.94 %\n",
+      "\n",
+      "identity_hate value count\n",
+      "--------------------\n",
+      "0: 158166 | 99.12 %\n",
+      "1: 1405 | 0.88 %\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the absolute and relative frequency of classes 0 and 1 for every non-text column.\n",
+    "n_rows = data.shape[0]\n",
+    "for col_name in data.columns:\n",
+    "    if data[col_name].dtype == 'O':\n",
+    "        continue  # object-dtype columns are skipped\n",
+    "    counts = data[col_name].value_counts()\n",
+    "    negatives, positives = counts[0], counts[1]\n",
+    "    print(f\"{col_name} value count\\n{'--'*10}\")\n",
+    "    print(f\"0: {negatives} | {round((negatives/n_rows)*100,2)} %\\n\"\n",
+    "          f\"1: {positives} | {round((positives/n_rows)*100,2)} %\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Word count per comment, kept in a standalone Series: storing it as a new column of\n",
+    "# `data` would leak a non-label column into any later positional column slice that\n",
+    "# builds the label matrix.\n",
+    "text_len = data[\"comment_text\"].str.split().str.len()\n",
+    "# Show the comment(s) with the largest word count.\n",
+    "data.loc[text_len == text_len.max(), \"comment_text\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:16:58.642154Z",
+     "iopub.status.busy": "2023-05-03T14:16:58.641279Z",
+     "iopub.status.idle": "2023-05-03T14:16:58.648851Z",
+     "shell.execute_reply": "2023-05-03T14:16:58.647773Z",
+     "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Select the six label columns by name rather than by position, so that a helper column\n",
+    "# added to `data` earlier in the notebook cannot end up in the target matrix.\n",
+    "LABEL_COLUMNS = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n",
+    "X = data['comment_text']\n",
+    "y = data[LABEL_COLUMNS].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:02.919383Z",
+     "iopub.status.busy": "2023-05-03T14:17:02.918865Z",
+     "iopub.status.idle": "2023-05-03T14:17:02.927191Z",
+     "shell.execute_reply": "2023-05-03T14:17:02.926293Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         Explanation\\nWhy the edits made under my usern...\n",
+       "1         D'aww! He matches this background colour I'm s...\n",
+       "2         Hey man, I'm really not trying to edit war. It...\n",
+       "3         \"\\nMore\\nI can't make any real suggestions on ...\n",
+       "4         You, sir, are my hero. Any chance you remember...\n",
+       "                                ...                        \n",
+       "159566    \":::::And for the second time of asking, when ...\n",
+       "159567    You should be ashamed of yourself \\n\\nThat is ...\n",
+       "159568    Spitzer \\n\\nUmm, theres no actual article for ...\n",
+       "159569    And it looks like it was actually you who put ...\n",
+       "159570    \"\\nAnd ... I really don't think you understand...\n",
+       "Name: comment_text, Length: 159571, dtype: object"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:08.246451Z",
+     "iopub.status.busy": "2023-05-03T14:17:08.245491Z",
+     "iopub.status.idle": "2023-05-03T14:17:08.252604Z",
+     "shell.execute_reply": "2023-05-03T14:17:08.251608Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       ...,\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0]])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Text Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:25.208007Z",
+     "iopub.status.busy": "2023-05-03T14:17:25.207157Z",
+     "iopub.status.idle": "2023-05-03T14:17:25.220446Z",
+     "shell.execute_reply": "2023-05-03T14:17:25.219390Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Text_Cleaner:\n",
+    "    \"\"\"Normalise a pandas Series of raw comments.\n",
+    "\n",
+    "    clean_text() applies, in order: newline removal + lower-casing, punctuation\n",
+    "    stripping, English stop-word removal, WordNet lemmatization and a final strip.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Translation table that deletes every string.punctuation character; built once\n",
+    "    # here instead of on every remove_punctuations() call.\n",
+    "    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        \"\"\"Store the Series to clean and load the stop-word list and lemmatizer.\"\"\"\n",
+    "        self.data = data\n",
+    "        self.STOPWORDS = stopwords.words('english')\n",
+    "        # Set copy of STOPWORDS: membership is tested once per token, and a list\n",
+    "        # lookup would scan the whole stop-word list every time.\n",
+    "        self._stopword_set = frozenset(self.STOPWORDS)\n",
+    "        self.wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "    def new_line_code(self, x:str)->str:\n",
+    "        \"\"\"Replace newlines with spaces, strip both ends and lower-case the text.\"\"\"\n",
+    "        return x.replace(\"\\n\", ' ').strip().lower()\n",
+    "\n",
+    "    def remove_punctuations(self, x:str)->str:\n",
+    "        \"\"\"Delete every character that appears in string.punctuation.\"\"\"\n",
+    "        return x.translate(self._PUNCTUATION_TABLE)\n",
+    "\n",
+    "    def remove_stopwords(self, x:str)->str:\n",
+    "        \"\"\"Drop whitespace-separated tokens that are in the stop-word list.\"\"\"\n",
+    "        return ' '.join(word for word in x.split() if word not in self._stopword_set)\n",
+    "\n",
+    "    def lemmatization(self, x:str)->str:\n",
+    "        \"\"\"Lemmatize each whitespace-separated token with the WordNet lemmatizer.\"\"\"\n",
+    "        return ' '.join(self.wordnet.lemmatize(word) for word in x.split())\n",
+    "\n",
+    "    def _clean_one(self, x:str)->str:\n",
+    "        \"\"\"Run the whole cleaning pipeline on a single comment.\"\"\"\n",
+    "        x = self.new_line_code(x)\n",
+    "        # NOTE(review): punctuation is stripped before stop-word filtering, so a contraction\n",
+    "        # such as \"don't\" reaches the filter as \"dont\" -- confirm this ordering is intended.\n",
+    "        x = self.remove_punctuations(x)\n",
+    "        x = self.remove_stopwords(x)\n",
+    "        x = self.lemmatization(x)\n",
+    "        return x.strip()\n",
+    "\n",
+    "    def clean_text(self):\n",
+    "        \"\"\"Clean every comment in one pass, store the result on self.data and return it.\"\"\"\n",
+    "        self.data = self.data.apply(self._clean_one)\n",
+    "        return self.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:17:28.812213Z",
+     "iopub.status.busy": "2023-05-03T14:17:28.811115Z",
+     "iopub.status.idle": "2023-05-03T14:18:45.134664Z",
+     "shell.execute_reply": "2023-05-03T14:18:45.133093Z",
+     "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "X = Text_Cleaner(X).clean_text()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:19:08.971107Z",
+     "iopub.status.busy": "2023-05-03T14:19:08.969951Z",
+     "iopub.status.idle": "2023-05-03T14:19:08.979371Z",
+     "shell.execute_reply": "2023-05-03T14:19:08.978320Z",
+     "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         explanation edits made username hardcore metal...\n",
+       "1         daww match background colour im seemingly stuc...\n",
+       "2         hey man im really trying edit war guy constant...\n",
+       "3         cant make real suggestion improvement wondered...\n",
+       "4                       sir hero chance remember page thats\n",
+       "                                ...                        \n",
+       "159566    second time asking view completely contradicts...\n",
+       "159567       ashamed horrible thing put talk page 128611993\n",
+       "159568    spitzer umm there actual article prostitution ...\n",
+       "159569    look like actually put speedy first version de...\n",
+       "159570    really dont think understand came idea bad rig...\n",
+       "Name: comment_text, Length: 159571, dtype: object"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Model Building"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:24.692312Z",
+     "iopub.status.busy": "2023-05-03T14:42:24.691267Z",
+     "iopub.status.idle": "2023-05-03T14:42:24.709520Z",
+     "shell.execute_reply": "2023-05-03T14:42:24.708295Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# tf.data input pipeline over (comment, labels) pairs.\n",
+    "dataset = tf.data.Dataset.from_tensor_slices((X, y))\n",
+    "dataset = dataset.cache()\n",
+    "# Keep ONE fixed shuffle order. With the default reshuffle_each_iteration=True every new\n",
+    "# pass over the dataset is shuffled again, so take()/skip() partitions overlap from one\n",
+    "# epoch to the next and separate passes over the same partition do not line up row by row.\n",
+    "# Trade-off: batches are then visited in the same order in every epoch.\n",
+    "dataset = dataset.shuffle(Config.BUFFER_SIZE, reshuffle_each_iteration=False)\n",
+    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:27.187117Z",
+     "iopub.status.busy": "2023-05-03T14:42:27.185929Z",
+     "iopub.status.idle": "2023-05-03T14:42:27.196570Z",
+     "shell.execute_reply": "2023-05-03T14:42:27.195443Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# 80/20 split counted in batches. Validation takes every batch after the training prefix,\n",
+    "# so integer truncation of the two fractions cannot silently drop the last batch.\n",
+    "n_train_batches = int(len(dataset)*0.8)\n",
+    "train = dataset.take(n_train_batches)\n",
+    "val = dataset.skip(n_train_batches)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:41:54.920944Z",
+     "iopub.status.busy": "2023-05-03T14:41:54.920085Z",
+     "iopub.status.idle": "2023-05-03T14:41:54.928526Z",
+     "shell.execute_reply": "2023-05-03T14:41:54.927502Z",
+     "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def create_model(vectorizer):\n",
+    "    \"\"\"Assemble the (uncompiled) bidirectional-LSTM classifier.\n",
+    "\n",
+    "    vectorizer: layer placed first in the stack, ahead of the embedding.\n",
+    "    Returns a Sequential model whose last layer has 6 sigmoid units.\n",
+    "    \"\"\"\n",
+    "    model = Sequential()\n",
+    "    model.add(vectorizer)\n",
+    "    model.add(Embedding(Config.VOCAB_SIZE+1, 32))\n",
+    "    # Recurrent encoder: the first layer returns the full sequence for the second one.\n",
+    "    model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))\n",
+    "    model.add(Bidirectional(LSTM(32)))\n",
+    "    # Dense head with light dropout between the hidden layers.\n",
+    "    model.add(Dense(128, activation='relu'))\n",
+    "    model.add(Dropout(0.1))\n",
+    "    model.add(Dense(256, activation='relu'))\n",
+    "    model.add(Dropout(0.1))\n",
+    "    model.add(Dense(128, activation='relu'))\n",
+    "    model.add(Dense(6, activation='sigmoid'))\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:41:41.900942Z",
+     "iopub.status.busy": "2023-05-03T14:41:41.900504Z",
+     "iopub.status.idle": "2023-05-03T14:41:41.908480Z",
+     "shell.execute_reply": "2023-05-03T14:41:41.907187Z",
+     "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def callbacks(base_dir=\".\"):\n",
+    "    \"\"\"Create the training callbacks: early stopping plus best-model checkpointing.\n",
+    "\n",
+    "    base_dir: directory that Config.CHECKPOINT_DIR is resolved against. With the\n",
+    "        default \".\" (or an absolute Config.CHECKPOINT_DIR) the checkpoint location\n",
+    "        is the same as before this parameter was honoured.\n",
+    "    Returns the list [EarlyStopping, ModelCheckpoint].\n",
+    "    \"\"\"\n",
+    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
+    "\n",
+    "    # base_dir used to be accepted but ignored; it now prefixes the checkpoint path.\n",
+    "    ckpt_file = os.path.join(base_dir, Config.CHECKPOINT_DIR, \"model\")\n",
+    "    os.makedirs(ckpt_file, exist_ok=True)\n",
+    "\n",
+    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+    "      filepath = ckpt_file,\n",
+    "      save_best_only = True)\n",
+    "\n",
+    "    return [early_stopping, ckpt_cb]\n",
+    "callbacks_list = callbacks()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:07.719948Z",
+     "iopub.status.busy": "2023-05-03T14:42:07.719137Z",
+     "iopub.status.idle": "2023-05-03T14:42:09.288990Z",
+     "shell.execute_reply": "2023-05-03T14:42:09.287682Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Learn the vocabulary from the cleaned comments, then build and compile the classifier.\n",
+    "vectorizer = TextVectorization(output_mode='int',\n",
+    "                               max_tokens=Config.VOCAB_SIZE,\n",
+    "                               output_sequence_length=Config.OUTPUT_DIM)\n",
+    "vectorizer.adapt(X.values)\n",
+    "\n",
+    "model = create_model(vectorizer)\n",
+    "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)\n",
+    "auc_metric = AUC(multi_label=True, num_labels=6)  # one AUC per label, then averaged\n",
+    "model.compile(optimizer=optimizer,\n",
+    "              loss=tf.keras.losses.binary_crossentropy,\n",
+    "              metrics=auc_metric)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:34.084064Z",
+     "iopub.status.busy": "2023-05-03T14:42:34.083255Z",
+     "iopub.status.idle": "2023-05-03T14:42:34.110375Z",
+     "shell.execute_reply": "2023-05-03T14:42:34.109380Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T14:42:42.306143Z",
+     "iopub.status.busy": "2023-05-03T14:42:42.305188Z",
+     "iopub.status.idle": "2023-05-03T18:36:14.400588Z",
+     "shell.execute_reply": "2023-05-03T18:36:14.399250Z",
+     "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "history = model.fit(train, \n",
+    "                    epochs=Config.EPOCHS,\n",
+    "                    steps_per_epoch=len(train),\n",
+    "                    validation_data=val,\n",
+    "                    callbacks=callbacks_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:36:42.693133Z",
+     "iopub.status.busy": "2023-05-03T18:36:42.692246Z",
+     "iopub.status.idle": "2023-05-03T18:36:42.702544Z",
+     "shell.execute_reply": "2023-05-03T18:36:42.701196Z",
+     "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, pred_data: pd.Series, y_true, threshold: float = 0.5):\n",
+    "    \"\"\"Score `model` on `pred_data`, print the metrics and return them.\n",
+    "\n",
+    "    model: object whose predict(pred_data) returns one score per label and row.\n",
+    "    pred_data: inputs forwarded unchanged to model.predict (the evaluation cells of\n",
+    "        this notebook pass NumPy arrays, despite the Series annotation).\n",
+    "    y_true: binary ground-truth label matrix, aligned row by row with pred_data.\n",
+    "    threshold: scores strictly greater than this value count as positive predictions.\n",
+    "\n",
+    "    Returns the tuple (precision, recall, f1, auc), all macro-averaged.\n",
+    "    Any error raised while predicting or scoring propagates to the caller; it used\n",
+    "    to be printed and then hidden behind an UnboundLocalError.\n",
+    "    \"\"\"\n",
+    "    # Local import so the function does not rely on the notebook's import cell\n",
+    "    # providing precision_score and recall_score.\n",
+    "    from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score\n",
+    "\n",
+    "    y_pred = model.predict(pred_data)\n",
+    "    y_label = (y_pred>threshold).astype(int)  # binarise once, reuse for all three metrics\n",
+    "\n",
+    "    precision = precision_score(y_true, y_label, average=\"macro\")\n",
+    "    recall = recall_score(y_true, y_label, average=\"macro\")\n",
+    "    f1 = f1_score(y_true, y_label, average=\"macro\")\n",
+    "    # ROC-AUC is threshold-free, so it is computed from the raw scores.\n",
+    "    auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
+    "\n",
+    "    print(f\"Precision: {precision}\\n\"\n",
+    "          f\"Recall: {recall}\\n\"\n",
+    "          f\"F1-Score: {f1}\\n\"\n",
+    "          f\"ROC-AUC-Score: {auc}\")\n",
+    "    return (precision, recall, f1, auc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:36:28.884733Z",
+     "iopub.status.busy": "2023-05-03T18:36:28.883953Z",
+     "iopub.status.idle": "2023-05-03T18:36:29.233282Z",
+     "shell.execute_reply": "2023-05-03T18:36:29.231964Z",
+     "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model.save(\"model_5\", save_format='tf')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:51:24.530412Z",
+     "iopub.status.busy": "2023-05-03T18:51:24.529307Z",
+     "iopub.status.idle": "2023-05-03T19:20:36.675080Z",
+     "shell.execute_reply": "2023-05-03T19:20:36.673739Z",
+     "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Materialise the split in ONE pass so inputs and labels come from the same iteration;\n",
+    "# two independent passes over a shuffled dataset need not yield the same order.\n",
+    "train_pairs = [(x_batch, y_batch) for x_batch, y_batch in train]\n",
+    "x_train = np.concatenate([x_batch for x_batch, _ in train_pairs])\n",
+    "y_train = np.concatenate([y_batch for _, y_batch in train_pairs])\n",
+    "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-05-03T18:49:02.718178Z",
+     "iopub.status.busy": "2023-05-03T18:49:02.717234Z",
+     "iopub.status.idle": "2023-05-03T18:49:50.438077Z",
+     "shell.execute_reply": "2023-05-03T18:49:50.436458Z",
+     "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Materialise the split in ONE pass so inputs and labels come from the same iteration;\n",
+    "# two independent passes over a shuffled dataset need not yield the same order.\n",
+    "val_pairs = [(x_batch, y_batch) for x_batch, y_batch in val]\n",
+    "x_val = np.concatenate([x_batch for x_batch, _ in val_pairs])\n",
+    "y_val = np.concatenate([y_batch for _, y_batch in val_pairs])\n",
+    "# Stored under its own name so the training-set result is not overwritten.\n",
+    "result_val=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

experiment_notebooks/Transformer-Roberta-Hidden-state.ipynb.ipynb ADDED Viewed

	@@ -0,0 +1 @@

experiment_notebooks/Transformer-Roberta-Pooler-state.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install evaluate seaborn datasets transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:10:38.588160Z","iopub.execute_input":"2023-05-12T08:10:38.588801Z","iopub.status.idle":"2023-05-12T08:11:05.038848Z","shell.execute_reply.started":"2023-05-12T08:10:38.588769Z","shell.execute_reply":"2023-05-12T08:11:05.037913Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:11:23.828906Z","iopub.execute_input":"2023-05-12T08:11:23.829355Z","iopub.status.idle":"2023-05-12T08:12:04.965072Z","shell.execute_reply.started":"2023-05-12T08:11:23.829319Z","shell.execute_reply":"2023-05-12T08:12:04.964003Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 08:11:57.679357494 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 08:11:57.679397945 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 08:11:57.679401929 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 08:11:57.679404612 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 08:11:57.679406720 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 08:11:57.679409002 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 08:11:57.679411626 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 08:11:57.679414267 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 08:11:57.679416463 14 
config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 08:11:57.679418575 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 08:11:57.679420670 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 08:11:57.679422786 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 08:11:57.679424925 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 08:11:57.679427123 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 08:11:57.679611260 14 ev_epoll1_linux.cc:122] grpc epoll fd: 62\nD0512 08:11:57.685069810 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 08:11:57.685091110 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 08:11:57.685503222 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 08:11:57.685513372 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 08:11:57.685516328 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 08:11:57.685518925 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 08:11:57.685521601 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 08:11:57.685524245 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 08:11:57.685530262 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 08:11:57.685544918 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 08:11:57.685567780 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 08:11:57.685580119 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 08:11:57.685583175 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 08:11:57.685586100 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 08:11:57.685591323 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 08:11:57.685594369 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 08:11:57.685597356 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 08:11:57.685601004 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 08:11:57.687522778 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 08:11:57.713894001 315 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 08:11:57.732068033 315 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {grpc_status:2, created_time:\"2023-05-12T08:11:57.73203804+00:00\"}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:04.967007Z","iopub.execute_input":"2023-05-12T08:12:04.967631Z","iopub.status.idle":"2023-05-12T08:12:15.185956Z","shell.execute_reply.started":"2023-05-12T08:12:04.967600Z","shell.execute_reply":"2023-05-12T08:12:15.184914Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 3 #2\n MODEL = \"xlm-roberta-large\"\n BUFFER_SIZE = 2048\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 1e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.187164Z","iopub.execute_input":"2023-05-12T08:12:15.187478Z","iopub.status.idle":"2023-05-12T08:12:15.193012Z","shell.execute_reply.started":"2023-05-12T08:12:15.187450Z","shell.execute_reply":"2023-05-12T08:12:15.192060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = 
pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.195496Z","iopub.execute_input":"2023-05-12T08:12:15.195907Z","iopub.status.idle":"2023-05-12T08:12:42.439698Z","shell.execute_reply.started":"2023-05-12T08:12:15.195884Z","shell.execute_reply":"2023-05-12T08:12:42.438494Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 
0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 59848 This is so cool. It's like, 'would you want yo... 0.000000 \\\n1 59849 Thank you!! This would make my life a lot less... 0.000000 \n2 59852 This is such an urgent design problem; kudos t... 0.000000 \n3 59855 Is this something I'll be able to install on m... 0.000000 \n4 59856 haha you guys are a bunch of losers. 0.893617 \n\n severe_toxicity obscene identity_attack insult threat asian atheist \n0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \\\n1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n\n ... article_id rating funny wow sad likes disagree \n0 ... 2006 rejected 0 0 0 0 0 \\\n1 ... 2006 rejected 0 0 0 0 0 \n2 ... 2006 rejected 0 0 0 0 0 \n3 ... 2006 rejected 0 0 0 0 0 \n4 ... 
2006 rejected 0 0 0 1 0 \n\n sexual_explicit identity_annotator_count toxicity_annotator_count \n0 0.0 0 4 \n1 0.0 0 4 \n2 0.0 0 4 \n3 0.0 0 4 \n4 0.0 4 47 \n\n[5 rows x 45 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxicity</th>\n <th>obscene</th>\n <th>identity_attack</th>\n <th>insult</th>\n <th>threat</th>\n <th>asian</th>\n <th>atheist</th>\n <th>...</th>\n <th>article_id</th>\n <th>rating</th>\n <th>funny</th>\n <th>wow</th>\n <th>sad</th>\n <th>likes</th>\n <th>disagree</th>\n <th>sexual_explicit</th>\n <th>identity_annotator_count</th>\n <th>toxicity_annotator_count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>59848</td>\n <td>This is so cool. It's like, 'would you want yo...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>59849</td>\n <td>Thank you!! 
This would make my life a lot less...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>59852</td>\n <td>This is such an urgent design problem; kudos t...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>59855</td>\n <td>Is this something I'll be able to install on m...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>59856</td>\n <td>haha you guys are a bunch of losers.</td>\n <td>0.893617</td>\n <td>0.021277</td>\n <td>0.0</td>\n <td>0.021277</td>\n <td>0.87234</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0.0</td>\n <td>4</td>\n <td>47</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 45 
columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text lang toxic\n0 0 Este usuario ni siquiera llega al rango de ... es 0\n1 1 Il testo di questa voce pare esser scopiazzato... it 0\n2 2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... es 1\n3 3 Bu maddenin alt başlığı olarak uluslararası i... tr 0\n4 4 Belçika nın şehirlerinin yanında ilçe ve belde... tr 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>es</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>it</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>es</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id content lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>content</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0 202165\n1 21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000 1333035\n0.166667 138501\n0.200000 113271\n0.300000 62195\n0.400000 52703\n ... 
\n0.037609 1\n0.971193 1\n0.988430 1\n0.008309 1\n0.967316 1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0 6770\n1 1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr 3000\nes 2500\nit 2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr 14000\npt 11012\nru 10948\nfr 10920\nit 8494\nes 8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, 
inplace=True)\n# Explicit copy: `sub` gets a new column assigned later, so it must not be a view/slice of `test`.\nsub = test[['id']].copy()\ntrain2.toxic = (train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.613596Z","iopub.execute_input":"2023-05-12T08:12:42.613863Z","iopub.status.idle":"2023-05-12T08:12:42.689129Z","shell.execute_reply.started":"2023-05-12T08:12:42.613841Z","shell.execute_reply":"2023-05-12T08:12:42.687961Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.690307Z","iopub.execute_input":"2023-05-12T08:12:42.690632Z","iopub.status.idle":"2023-05-12T08:12:42.714449Z","shell.execute_reply.started":"2023-05-12T08:12:42.690603Z","shell.execute_reply":"2023-05-12T08:12:42.712954Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"toxic\n0 1789968\n1 112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)\n# The concatenation is ordered by source and label (train1, then all-toxic rows, then all-non-toxic rows).\n# Shuffle the rows once here so batches are class-mixed; a small tf.data shuffle buffer cannot undo that ordering.\ntrain = train.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.717790Z","iopub.execute_input":"2023-05-12T08:12:42.719209Z","iopub.status.idle":"2023-05-12T08:12:43.083471Z","shell.execute_reply.started":"2023-05-12T08:12:42.719178Z","shell.execute_reply":"2023-05-12T08:12:43.082364Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.084697Z","iopub.execute_input":"2023-05-12T08:12:43.084992Z","iopub.status.idle":"2023-05-12T08:12:43.091149Z","shell.execute_reply.started":"2023-05-12T08:12:43.084966Z","shell.execute_reply":"2023-05-12T08:12:43.090173Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"(535775, 
2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.092288Z","iopub.execute_input":"2023-05-12T08:12:43.092591Z","iopub.status.idle":"2023-05-12T08:12:43.108428Z","shell.execute_reply.started":"2023-05-12T08:12:43.092565Z","shell.execute_reply":"2023-05-12T08:12:43.107388Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Explanation\\nWhy the edits made under my usern... 0\n1 D'aww! He matches this background colour I'm s... 0\n2 Hey man, I'm really not trying to edit war. It... 0\n3 \"\\nMore\\nI can't make any real suggestions on ... 0\n4 You, sir, are my hero. Any chance you remember... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.109560Z","iopub.execute_input":"2023-05-12T08:12:43.109882Z","iopub.status.idle":"2023-05-12T08:12:43.124398Z","shell.execute_reply.started":"2023-05-12T08:12:43.109856Z","shell.execute_reply":"2023-05-12T08:12:43.123602Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Este usuario ni siquiera llega al rango de ... 0\n1 Il testo di questa voce pare esser scopiazzato... 0\n2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... 1\n3 Bu maddenin alt başlığı olarak uluslararası i... 0\n4 Belçika nın şehirlerinin yanında ilçe ve belde... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.125574Z","iopub.execute_input":"2023-05-12T08:12:43.125871Z","iopub.status.idle":"2023-05-12T08:12:43.140740Z","shell.execute_reply.started":"2023-05-12T08:12:43.125845Z","shell.execute_reply":"2023-05-12T08:12:43.139834Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":" id comment_text lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.141938Z","iopub.execute_input":"2023-05-12T08:12:43.142286Z","iopub.status.idle":"2023-05-12T08:12:43.152137Z","shell.execute_reply.started":"2023-05-12T08:12:43.142257Z","shell.execute_reply":"2023-05-12T08:12:43.151267Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.153204Z","iopub.execute_input":"2023-05-12T08:12:43.153504Z","iopub.status.idle":"2023-05-12T08:12:44.735211Z","shell.execute_reply.started":"2023-05-12T08:12:43.153479Z","shell.execute_reply":"2023-05-12T08:12:44.734019Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = 
AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:44.736464Z","iopub.execute_input":"2023-05-12T08:12:44.736759Z","iopub.status.idle":"2023-05-12T08:12:46.680516Z","shell.execute_reply.started":"2023-05-12T08:12:44.736733Z","shell.execute_reply":"2023-05-12T08:12:46.679299Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stderr","text":"Downloading (…)lve/main/config.json: 100%|██████████| 616/616 [00:00<00:00, 133kB/s]\nDownloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 61.2MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 38.3MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n return tokenizer(text_data.comment_text.values.tolist(), \n max_length=max_len, \n truncation=True, \n padding=\"max_length\",\n add_special_tokens=True,\n return_tensors=\"tf\",\n return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.681935Z","iopub.execute_input":"2023-05-12T08:12:46.682277Z","iopub.status.idle":"2023-05-12T08:12:46.688026Z","shell.execute_reply.started":"2023-05-12T08:12:46.682252Z","shell.execute_reply":"2023-05-12T08:12:46.687060Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.689142Z","iopub.execute_input":"2023-05-12T08:12:46.689525Z","iopub.status.idle":"2023-05-12T08:13:40.477757Z","shell.execute_reply.started":"2023-05-12T08:12:46.689501Z","shell.execute_reply":"2023-05-12T08:13:40.476350Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), 
train[\"toxic\"]))\n # Shuffle before repeat so each pass over the data is reshuffled without mixing samples across epoch boundaries.\n .shuffle(Config.BUFFER_SIZE)\n .repeat()\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:13:40.479182Z","iopub.execute_input":"2023-05-12T08:13:40.479716Z","iopub.status.idle":"2023-05-12T08:13:40.514773Z","shell.execute_reply.started":"2023-05-12T08:13:40.479687Z","shell.execute_reply":"2023-05-12T08:13:40.513645Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformers_layers, max_len=Config.MAX_LEN):\n \"\"\"Build and compile a binary classifier: element [1] of the roberta layer output -> Dense(1024, relu) -> Dense(1, sigmoid).\"\"\"\n input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n roberta_layers = transformers_layers.roberta(input_ids, attention_mask=masks)[1]\n intermediate = Dense(1024, activation='relu')(roberta_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:18.861686Z","iopub.execute_input":"2023-05-12T08:19:18.862636Z","iopub.status.idle":"2023-05-12T08:19:18.872779Z","shell.execute_reply.started":"2023-05-12T08:19:18.862595Z","shell.execute_reply":"2023-05-12T08:19:18.871516Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformers_layers = TFAutoModel.from_pretrained(Config.MODEL)\n model = 
model_builder(transformers_layers=transformers_layers)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:23.540792Z","iopub.execute_input":"2023-05-12T08:19:23.541710Z","iopub.status.idle":"2023-05-12T08:19:58.819514Z","shell.execute_reply.started":"2023-05-12T08:19:23.541670Z","shell.execute_reply":"2023-05-12T08:19:58.818311Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stderr","text":"All model checkpoint layers were used when initializing TFXLMRobertaModel.\n\nAll the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-large.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:58.821255Z","iopub.execute_input":"2023-05-12T08:19:58.821564Z","iopub.status.idle":"2023-05-12T08:19:58.877105Z","shell.execute_reply.started":"2023-05-12T08:19:58.821537Z","shell.execute_reply":"2023-05-12T08:19:58.876009Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stdout","text":"Model: \"model_2\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n roberta (TFXLMRobertaMainLayer TFBaseModelOutputWi 559890432 ['input_ids[0][0]', \n ) thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 1024), \n pooler_output=(Non \n e, 1024), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n dense_4 (Dense) (None, 1024) 1049600 ['roberta[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 
['dense_4[0][0]'] \n \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:20:30.201166Z","iopub.execute_input":"2023-05-12T08:20:30.201570Z","iopub.status.idle":"2023-05-12T09:40:52.828332Z","shell.execute_reply.started":"2023-05-12T08:20:30.201539Z","shell.execute_reply":"2023-05-12T09:40:52.826896Z"},"trusted":true},"execution_count":39,"outputs":[{"name":"stdout","text":"Epoch 1/3\n","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:21:52.144761: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n2023-05-12 08:21:54.569388: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0501 - auc_2: 0.9972","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:49:07.665397: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 08:49:08.172000: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 
[==============================] - 1747s 375ms/step - loss: 0.0501 - auc_2: 0.9972 - val_loss: 0.3338 - val_auc_2: 0.9137\nEpoch 2/3\n4185/4185 [==============================] - 1538s 367ms/step - loss: 0.0420 - auc_2: 0.9981 - val_loss: 0.2931 - val_auc_2: 0.9114\nEpoch 3/3\n4185/4185 [==============================] - 1537s 367ms/step - loss: 0.0369 - auc_2: 0.9985 - val_loss: 0.3070 - val_auc_2: 0.9039\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:43:06.760317Z","iopub.execute_input":"2023-05-12T09:43:06.760739Z","iopub.status.idle":"2023-05-12T09:43:52.591536Z","shell.execute_reply.started":"2023-05-12T09:43:06.760702Z","shell.execute_reply":"2023-05-12T09:43:52.590324Z"},"trusted":true},"execution_count":41,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0899 - auc_2: 0.9893\nEpoch 2/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0800 - auc_2: 0.9916\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:47:56.071510Z","iopub.execute_input":"2023-05-12T09:47:56.072708Z","iopub.status.idle":"2023-05-12T09:49:15.802261Z","shell.execute_reply.started":"2023-05-12T09:47:56.072664Z","shell.execute_reply":"2023-05-12T09:49:15.800711Z"},"trusted":true},"execution_count":42,"outputs":[{"name":"stderr","text":"2023-05-12 09:48:05.583905: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node 
AssignAddVariableOp.\n2023-05-12 09:48:05.992232: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 79s 118ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:49:24.580208Z","iopub.execute_input":"2023-05-12T09:49:24.580625Z","iopub.status.idle":"2023-05-12T09:50:44.681561Z","shell.execute_reply.started":"2023-05-12T09:49:24.580595Z","shell.execute_reply":"2023-05-12T09:50:44.680112Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). 
These functions will not be directly callable after loading.\n","output_type":"stream"},{"name":"stdout","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"},{"name":"stderr","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"}]},{"cell_type":"code","source":"import shutil\nshutil.make_archive(\"roberta-fine-tuned-2\",\"zip\",'/kaggle/working/roberta-fine-tuned-2')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:53:15.505782Z","iopub.execute_input":"2023-05-12T09:53:15.506262Z","iopub.status.idle":"2023-05-12T10:00:10.288432Z","shell.execute_reply.started":"2023-05-12T09:53:15.506226Z","shell.execute_reply":"2023-05-12T10:00:10.287215Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/roberta-fine-tuned-2.zip'"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2-best\", save_format='h5')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:06:24.426264Z","iopub.execute_input":"2023-05-12T10:06:24.426727Z","iopub.status.idle":"2023-05-12T10:06:40.506795Z","shell.execute_reply.started":"2023-05-12T10:06:24.426692Z","shell.execute_reply":"2023-05-12T10:06:40.505341Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"model = tf.keras.models.load_model('/kaggle/working/roberta-fine-tuned-2-best')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:07:36.737706Z","iopub.execute_input":"2023-05-12T10:07:36.738837Z","iopub.status.idle":"2023-05-12T10:07:59.902966Z","shell.execute_reply.started":"2023-05-12T10:07:36.738795Z","shell.execute_reply":"2023-05-12T10:07:59.901400Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"\"\"\"%%capture\n!pip install 
git+https://github.com/huggingface/huggingface_hub.git@main\n!sudo apt -qq install git-lfs\n!git config --global credential.helper store\"\"\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!huggingface-cli login --token $HF_TOKEN","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:12:13.025974Z","iopub.execute_input":"2023-05-12T10:12:13.026917Z","iopub.status.idle":"2023-05-12T10:12:15.351277Z","shell.execute_reply.started":"2023-05-12T10:12:13.026877Z","shell.execute_reply":"2023-05-12T10:12:15.349659Z"},"trusted":true},"execution_count":55,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nToken will not been saved to git credential helper. 
Pass `add_to_git_credential=True` if you want to set the git credential as well.\nToken is valid.\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta-best')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import HfApi\napi = HfApi()\napi.upload_folder(\n folder_path=\"/kaggle/working/\",\n repo_id=\"shivansh-ka/Toxic-Comment-Classifier-Multi\",\n repo_type=\"space\",\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T06:59:23.928089Z","iopub.execute_input":"2023-05-12T06:59:23.928495Z","iopub.status.idle":"2023-05-12T06:59:56.375479Z","shell.execute_reply.started":"2023-05-12T06:59:23.928466Z","shell.execute_reply":"2023-05-12T06:59:56.374295Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\nconfig.json not found in HuggingFace Hub.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"84f3f3229b3e42668708162e27df3168"}},"metadata":{}}]},{"cell_type":"code","source":"preds = 
m.predict(test_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:06:50.246933Z","iopub.execute_input":"2023-05-12T07:06:50.247789Z","iopub.status.idle":"2023-05-12T07:29:11.940923Z","shell.execute_reply.started":"2023-05-12T07:06:50.247752Z","shell.execute_reply":"2023-05-12T07:29:11.939745Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"499/499 [==============================] - 1341s 3s/step\n","output_type":"stream"}]},{"cell_type":"code","source":"m.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:31:58.337639Z","iopub.execute_input":"2023-05-12T07:31:58.338344Z","iopub.status.idle":"2023-05-12T07:31:58.425154Z","shell.execute_reply.started":"2023-05-12T07:31:58.338300Z","shell.execute_reply":"2023-05-12T07:31:58.424117Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n roberta (Custom>TFXLMRobertaMa {'pooler_output': ( 559890432 ['input_ids[0][0]', \n inLayer) None, 1024), 'attention_mask[0][0]'] \n 'last_hidden_state \n ': (None, 192, 1024 \n )} \n \n dense (Dense) (None, 1024) 1049600 ['roberta[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 ['dense[0][0]'] \n \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"sub['toxic'] = 
preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:32:36.768119Z","iopub.execute_input":"2023-05-12T07:32:36.768542Z","iopub.status.idle":"2023-05-12T07:32:36.963761Z","shell.execute_reply.started":"2023-05-12T07:32:36.768512Z","shell.execute_reply":"2023-05-12T07:32:36.962584Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}

experiment_notebooks/Transformer-mBert-Hidden-state.ipynb.ipynb ADDED Viewed

	@@ -0,0 +1 @@

experiment_notebooks/Transformer-mBert-Pooler-state.ipynb.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:47:51.413800Z","iopub.execute_input":"2023-05-12T11:47:51.414070Z","iopub.status.idle":"2023-05-12T11:48:18.602918Z","shell.execute_reply.started":"2023-05-12T11:47:51.414046Z","shell.execute_reply":"2023-05-12T11:48:18.601877Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:49:44.573340Z","iopub.execute_input":"2023-05-12T11:49:44.574079Z","iopub.status.idle":"2023-05-12T11:50:25.814142Z","shell.execute_reply.started":"2023-05-12T11:49:44.574046Z","shell.execute_reply":"2023-05-12T11:50:25.812749Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 11:50:18.442119879 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 11:50:18.442154551 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 11:50:18.442158337 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 11:50:18.442160973 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 11:50:18.442163321 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 11:50:18.442165969 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 11:50:18.442168796 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 11:50:18.442171109 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 11:50:18.442173402 14 
config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 11:50:18.442175638 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 11:50:18.442177867 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 11:50:18.442181062 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 11:50:18.442183630 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 11:50:18.442185959 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 11:50:18.442394344 14 ev_epoll1_linux.cc:122] grpc epoll fd: 62\nD0512 11:50:18.453257763 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 11:50:18.453301358 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 11:50:18.453762003 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 11:50:18.453774538 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 11:50:18.453779385 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 11:50:18.453782660 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 11:50:18.453786243 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 11:50:18.453789942 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 11:50:18.453797356 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 11:50:18.453818829 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 11:50:18.453851056 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 11:50:18.453873781 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 11:50:18.453877823 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 11:50:18.453881490 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 11:50:18.453888362 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 11:50:18.453892163 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 11:50:18.453896027 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 11:50:18.453901564 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 11:50:18.456269287 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 11:50:18.476859295 376 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 11:50:18.484409363 376 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {created_time:\"2023-05-12T11:50:18.484390999+00:00\", grpc_status:2}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:25.816399Z","iopub.execute_input":"2023-05-12T11:50:25.817031Z","iopub.status.idle":"2023-05-12T11:50:35.943243Z","shell.execute_reply.started":"2023-05-12T11:50:25.816998Z","shell.execute_reply":"2023-05-12T11:50:35.942201Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n EPOCHS = 3 #2\n MODEL = \"bert-base-multilingual-uncased\"\n BUFFER_SIZE = 2048\n BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n MAX_LEN = 192\n LEARNING_RATE = 1e-5\n WEIGHT_DECAY = 1e-6\n RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.944622Z","iopub.execute_input":"2023-05-12T11:50:35.944945Z","iopub.status.idle":"2023-05-12T11:50:35.950932Z","shell.execute_reply.started":"2023-05-12T11:50:35.944916Z","shell.execute_reply":"2023-05-12T11:50:35.949929Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = 
pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.953167Z","iopub.execute_input":"2023-05-12T11:50:35.953494Z","iopub.status.idle":"2023-05-12T11:51:03.310955Z","shell.execute_reply.started":"2023-05-12T11:50:35.953467Z","shell.execute_reply":"2023-05-12T11:51:03.309809Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 
0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 59848 This is so cool. It's like, 'would you want yo... 0.000000 \\\n1 59849 Thank you!! This would make my life a lot less... 0.000000 \n2 59852 This is such an urgent design problem; kudos t... 0.000000 \n3 59855 Is this something I'll be able to install on m... 0.000000 \n4 59856 haha you guys are a bunch of losers. 0.893617 \n\n severe_toxicity obscene identity_attack insult threat asian atheist \n0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \\\n1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n\n ... article_id rating funny wow sad likes disagree \n0 ... 2006 rejected 0 0 0 0 0 \\\n1 ... 2006 rejected 0 0 0 0 0 \n2 ... 2006 rejected 0 0 0 0 0 \n3 ... 2006 rejected 0 0 0 0 0 \n4 ... 
2006 rejected 0 0 0 1 0 \n\n sexual_explicit identity_annotator_count toxicity_annotator_count \n0 0.0 0 4 \n1 0.0 0 4 \n2 0.0 0 4 \n3 0.0 0 4 \n4 0.0 4 47 \n\n[5 rows x 45 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxicity</th>\n <th>obscene</th>\n <th>identity_attack</th>\n <th>insult</th>\n <th>threat</th>\n <th>asian</th>\n <th>atheist</th>\n <th>...</th>\n <th>article_id</th>\n <th>rating</th>\n <th>funny</th>\n <th>wow</th>\n <th>sad</th>\n <th>likes</th>\n <th>disagree</th>\n <th>sexual_explicit</th>\n <th>identity_annotator_count</th>\n <th>toxicity_annotator_count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>59848</td>\n <td>This is so cool. It's like, 'would you want yo...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>59849</td>\n <td>Thank you!! 
This would make my life a lot less...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>59852</td>\n <td>This is such an urgent design problem; kudos t...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>59855</td>\n <td>Is this something I'll be able to install on m...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>59856</td>\n <td>haha you guys are a bunch of losers.</td>\n <td>0.893617</td>\n <td>0.021277</td>\n <td>0.0</td>\n <td>0.021277</td>\n <td>0.87234</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>2006</td>\n <td>rejected</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0.0</td>\n <td>4</td>\n <td>47</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 45 
columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text lang toxic\n0 0 Este usuario ni siquiera llega al rango de ... es 0\n1 1 Il testo di questa voce pare esser scopiazzato... it 0\n2 2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... es 1\n3 3 Bu maddenin alt başlığı olarak uluslararası i... tr 0\n4 4 Belçika nın şehirlerinin yanında ilçe ve belde... tr 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>es</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>it</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>es</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>tr</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id content lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>content</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0 202165\n1 21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000 1333035\n0.166667 138501\n0.200000 113271\n0.300000 62195\n0.400000 52703\n ... 
\n0.037609 1\n0.971193 1\n0.988430 1\n0.008309 1\n0.967316 1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0 6770\n1 1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr 3000\nes 2500\nit 2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr 14000\npt 11012\nru 10948\nfr 10920\nit 8494\nes 8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, 
inplace=True)\nsub = test[['id']]\ntrain2.toxic = (train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.312161Z","iopub.execute_input":"2023-05-12T11:51:03.312475Z","iopub.status.idle":"2023-05-12T11:51:03.453706Z","shell.execute_reply.started":"2023-05-12T11:51:03.312450Z","shell.execute_reply":"2023-05-12T11:51:03.452741Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.454767Z","iopub.execute_input":"2023-05-12T11:51:03.455331Z","iopub.status.idle":"2023-05-12T11:51:03.481303Z","shell.execute_reply.started":"2023-05-12T11:51:03.455304Z","shell.execute_reply":"2023-05-12T11:51:03.480425Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"toxic\n0 1789968\n1 112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train = pd.concat([train1,\n train2.query(\"toxic==1\"),\n train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.482311Z","iopub.execute_input":"2023-05-12T11:51:03.482966Z","iopub.status.idle":"2023-05-12T11:51:03.827807Z","shell.execute_reply.started":"2023-05-12T11:51:03.482940Z","shell.execute_reply":"2023-05-12T11:51:03.826717Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.829068Z","iopub.execute_input":"2023-05-12T11:51:03.829375Z","iopub.status.idle":"2023-05-12T11:51:03.834997Z","shell.execute_reply.started":"2023-05-12T11:51:03.829350Z","shell.execute_reply":"2023-05-12T11:51:03.834118Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(535775, 
2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.835982Z","iopub.execute_input":"2023-05-12T11:51:03.836269Z","iopub.status.idle":"2023-05-12T11:51:03.854775Z","shell.execute_reply.started":"2023-05-12T11:51:03.836227Z","shell.execute_reply":"2023-05-12T11:51:03.853871Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Explanation\\nWhy the edits made under my usern... 0\n1 D'aww! He matches this background colour I'm s... 0\n2 Hey man, I'm really not trying to edit war. It... 0\n3 \"\\nMore\\nI can't make any real suggestions on ... 0\n4 You, sir, are my hero. Any chance you remember... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. 
Any chance you remember...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.858708Z","iopub.execute_input":"2023-05-12T11:51:03.859118Z","iopub.status.idle":"2023-05-12T11:51:03.866689Z","shell.execute_reply.started":"2023-05-12T11:51:03.859092Z","shell.execute_reply":"2023-05-12T11:51:03.865871Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Este usuario ni siquiera llega al rango de ... 0\n1 Il testo di questa voce pare esser scopiazzato... 0\n2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... 1\n3 Bu maddenin alt başlığı olarak uluslararası i... 0\n4 Belçika nın şehirlerinin yanında ilçe ve belde... 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Este usuario ni siquiera llega al rango de ...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Il testo di questa voce pare esser scopiazzato...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Vale. Sólo expongo mi pasado. 
Todo tiempo pasa...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Bu maddenin alt başlığı olarak uluslararası i...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.867828Z","iopub.execute_input":"2023-05-12T11:51:03.868255Z","iopub.status.idle":"2023-05-12T11:51:03.881894Z","shell.execute_reply.started":"2023-05-12T11:51:03.868213Z","shell.execute_reply":"2023-05-12T11:51:03.881141Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" id comment_text lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>lang</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>Doctor Who adlı viki başlığına 12. 
doctor olar...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>Вполне возможно, но я пока не вижу необходимо...</td>\n <td>ru</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>Quindi tu sei uno di quelli conservativi , ...</td>\n <td>it</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n <td>tr</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n <td>tr</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.882947Z","iopub.execute_input":"2023-05-12T11:51:03.883338Z","iopub.status.idle":"2023-05-12T11:51:03.892723Z","shell.execute_reply.started":"2023-05-12T11:51:03.883311Z","shell.execute_reply":"2023-05-12T11:51:03.891955Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"import re\n\n# Same clean-up for every split: turn embedded newlines into spaces, then trim.\nfor frame in (train, val, test):\n    frame['comment_text'] = frame['comment_text'].apply(lambda text: re.sub('\\n',' ',text).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.893692Z","iopub.execute_input":"2023-05-12T11:51:03.894038Z","iopub.status.idle":"2023-05-12T11:51:05.368808Z","shell.execute_reply.started":"2023-05-12T11:51:03.894014Z","shell.execute_reply":"2023-05-12T11:51:05.367736Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 
30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:05.369914Z","iopub.execute_input":"2023-05-12T11:51:05.370196Z","iopub.status.idle":"2023-05-12T11:51:08.102915Z","shell.execute_reply.started":"2023-05-12T11:51:05.370173Z","shell.execute_reply":"2023-05-12T11:51:08.101871Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"56.28243572395129\n2321\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGdCAYAAAD+JxxnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+BklEQVR4nO3de3BU9f3/8VcSkw0BNuFiElICRFEhykVCDfutOlxCFsw4UqODymhEhIFv4jSkBZv+MNzawWK5WYJpqxA6SgU61VagIWuQUMsCEki5CaMWv7Ff2GAVWAiwWZLz+6OT82UJQhYWVjnPx8xO3fN5n7Of/byT8OrZc5IIwzAMAQAAWFBkuCcAAAAQLgQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWQQhAABgWbeEewLfZs3NzTpy5Ig6duyoiIiIcE8HAAC0gWEYOnXqlFJSUhQZeflzPgShyzhy5IhSU1PDPQ0AAHAVvvjiC3Xv3v2yNQShy+jYsaOk/yyk3W4P6bH9fr8qKyuVnZ2t6OjokB4bbUMPwo8ehB89CD96EHper1epqanmv+OXQxC6jJaPw+x2+3UJQnFxcbLb7Xzhhwk9CD96EH70IPzowfXTlstauFgaAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABY1i3hnoDV3TNro3xNEUHv9/nLOddhNgAAWAtnhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGURhAAAgGVdUxB6+eWXFRERocLCQnPbuXPnlJ+fry5duqhDhw7Kzc1VfX19wH51dXXKyclRXFycEhMTNW3aNJ0/fz6gZvPmzRo0aJBsNpt69+6t8vLyVq9fWlqqXr16KTY2VpmZmdqxY0fAeFvmAgAArOuqg9BHH32k3/zmN+rfv3/A9qlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7uqIHT69GmNGzdOv/vd79SpUydz+8mTJ/XGG29o4cKFGj58uDIyMrRixQpt3bpV27ZtkyRVVlbqwIEDevPNNzVw4ECNHj1ac+fOVWlpqRobGyVJZWVlSktL04IFC9S3b18VFBToscce06JFi8zXWrhwoSZOnKjx48crPT1dZWVliouL0/Lly9s8FwAAYG1XFYTy8/OVk5OjrKysgO01NTXy+/0B2/v06aMePXrI7XZLktxut/r166e
kpCSzxul0yuv1av/+/WbNxcd2Op3mMRobG1VTUxNQExkZqaysLLOmLXMBAADWdkuwO7z99tvatWuXPvroo1ZjHo9HMTExSkhICNielJQkj8dj1lwYglrGW8YuV+P1enX27FkdP35cTU1Nl6w5ePBgm+dyMZ/PJ5/PZz73er2SJL/fL7/ff8l9rlbL8WyRxjXtj6vXsoasZfjQg/CjB+FHD0IvmLUMKgh98cUX+tGPfiSXy6XY2NigJ/ZtN2/ePM2ePbvV9srKSsXFxV2X15w7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtUEFoZqaGh07dkyDBg0ytzU1NWnLli1aunSpNm7cqMbGRp04cSLgTEx9fb2Sk5MlScnJya3u7mq5k+vCmovv7qqvr5fdble7du0UFRWlqKioS9ZceIwrzeVixcXFKioqMp97vV6lpqYqOztbdru9LUvUZn6/Xy6XSy/tjJSvOSLo/ffNcoZ0PlbU0oORI0cqOjo63NOxJHoQfvQg/OhB6LV8otMWQQWhESNGaO/evQHbxo8frz59+ujFF19UamqqoqOjVVVVpdzcXEnSoUOHVFdXJ4fDIUlyOBz6xS9+oWPHjikxMVHSf1Kw3W5Xenq6WXPxGQ+Xy2UeIyYmRhkZGaqqqtKYMWMkSc3NzaqqqlJBQYEkKSMj44pzuZjNZpPNZmu1PTo6+rp9cfqaI+RrCj4I8c0SOtezv2gbehB+9CD86EHoBLOOQQWhjh076p577gnY1r59e3Xp0sXcPmHCBBUVFalz586y2+164YUX5HA4NGTIEElSdna20tPT9fTTT2v+/PnyeDyaMWOG8vPzzRAyefJkLV26VNOnT9dzzz2nTZs2ac2aNVq/fr35ukVFRcrLy9PgwYN13333afHixWpoaND48eMlSfHx8VecCwAAsLagL5a+kkWLFikyMlK5ubny+XxyOp1atmyZOR4VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv/7OGjs2LH68ssvVVJSIo/Ho4EDB6qioiLgAuorzQUAAFjbNQehzZs3BzyPjY1VaWmpSktLv3Gfnj17XvFi36FDh2r37t2XrSkoKDA/CruUtswFAABYF39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQei1115T//79ZbfbZbfb5XA49Ne//tUcHzp0qCIiIgIekydPDjhGXV2dcnJyFBcXp8TERE2bNk3nz58PqNm8ebMGDRokm82m3r17q7y8vNVcSktL1atXL8XGxiozM1M7duwIGD937pzy8/PVpUsXdejQQbm5uaqvrw/m7QIAgJtcUEGoe/fuevnll1VTU6OdO3dq+PDheuSRR7R//36zZuLEiTp69Kj5mD9/vjnW1NSknJwcNTY2auvWrVq5cqXKy8tVUlJi1hw+fFg5OTkaNmyYamtrVVhYqOeff14bN240a1avXq2ioiLNnDlTu3bt0oABA+R0OnXs2DGzZurUqXrvvfe0du1aVVdX68iRI3r00UevapEAAMDNKagg9PDDD+uhhx7SHXfcoTvvvFO/+MUv1KFDB23bts2siYuLU3Jysvmw2+3mWGVlpQ4cOKA333xTAwcO1OjRozV37lyVlpaqsbFRklRWVqa0tDQtWLBAffv2VUFBgR577DEtWrTIPM7ChQs1ceJEjR8/Xunp6SorK1NcXJyWL18uSTp58qTeeOMNLVy4UMOHD1dGRoZWrFihrVu3Bsw
VAABY2y1Xu2NTU5PWrl2rhoYGORwOc/tbb72lN998U8nJyXr44Yf10ksvKS4uTpLkdrvVr18/JSUlmfVOp1NTpkzR/v37de+998rtdisrKyvgtZxOpwoLCyVJjY2NqqmpUXFxsTkeGRmprKwsud1uSVJNTY38fn/Acfr06aMePXrI7XZryJAhl3xPPp9PPp/PfO71eiVJfr9ffr//apbpG7UczxZpXNP+uHota8hahg89CD96EH70IPSCWcugg9DevXvlcDh07tw5dejQQe+8847S09MlSU899ZR69uyplJQU7dmzRy+++KIOHTqkP/3pT5Ikj8cTEIIkmc89Hs9la7xer86ePavjx4+rqanpkjUHDx40jxETE6OEhIRWNS2vcynz5s3T7NmzW22vrKw0w1yozR3cfFX7bdiwIcQzsS6XyxXuKVgePQg/ehB+9CB0zpw50+baoIPQXXfdpdraWp08eVJ//OMflZeXp+rqaqWnp2vSpElmXb9+/dStWzeNGDFCn332mW6//fZgX+qGKy4uVlFRkfnc6/UqNTVV2dnZAR/xhYLf75fL5dJLOyPla44Iev99s5whnY8VtfRg5MiRio6ODvd0LIkehB89CD96EHotn+i0RdBBKCYmRr1795YkZWRk6KOPPtKSJUv0m9/8plVtZmamJOnTTz/V7bffruTk5FZ3d7XcyZWcnGz+78V3d9XX18tut6tdu3aKiopSVFTUJWsuPEZjY6NOnDgRcFbowppLsdlsstlsrbZHR0dfty9OX3OEfE3BByG+WULnevYXbUMPwo8ehB89CJ1g1vGaf49Qc3NzwHU1F6qtrZUkdevWTZLkcDi0d+/egLu7XC6X7Ha7+fGaw+FQVVVVwHFcLpd5HVJMTIwyMjICapqbm1VVVWXWZGRkKDo6OqDm0KFDqqurC7ieCQAAWFtQZ4SKi4s1evRo9ejRQ6dOndKqVau0efNmbdy4UZ999plWrVqlhx56SF26dNGePXs0depUPfjgg+rfv78kKTs7W+np6Xr66ac1f/58eTwezZgxQ/n5+eaZmMmTJ2vp0qWaPn26nnvuOW3atElr1qzR+vXrzXkUFRUpLy9PgwcP1n333afFixeroaFB48ePlyTFx8drwoQJKioqUufOnWW32/XCCy/I4XB844XSAADAeoIKQseOHdMzzzyjo0ePKj4+Xv3799fGjRs1cuRIffHFF3r//ffNUJKamqrc3FzNmDHD3D8qKkrr1q3TlClT5HA41L59e+Xl5WnOnDlmTVpamtavX6+pU6dqyZIl6t69u15//XU5nf93TczYsWP15ZdfqqSkRB6PRwMHDlRFRUXABdSLFi1SZGSkcnNz5fP55HQ6tWzZsmtZKwAAcJMJKgi98cYb3ziWmpqq6urqKx6jZ8+eV7zjaejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyJvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6gg9Nprr6l///6y2+2y2+1yOBz661//ao6fO3dO+fn56tKlizp06KDc3FzV19cHHKOurk45OTmKi4tTYmKipk2bpvPnzwfUbN68WYMGDZLNZlPv3r1VXl7eai6lpaXq1auXYmNjlZmZqR07dgSMt2UuAADA2oIKQt27d9fLL7+smpoa7dy5U8OHD9cjjzyi/fv3S5KmTp2q9957T2vXrlV1dbWOHDmiRx991Ny/qalJOTk5amxs1NatW7Vy5UqVl5erpKTErDl8+LBycnI0bNgw1dbWqrC
wUM8//7w2btxo1qxevVpFRUWaOXOmdu3apQEDBsjpdOrYsWNmzZXmAgAAEFQQevjhh/XQQw/pjjvu0J133qlf/OIX6tChg7Zt26aTJ0/qjTfe0MKFCzV8+HBlZGRoxYoV2rp1q7Zt2yZJqqys1IEDB/Tmm29q4MCBGj16tObOnavS0lI1NjZKksrKypSWlqYFCxaob9++Kigo0GOPPaZFixaZ81i4cKEmTpyo8ePHKz09XWVlZYqLi9Py5cslqU1zAQAAuOVqd2xqatLatWvV0NAgh8Ohmpoa+f1+ZWVlmTV9+vRRjx495Ha7NWTIELndbvXr109JSUlmjdPp1JQpU7R//37de++9crvdAcdoqSksLJQkNTY2qqamRsXFxeZ4ZGSksrKy5Ha7JalNc7kUn88nn89nPvd6vZIkv98vv99/lSt1aS3Hs0Ua17Q/rl7LGrKW4UMPwo8ehB89CL1g1jLoILR37145HA6dO3dOHTp00DvvvKP09HTV1tYqJiZGCQkJAfVJSUnyeDySJI/HExCCWsZbxi5X4/V6dfbsWR0/flxNTU2XrDl48KB5jCvN5VLmzZun2bNnt9peWVmpuLi4b9zvWswd3HxV+23YsCHEM7Eul8sV7ilYHj0IP3oQfvQgdM6cOdPm2qCD0F133aXa2lqdPHlSf/zjH5WXl6fq6upgD/OtVFxcrKKiIvO51+tVamqqsrOzZbfbQ/pafr9fLpdLL+2MlK85Iuj9981yhnQ+VtTSg5EjRyo6Ojrc07EkehB+9CD86EHotXyi0xZBB6GYmBj17t1bkpSRkaGPPvpIS5Ys0dixY9XY2KgTJ04EnImpr69XcnKyJCk5ObnV3V0td3JdWHPx3V319fWy2+1q166doqKiFBUVdcmaC49xpblcis1mk81ma7U9Ojr6un1x+poj5GsKPgjxzRI617O/aBt6EH70IPzoQegEs47X/HuEmpub5fP5lJGRoejoaFVVVZljhw4dUl1dnRwOhyTJ4XBo7969AXd3uVwu2e12paenmzUXHqOlpuUYMTExysjICKhpbm5WVVWVWdOWuQAAAAR1Rqi4uFijR49Wjx49dOrUKa1atUqbN2/Wxo0bFR8frwkTJqioqEidO3eW3W7XCy+8IIfDYV6cnJ2drfT0dD399NOaP3++PB6PZsyYofz8fPNMzOTJk7V06VJNnz5dzz33nDZt2qQ1a9Zo/fr15jyKioqUl5enwYMH67777tPixYvV0NCg8ePHS1Kb5gIAABBUEDp27JieeeYZHT16VPHx8erfv782btyokSNHSpIWLVqkyMhI5ebmyufzyel0atmyZeb+UVFRWrdunaZMmSKHw6H27dsrLy9Pc+bMMWvS0tK0fv16TZ06VUuWLFH37t31+uuvy+n8v2tixo4dqy+//FIlJSXyeDwaOHCgKioqAi6gvtJcAAAAIgzDuLr7ty3A6/UqPj5eJ0+evC4XS2/YsEHTd0Rd1TVCn7+cE9L5WFFLDx566CE+lw8TehB+9CD86EHoBfPvN39rDAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWBZBCAAAWFZQQWjevHn6/ve/r44dOyoxMVFjxozRoUOHAmqGDh2qiIiIgMfkyZMDaurq6pSTk6O4uDglJiZq2rRpOn/+fEDN5s2bNWjQINlsNvXu3Vvl5eWt5lNaWqpevXopNjZWmZmZ2rFjR8D4uXPnlJ+fry5duqhDhw7Kzc1VfX19MG8ZAADcxIIKQtXV1crPz9e2bdvkcrnk9/uVnZ2thoaGgLqJEyf
q6NGj5mP+/PnmWFNTk3JyctTY2KitW7dq5cqVKi8vV0lJiVlz+PBh5eTkaNiwYaqtrVVhYaGef/55bdy40axZvXq1ioqKNHPmTO3atUsDBgyQ0+nUsWPHzJqpU6fqvffe09q1a1VdXa0jR47o0UcfDXqRAADAzemWYIorKioCnpeXlysxMVE1NTV68MEHze1xcXFKTk6+5DEqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP9XJkyf1xhtvaNWqVRo+fLgkacWKFerbt6+2bdumIUOGBPPWAQDATSioIHSxkydPSpI6d+4csP2tt97Sm2++qeTkZD388MN66aWXFBcXJ0lyu93q16+fkpKSzHqn06kpU6Zo//79uvfee+V2u5WVlRVwTKfTqcLCQklSY2OjampqVFxcbI5HRkYqKytLbrdbklRTUyO/3x9wnD59+qhHjx5yu92XDEI+n08+n8987vV6JUl+v19+vz/o9bmcluPZIo1r2h9Xr2UNWcvwoQfhRw/Cjx6EXjBredVBqLm5WYWFhfrBD36ge+65x9z+1FNPqWfPnkpJSdGePXv04osv6tChQ/rTn/4kSfJ4PAEhSJL53OPxXLbG6/Xq7NmzOn78uJqami5Zc/DgQfMYMTExSkhIaFXT8joXmzdvnmbPnt1qe2VlpRnkQm3u4Oar2m/Dhg0hnol1uVyucE/B8uhB+NGD8KMHoXPmzJk21151EMrPz9e+ffv04YcfBmyfNGmS+d/9+vVTt27dNGLECH322We6/fbbr/blboji4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r37ZWszMzMlSZ9++qluv/12JScnt7q7q+VOrpbripKTk1vd3VVfXy+73a527dopKipKUVFRl6y58BiNjY06ceJEwFmhC2suZrPZZLPZWm2Pjo6+bl+cvuYI+ZqCD0J8s4TO9ewv2oYehB89CD96EDrBrGNQd40ZhqGCggK988472rRpk9LS0q64T21trSSpW7dukiSHw6G9e/cG3N3lcrlkt9uVnp5u1lRVVQUcx+VyyeFwSJJiYmKUkZERUNPc3KyqqiqzJiMjQ9HR0QE1hw4dUl1dnVkDAACsLagzQvn5+Vq1apX+/Oc/q2PHjua1NvHx8WrXrp0+++wzrVq1Sg899JC6dOmiPXv2aOrUqXrwwQfVv39/SVJ2drbS09P19NNPa/78+fJ4PJoxY4by8/PNszGTJ0/W0qVLNX36dD333HPatGmT1qxZo/Xr15tzKSoqUl5engYPHqz77rtPixcvVkNDg3kXWXx8vCZMmKCioiJ17txZdrtdL7zwghwOB3eMAQAASUEGoddee03Sf35p4oVWrFihZ599VjExMXr//ffNUJKamqrc3FzNmDHDrI2KitK6des0ZcoUORwOtW/fXnl5eZozZ45Zk5aWpvXr12vq1KlasmSJunfvrtdff928dV6Sxo4dqy+//FIlJSXyeDwaOHCgKioqAi6gXrRokSIjI5Wbmyufzyen06lly5YFtUAAAODmFVQQMozL3+qdmpqq6urqKx6nZ8+eV7zraejQodq9e/dlawoKClRQUPCN47GxsSotLVVpaekV5wQAAKyHvzUGAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyAEAAAsiyA
EAAAsiyAEAAAsiyAEAAAsiyAEAAAsK6ggNG/ePH3/+99Xx44dlZiYqDFjxujQoUMBNefOnVN+fr66dOmiDh06KDc3V/X19QE1dXV1ysnJUVxcnBITEzVt2jSdP38+oGbz5s0aNGiQbDabevfurfLy8lbzKS0tVa9evRQbG6vMzEzt2LEj6LkAAADrCioIVVdXKz8/X9u2bZPL5ZLf71d2drYaGhrMmqlTp+q9997T2rVrVV1drSNHjujRRx81x5uampSTk6PGxkZt3bpVK1euVHl5uUpKSsyaw4cPKycnR8OGDVNtba0KCwv1/PPPa+PGjWbN6tWrVVRUpJkzZ2rXrl0aMGCAnE6njh071ua5AAAAa7slmOKKioqA5+Xl5UpMTFRNTY0efPBBnTx5Um+88YZWrVql4cOHS5JWrFihvn37atu2bRoyZIgqKyt14MABvf/++0pKStLAgQM1d+5cvfjii5o1a5ZiYmJUVlamtLQ0LViwQJLUt29fffjhh1q0aJGcTqckaeHChZo4caLGjx8vSSorK9P69eu1fPly/fSnP23TXAAAgLUFFYQudvLkSUlS586dJUk1NTXy+/3Kysoya/r06aMePXrI7XZryJAhcrvd6tevn5KSkswap9OpKVOmaP/+/br33nvldrsDjtFSU1hYKElqbGxUTU2NiouLzfHIyEhlZWXJ7Xa3eS4X8/l88vl85nOv1ytJ8vv98vv9V7VG36TleLZI45r2x9VrWUPWMnzoQfjRg/CjB6EXzFpedRBqbm5WYWGhfvCDH+iee+6RJHk8HsXExCghISGgNikpSR6Px6y5MAS1jLeMXa7G6/Xq7NmzOn78uJqami5Zc/DgwTbP5WLz5s3T7NmzW22vrKxUXFzcNy3FNZk7uPmq9tuwYUOIZ2JdLpcr3FOwPHoQfvQg/OhB6Jw5c6bNtVcdhPLz87Vv3z59+OGHV3uIb53i4mIVFRWZz71er1JTU5WdnS273R7S1/L7/XK5XHppZ6R8zRFB779vljOk87Gilh6MHDlS0dHR4Z6OJdGD8KMH4UcPQq/lE522uKogVFBQoHXr1mnLli3q3r27uT05OVmNjY06ceJEwJmY+vp6JScnmzUX393VcifXhTUX391VX18vu92udu3aKSoqSlFRUZesufAYV5rLxWw2m2w2W6vt0dHR1+2L09ccIV9T8EGIb5bQuZ79RdvQg/CjB+FHD0InmHUM6q4xwzBUUFCgd955R5s2bVJaWlrAeEZGhqKjo1VVVWVuO3TokOrq6uRwOCRJDodDe/fuDbi7y+VyyW63Kz093ay58BgtNS3HiImJUUZGRkBNc3OzqqqqzJq2zAUAAFhbUGeE8vPztWrVKv35z39Wx44dzWtt4uPj1a5dO8XHx2vChAkqKipS586dZbfb9cILL8jhcJgXJ2dnZys9PV1PP/205s+fL4/HoxkzZig/P988GzN58mQtXbpU06dP13PPPadNmzZpzZo1Wr9+vTmXoqIi5eXlafDgwbrvvvu0ePFiNTQ0mHeRtWUuAADA2oIKQq+99pokaejQoQHbV6xYoWeffVaStGjRIkVGRio3N1c+n09Op1PLli0za6OiorRu3TpNmTJFDodD7du3V15enubMmWPWpKWlaf369Zo6daqWLFmi7t276/XXXzdvnZeksWPH6ssvv1RJSYk8Ho8GDhyoioqKgAuorzQXAABgbUEFIcO48q3esbGxKi0tVWlp6TfW9OzZ84p3PQ0dOlS7d+++bE1BQYEKCgquaS4AAMC6+FtjAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAAD
AsghCAADAsghCAADAsghCAADAsghCAADAsoIOQlu2bNHDDz+slJQURURE6N133w0Yf/bZZxURERHwGDVqVEDN119/rXHjxslutyshIUETJkzQ6dOnA2r27NmjBx54QLGxsUpNTdX8+fNbzWXt2rXq06ePYmNj1a9fP23YsCFg3DAMlZSUqFu3bmrXrp2ysrL0ySefBPuWAQDATSroINTQ0KABAwaotLT0G2tGjRqlo0ePmo8//OEPAePjxo3T/v375XK5tG7dOm3ZskWTJk0yx71er7Kzs9WzZ0/V1NTolVde0axZs/Tb3/7WrNm6dauefPJJTZgwQbt379aYMWM0ZswY7du3z6yZP3++Xn31VZWVlWn79u1q3769nE6nzp07F+zbBgAAN6Fbgt1h9OjRGj169GVrbDabkpOTLzn28ccfq6KiQh999JEGDx4sSfr1r3+thx56SL/61a+UkpKit956S42NjVq+fLliYmJ09913q7a2VgsXLjQD05IlSzRq1ChNmzZNkjR37ly5XC4tXbpUZWVlMgxDixcv1owZM/TII49Ikn7/+98rKSlJ7777rp544olg3zoAALjJBB2E2mLz5s1KTExUp06dNHz4cP385z9Xly5dJElut1sJCQlmCJKkrKwsRUZGavv27frhD38ot9utBx98UDExMWaN0+nUL3/5Sx0/flydOnWS2+1WUVFRwOs6nU7zo7rDhw/L4/EoKyvLHI+Pj1dmZqbcbvclg5DP55PP5zOfe71eSZLf75ff77/2hblAy/FskcY17Y+r17KGrGX40IPwowfhRw9CL5i1DHkQGjVqlB599FGlpaXps88+089+9jONHj1abrdbUVFR8ng8SkxMDJzELbeoc+fO8ng8kiSPx6O0tLSAmqSkJHOsU6dO8ng85rYLay48xoX7XarmYvPmzdPs2bNbba+srFRcXFxblyAocwc3X9V+F18PhavncrnCPQXLowfhRw/Cjx6EzpkzZ9pcG/IgdOGZln79+ql///66/fbbtXnzZo0YMSLULxdSxcXFAWeZvF6vUlNTlZ2dLbvdHtLX8vv9crlcemlnpHzNEUHvv2+WM6TzsaKWHowcOVLR0dHhno4l0YPwowfhRw9Cr+UTnba4Lh+NXei2225T165d9emnn2rEiBFKTk7WsWPHAmrOnz+vr7/+2ryuKDk5WfX19QE1Lc+vVHPheMu2bt26BdQMHDjwknO12Wyy2WyttkdHR1+3L05fc4R8TcEHIb5ZQud69hdtQw/Cjx6EHz0InWDW8br/HqF//etf+uqrr8ww4nA4dOLECdXU1Jg1mzZtUnNzszIzM82aLVu2BHzG53K5dNddd6lTp05mTVVVVcBruVwuORwOSVJaWpqSk5MDarxer7Zv327WAAAAaws6CJ0+fVq1tbWqra2V9J+Lkmtra1VXV6fTp09r2rRp2rZtmz7//HNVVVXpkUceUe/eveV0/uejnL59+2rUqFGaOHGiduzYob///e8qKCjQE088oZSUFEnSU089pZiYGE2YMEH79+/X6tWrtWTJkoCPrX70ox+poqJCCxYs0MGDBzVr1izt3LlTBQUFkqSIiAgVFhbq5z//uf7yl79o7969euaZZ5SSkqIxY8Zc47IBAICbQdAfje3cuVPDhg0zn7eEk7y8PL322mvas2ePVq5cqRMnTiglJUXZ2dmaO3duwEdOb731lgoKCjRixAhFRkYqNzdXr776qjkeHx+vyspK5efnKyMjQ127dlVJSUnA7xr6r//6L61atUozZszQz372M91xxx169913dc8995g106dPV0NDgyZNmqQTJ07o/vvvV0VFhWJjY4N92wAA4CYUdBAaOnSoDOObb/neuHHjFY/RuXNnrVq16rI1/fv319/+9rfL1jz++ON6/PHHv3E8IiJCc+bM0Zw5c644JwAAYD38rTEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGB
ZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZBCEAAGBZQQehLVu26OGHH1ZKSooiIiL07rvvBowbhqGSkhJ169ZN7dq1U1ZWlj755JOAmq+//lrjxo2T3W5XQkKCJkyYoNOnTwfU7NmzRw888IBiY2OVmpqq+fPnt5rL2rVr1adPH8XGxqpfv37asGFD0HMBAADWFXQQamho0IABA1RaWnrJ8fnz5+vVV19VWVmZtm/frvbt28vpdOrcuXNmzbhx47R//365XC6tW7dOW7Zs0aRJk8xxr9er7Oxs9ezZUzU1NXrllVc0a9Ys/fa3vzVrtm7dqieffFITJkzQ7t27NWbMGI0ZM0b79u0Lai4AAMC6bgl2h9GjR2v06NGXHDMMQ4sXL9aMGTP0yCOPSJJ+//vfKykpSe+++66eeOIJffzxx6qoqNBHH32kwYMHS5J+/etf66GHHtKvfvUrpaSk6K233lJjY6OWL1+umJgY3X333aqtrdXChQvNwLRkyRKNGjVK06ZNkyTNnTtXLpdLS5cuVVlZWZvmAgAArC2k1wgdPnxYHo9HWVlZ5rb4+HhlZmbK7XZLktxutxISEswQJElZWVmKjIzU9u3bzZoHH3xQMTExZo3T6dShQ4d0/Phxs+bC12mpaXmdtswFAABYW9BnhC7H4/FIkpKSkgK2JyUlmWMej0eJiYmBk7jlFnXu3DmgJi0trdUxWsY6deokj8dzxde50lwu5vP55PP5zOder1eS5Pf75ff7L/fWg9ZyPFukcU374+q1rCFrGT70IPzoQfjRg9ALZi1DGoS+6+bNm6fZs2e32l5ZWam4uLjr8ppzBzdf1X4XXxiOq+dyucI9BcujB+FHD8KPHoTOmTNn2lwb0iCUnJwsSaqvr1e3bt3M7fX19Ro4cKBZc+zYsYD9zp8/r6+//trcPzk5WfX19QE1Lc+vVHPh+JXmcrHi4mIVFRWZz71er1JTU5WdnS273X7lBQiC3++Xy+XSSzsj5WuOCHr/fbOcIZ2PFbX0YOTIkYqOjg73dCyJHoQfPQg/ehB6LZ/otEVIg1BaWpqSk5NVVVVlhg2v16vt27drypQpkiSHw6ETJ06opqZGGRkZkqRNmzapublZmZmZZs3/+3//T36/3/yicLlcuuuuu9SpUyezpqqqSoWFhebru1wuORyONs/lYjabTTabrdX26Ojo6/bF6WuOkK8p+CDEN0voXM/+om3oQfjRg/CjB6ETzDoGfbH06dOnVVtbq9raWkn/uSi5trZWdXV1ioiIUGFhoX7+85/rL3/5i/bu3atnnnlGKSkpGjNmjCSpb9++GjVqlCZOnKgdO3bo73//uwoKCvTEE08oJSVFkvTUU08pJiZGEyZM0P79+7V69WotWbIk4GzNj370I1VUVGjBggU6ePCgZs2apZ07d6qgoECS2jQXAABgbUGfEdq5c6eGDRtmPm8JJ3l5eSovL9f06dPV0NCgSZMm6cSJE7r//vtVUVGh2NhYc5+33npLBQUFGjFihCIjI5Wbm6tXX33VHI+Pj1dlZaXy8/OVkZGhrl27qqSkJOB3Df3Xf/2XVq1apRkzZuhnP/uZ7rjjDr377ru65557zJq2zAUAAFhX0EFo6NChMoxvvtMpIiJCc+bM0Zw5c76xpnPnzlq1atVlX6d///7629/+dtmaxx9/XI8//vg1zQUAAFgXf2sMAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYFkEIAABYVtB/awzfDr1+uv6q9/385ZwQzgQAgO8uzggBAADLIggBAADLIggBAADLIggBAADLIggBAAD
LIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLIggBAADLCnkQmjVrliIiIgIeffr0McfPnTun/Px8denSRR06dFBubq7q6+sDjlFXV6ecnBzFxcUpMTFR06ZN0/nz5wNqNm/erEGDBslms6l3794qLy9vNZfS0lL16tVLsbGxyszM1I4dO0L9dgEAwHfYdTkjdPfdd+vo0aPm48MPPzTHpk6dqvfee09r165VdXW1jhw5okcffdQcb2pqUk5OjhobG7V161atXLlS5eXlKikpMWsOHz6snJwcDRs2TLW1tSosLNTzzz+vjRs3mjWrV69WUVGRZs6cqV27dmnAgAFyOp06duzY9XjLAADgO+i6BKFbbrlFycnJ5qNr166SpJMnT+qNN97QwoULNXz4cGVkZGjFihXaunWrtm3bJkmqrKzUgQMH9Oabb2rgwIEaPXq05s6dq9LSUjU2NkqSysrKlJaWpgULFqhv374qKCjQY489pkWLFplzWLhwoSZOnKjx48crPT1dZWVliouL0/Lly6/HWwYAAN9Bt1yPg37yySdKSUlRbGysHA6H5s2bpx49eqimpkZ+v19ZWVlmbZ8+fdSjRw+53W4NGTJEbrdb/fr1U1JSklnjdDo1ZcoU7d+/X/fee6/cbnfAMVpqCgsLJUmNjY2qqalRcXGxOR4ZGamsrCy53e5vnLfP55PP5zOfe71eSZLf75ff77+mNblYy/FskUZIjxvMa1tdyzqwHuFDD8KPHoQfPQi9YNYy5EEoMzNT5eXluuuuu3T06FHNnj1bDzzwgPbt2yePx6OYmBglJCQE7JOUlCSPxyNJ8ng8ASGoZbxl7HI1Xq9XZ8+e1fHjx9XU1HTJmoMHD37j3OfNm6fZs2e32l5ZWam4uLi2LUCQ5g5uvi7HvZwNGzbc8Nf8NnO5XOGeguXRg/CjB+FHD0LnzJkzba4NeRAaPXq0+d/9+/dXZmamevbsqTVr1qhdu3ahfrmQKi4uVlFRkfnc6/UqNTVV2dnZstvtIX0tv98vl8ull3ZGytccEdJjX8m+Wc4b+nrfVi09GDlypKKjo8M9HUuiB+FHD8KPHoReyyc6bXFdPhq7UEJCgu688059+umnGjlypBobG3XixImAs0L19fVKTk6WJCUnJ7e6u6vlrrILay6+06y+vl52u13t2rVTVFSUoqKiLlnTcoxLsdlsstlsrbZHR0dfty9OX3OEfE03NgjxjRboevYXbUMPwo8ehB89CJ1g1vG6/x6h06dP67PPPlO3bt2UkZGh6OhoVVVVmeOHDh1SXV2dHA6HJMnhcGjv3r0Bd3e5XC7Z7Xalp6ebNRceo6Wm5RgxMTHKyMgIqGlublZVVZVZAwAAEPIg9JOf/ETV1dX6/PPPtXXrVv3whz9UVFSUnnzyScXHx2vChAkqKirSBx98oJqaGo0fP14Oh0NDhgyRJGVnZys9PV1PP/20/vGPf2jjxo2aMWOG8vPzzbM1kydP1j//+U9Nnz5dBw8e1LJly7RmzRpNnTrVnEdRUZF+97vfaeXKlfr44481ZcoUNTQ0aPz48aF+ywAA4Dsq5B+N/etf/9KTTz6pr776Srfeeqvuv/9+bdu2TbfeeqskadGiRYqMjFRubq58Pp+cTqeWLVtm7h8VFaV169ZpypQpcjgcat++vfLy8jRnzhyzJi0tTevXr9fUqVO1ZMkSde/eXa+//rqczv+79mXs2LH68ssvVVJSIo/Ho4EDB6qioqLVBdQAAMC6Qh6E3n777cuOx8bGqrS0VKWlpd9Y07Nnzyve2TR06FDt3r37sjUFBQUqKCi4bA0AALAu/tYYAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLIIQAACwLII
QAACwrFvCPQHceL1+uv6q9/385ZwQzgQAgPDijBAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsghAAALAsSwSh0tJS9erVS7GxscrMzNSOHTvCPSUAAPAtcEu4J3C9rV69WkVFRSorK1NmZqYWL14sp9OpQ4cOKTExMdzT+87p9dP1V73v5y/nhHAmAABcu5v+jNDChQs1ceJEjR8/Xunp6SorK1NcXJyWL18e7qkBAIAwu6nPCDU2NqqmpkbFxcXmtsjISGVlZcntdreq9/l88vl85vOTJ09Kkr7++mv5/f6Qzs3v9+vMmTO6xR+ppuaIkB7726r3T9aE5XW3F4+45PaWHnz11VeKjo6+wbOCRA++DehB+NGD0Dt16pQkyTCMK9be1EHo3//+t5qampSUlBSwPSkpSQcPHmxVP2/ePM2ePbvV9rS0tOs2R1x/XReEewYAgHA4deqU4uPjL1tzUwehYBUXF6uoqMh83tzcrK+//lpdunRRRERoz9p4vV6lpqbqiy++kN1uD+mx0Tb0IPzoQfjRg/CjB6FnGIZOnTqllJSUK9be1EGoa9euioqKUn19fcD2+vp6JScnt6q32Wyy2WwB2xISEq7nFGW32/nCDzN6EH70IPzoQfjRg9C60pmgFjf1xdIxMTHKyMhQVVWVua25uVlVVVVyOBxhnBkAAPg2uKnPCElSUVGR8vLyNHjwYN13331avHixGhoaNH78+HBPDQAAhNlNH4TGjh2rL7/8UiUlJfJ4PBo4cKAqKipaXUB9o9lsNs2cObPVR3G4cehB+NGD8KMH4UcPwivCaMu9ZQAAADehm/oaIQAAgMshCAEAAMsiCAEAAMsiCAEAAMsiCIVBaWmpevXqpdjYWGVmZmrHjh3hntJNY9asWYqIiAh49OnTxxw/d+6c8vPz1aVLF3Xo0EG5ubmtfuFmXV2dcnJyFBcXp8TERE2bNk3nz5+/0W/lO2PLli16+OGHlZKSooiICL377rsB44ZhqKSkRN26dVO7du2UlZWlTz75JKDm66+/1rhx42S325WQkKAJEybo9OnTATV79uzRAw88oNjYWKWmpmr+/PnX+619Z1ypB88++2yr74tRo0YF1NCDazNv3jx9//vfV8eOHZWYmKgxY8bo0KFDATWh+vmzefNmDRo0SDabTb1791Z5efn1fns3NYLQDbZ69WoVFRVp5syZ2rVrlwYMGCCn06ljx46Fe2o3jbvvvltHjx41Hx9++KE5NnXqVL333ntau3atqqurdeTIET366KPmeFNTk3JyctTY2KitW7dq5cqVKi8vV0lJSTjeyndCQ0ODBgwYoNLS0kuOz58/X6+++qrKysq0fft2tW/fXk6nU+fOnTNrxo0bp/3798vlcmndunXasmWLJk2aZI57vV5lZ2erZ8+eqqmp0SuvvKJZs2bpt7/97XV/f98FV+qBJI0aNSrg++IPf/hDwDg9uDbV1dXKz8/Xtm3b5HK55Pf7lZ2drYaGBrMmFD9/Dh8+rJycHA0bNky1tbUqLCzU888/r40bN97Q93tTMXBD3XfffUZ+fr75vKmpyUhJSTHmzZsXxlndPGbOnGkMGDDgkmMnTpwwoqOjjbVr15rbPv74Y0OS4Xa7DcMwjA0bNhiRkZGGx+Mxa1577TXDbrcbPp/vus79ZiDJeOedd8znzc3NRnJysvHKK6+Y206cOGHYbDbjD3/4g2EYhnHgwAFDkvHRRx+ZNX/961+NiIgI43//938NwzCMZcuWGZ06dQrowYsvvmjcdddd1/kdffdc3APDMIy8vDzjkUce+cZ96EHoHTt2zJBkVFdXG4YRup8/06dPN+6+++6A1xo7dqzhdDqv91u6aXFG6AZqbGxUTU2NsrKyzG2RkZHKysqS2+0O48xuLp988olSUlJ0222
3ady4caqrq5Mk1dTUyO/3B6x/nz591KNHD3P93W63+vXrF/ALN51Op7xer/bv339j38hN4PDhw/J4PAFrHh8fr8zMzIA1T0hI0ODBg82arKwsRUZGavv27WbNgw8+qJiYGLPG6XTq0KFDOn78+A16N99tmzdvVmJiou666y5NmTJFX331lTlGD0Lv5MmTkqTOnTtLCt3PH7fbHXCMlhr+Dbl6BKEb6N///reamppa/VbrpKQkeTyeMM3q5pKZmany8nJVVFTotdde0+HDh/XAAw/o1KlT8ng8iomJafWHdC9cf4/Hc8n+tIwhOC1rdrmveY/Ho8TExIDxW265RZ07d6YvITJq1Cj9/ve/V1VVlX75y1+qurpao0ePVlNTkyR6EGrNzc0qLCzUD37wA91zzz2SFLKfP99U4/V6dfbs2evxdm56N/2f2IC1jB492vzv/v37KzMzUz179tSaNWvUrl27MM4MCJ8nnnjC/O9+/fqpf//+uv3227V582aNGDEijDO7OeXn52vfvn0B1yfi24szQjdQ165dFRUV1eougfr6eiUnJ4dpVje3hIQE3Xnnnfr000+VnJysxsZGnThxIqDmwvVPTk6+ZH9axhCcljW73Nd8cnJyq5sFzp8/r6+//pq+XCe33Xabunbtqk8//VQSPQilgoICrVu3Th988IG6d+9ubg/Vz59vqrHb7fyfvatEELqBYmJilJGRoaqqKnNbc3Ozqqqq5HA4wjizm9fp06f12WefqVu3bsrIyFB0dHTA+h86dEh1dXXm+jscDu3duzfgHwWXyyW73a709PQbPv/vurS0NCUnJwesudfr1fbt2wPW/MSJE6qpqTFrNm3apObmZmVmZpo1W7Zskd/vN2tcLpfuuusuderU6Qa9m5vHv/71L3311Vfq1q2bJHoQCoZhqKCgQO+88442bdqktLS0gPFQ/fxxOBwBx2ip4d+QaxDuq7Wt5u233zZsNptRXl5uHDhwwJg0aZKRkJAQcJcArt6Pf/xjY/Pmzcbhw4eNv//970ZWVpbRtWtX49ixY4ZhGMbkyZONHj16GJs2bTJ27txpOBwOw+FwmPufP3/euOeee4zs7GyjtrbWqKioMG699VajuLg4XG/pW+/UqVPG7t27jd27dxuSjIULFxq7d+82/ud//scwDMN4+eWXjYSEBOPPf/6zsWfPHuORRx4x0tLSjLNnz5rHGDVqlHHvvfca27dvNz788EPjjjvuMJ588klz/MSJE0ZSUpLx9NNPG/v27TPefvttIy4uzvjNb35zw9/vt9HlenDq1CnjJz/5ieF2u43Dhw8b77//vjFo0CDjjjvuMM6dO2cegx5cmylTphjx8fHG5s2bjaNHj5qPM2fOmDWh+Pnzz3/+04iLizOmTZtmfPzxx0ZpaakRFRVlVFRU3ND3ezMhCIXBr3/9a6NHjx5GTEyMcd999xnbtm0L95RuGmPHjjW6detmxMTEGN/73veMsWPHGp9++qk5fvbsWeO///u/jU6dOhlxcXHGD3/4Q+Po0aMBx/j888+N0aNHG+3atTO6du1q/PjHPzb8fv+NfivfGR988IEhqdUjLy/PMIz/3EL/0ksvGUlJSYbNZjNGjBhhHDp0KOAYX331lfHkk08aHTp0MOx2uzF+/Hjj1KlTATX/+Mc/jPvvv9+w2WzG9773PePll1++UW/xW+9yPThz5oyRnZ1t3HrrrUZ0dLTRs2dPY+LEia3+zxc9uDaXWn9JxooVK8yaUP38+eCDD4yBAwcaMTExxm233RbwGghehGEYxo0+CwUAAPBtwDVCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsghCAADAsv4/2I1qxbjTQ/MAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"markdown","source":"### 
Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T11:51:08.104083Z","iopub.execute_input":"2023-05-12T11:51:08.104505Z","iopub.status.idle":"2023-05-12T11:51:08.889863Z","shell.execute_reply.started":"2023-05-12T11:51:08.104477Z","shell.execute_reply":"2023-05-12T11:51:08.888809Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stderr","text":"Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.6kB/s]\nDownloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 383kB/s]\nDownloading (…)solve/main/vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 14.0MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 1.72M/1.72M [00:00<00:00, 90.1MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n    \"\"\"Tokenize the `comment_text` column of `text_data`.\n\n    The comments are passed to `tokenizer` as a plain Python list with\n    truncation enabled, padding set to \"max_length\" and `max_length=max_len`;\n    the encoding is requested as TensorFlow tensors without `token_type_ids`.\n    \"\"\"\n    comments = text_data.comment_text.values.tolist()\n    tokenizer_options = dict(\n        max_length=max_len,\n        truncation=True,\n        padding=\"max_length\",\n        add_special_tokens=True,\n        return_tensors=\"tf\",\n        return_token_type_ids=False,\n    )\n    return tokenizer(comments, **tokenizer_options)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:08.890965Z","iopub.execute_input":"2023-05-12T11:51:08.891271Z","iopub.status.idle":"2023-05-12T11:51:08.896368Z","shell.execute_reply.started":"2023-05-12T11:51:08.891227Z","shell.execute_reply":"2023-05-12T11:51:08.895589Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = 
test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:08.897392Z","iopub.execute_input":"2023-05-12T11:51:08.897682Z","iopub.status.idle":"2023-05-12T11:51:49.986967Z","shell.execute_reply.started":"2023-05-12T11:51:08.897657Z","shell.execute_reply":"2023-05-12T11:51:49.985562Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), train[\"toxic\"]))\n .repeat()\n .shuffle(Config.BUFFER_SIZE)\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n .batch(Config.BATCH_SIZE)\n .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:49.988465Z","iopub.execute_input":"2023-05-12T11:51:49.988785Z","iopub.status.idle":"2023-05-12T11:51:50.023604Z","shell.execute_reply.started":"2023-05-12T11:51:49.988758Z","shell.execute_reply":"2023-05-12T11:51:50.022512Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformers_layers, max_len=Config.MAX_LEN):\n input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n \n bert_layers = transformers_layers.bert(input_ids, attention_mask=masks)[1]\n intermediate = Dense(1024, activation='relu')(bert_layers)\n output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\n model = Model(inputs=[input_ids, masks], outputs=output)\n model.layers[2].trainable = True\n \n model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n loss=tf.keras.losses.BinaryCrossentropy(),\n metrics=tf.keras.metrics.AUC())\n return 
model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:50.024852Z","iopub.execute_input":"2023-05-12T11:51:50.025144Z","iopub.status.idle":"2023-05-12T11:51:50.033876Z","shell.execute_reply.started":"2023-05-12T11:51:50.025120Z","shell.execute_reply":"2023-05-12T11:51:50.032937Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n transformers_layers = TFAutoModel.from_pretrained(Config.MODEL)\n model = model_builder(transformers_layers=transformers_layers)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:50.034973Z","iopub.execute_input":"2023-05-12T11:51:50.035277Z","iopub.status.idle":"2023-05-12T11:52:52.408470Z","shell.execute_reply.started":"2023-05-12T11:51:50.035228Z","shell.execute_reply":"2023-05-12T11:52:52.407200Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stderr","text":"Downloading tf_model.h5: 100%|██████████| 999M/999M [00:17<00:00, 58.2MB/s] \nSome layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']\n- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nAll the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:52.409793Z","iopub.execute_input":"2023-05-12T11:52:52.410082Z","iopub.status.idle":"2023-05-12T11:52:52.449580Z","shell.execute_reply.started":"2023-05-12T11:52:52.410057Z","shell.execute_reply":"2023-05-12T11:52:52.448645Z"},"trusted":true},"execution_count":22,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 192)] 0 [] \n \n attention_mask (InputLayer) [(None, 192)] 0 [] \n \n bert (TFBertMainLayer) TFBaseModelOutputWi 167356416 ['input_ids[0][0]', \n thPoolingAndCrossAt 'attention_mask[0][0]'] \n tentions(last_hidde \n n_state=(None, 192, \n 768), \n pooler_output=(Non \n e, 768), \n past_key_values=No \n ne, hidden_states=N \n one, attentions=Non \n e, cross_attentions \n =None) \n \n dense (Dense) (None, 1024) 787456 ['bert[0][1]'] \n \n output_layer (Dense) (None, 1) 1025 ['dense[0][0]'] \n \n==================================================================================================\nTotal params: 168,144,897\nTrainable params: 168,144,897\nNon-trainable params: 
0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n validation_data=val_dataset,\n steps_per_epoch=train_steps_per_epoch,\n epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:52:52.450624Z","iopub.execute_input":"2023-05-12T11:52:52.450887Z","iopub.status.idle":"2023-05-12T12:22:32.594613Z","shell.execute_reply.started":"2023-05-12T11:52:52.450864Z","shell.execute_reply":"2023-05-12T12:22:32.593393Z"},"trusted":true},"execution_count":23,"outputs":[{"name":"stdout","text":"Epoch 1/3\n","output_type":"stream"},{"name":"stderr","text":"2023-05-12 11:53:33.890629: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.\n2023-05-12 11:53:34.988485: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"2998/4185 [====================>.........] 
- ETA: 2:36 - loss: 0.0710 - auc: 0.9958","output_type":"stream"},{"text":"IOPub message rate exceeded.\nThe notebook server will temporarily stop sending output\nto the client in order to avoid crashing it.\nTo change this limit, set the config variable\n`--NotebookApp.iopub_msg_rate_limit`.\n\nCurrent values:\nNotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\nNotebookApp.rate_limit_window=3.0 (secs)\n\n","name":"stderr","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0512 - auc: 0.9971","output_type":"stream"},{"name":"stderr","text":"2023-05-12 12:03:47.008155: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 12:03:47.287188: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 669s 137ms/step - loss: 0.0512 - auc: 0.9971 - val_loss: 0.3909 - val_auc: 0.8110\nEpoch 2/3\n4072/4185 [============================>.] 
- ETA: 14s - loss: 0.0427 - auc: 0.9980","output_type":"stream"},{"text":"IOPub message rate exceeded.\nThe notebook server will temporarily stop sending output\nto the client in order to avoid crashing it.\nTo change this limit, set the config variable\n`--NotebookApp.iopub_msg_rate_limit`.\n\nCurrent values:\nNotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\nNotebookApp.rate_limit_window=3.0 (secs)\n\n","name":"stderr","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 555s 133ms/step - loss: 0.0358 - auc: 0.9986 - val_loss: 0.3950 - val_auc: 0.8189\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n steps_per_epoch=val_steps_per_epoch,\n epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:24:25.248856Z","iopub.execute_input":"2023-05-12T12:24:25.249821Z","iopub.status.idle":"2023-05-12T12:25:30.559794Z","shell.execute_reply.started":"2023-05-12T12:24:25.249786Z","shell.execute_reply":"2023-05-12T12:25:30.558520Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 8s 131ms/step - loss: 0.2848 - auc: 0.8819\nEpoch 2/2\n62/62 [==============================] - 56s 132ms/step - loss: 0.1757 - auc: 0.9617\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = 
preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T12:26:42.649251Z","iopub.execute_input":"2023-05-12T12:26:42.649715Z","iopub.status.idle":"2023-05-12T12:27:19.803541Z","shell.execute_reply.started":"2023-05-12T12:26:42.649683Z","shell.execute_reply":"2023-05-12T12:27:19.802272Z"},"trusted":true},"execution_count":25,"outputs":[{"name":"stderr","text":"2023-05-12 12:26:48.382290: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n2023-05-12 12:26:48.652698: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 37s 49ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"mbert-fine-tuned-1-pooler\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:49:24.580208Z","iopub.execute_input":"2023-05-12T09:49:24.580625Z","iopub.status.idle":"2023-05-12T09:50:44.681561Z","shell.execute_reply.started":"2023-05-12T09:49:24.580595Z","shell.execute_reply":"2023-05-12T09:50:44.680112Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). 
These functions will not be directly callable after loading.\n","output_type":"stream"},{"name":"stdout","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"},{"name":"stderr","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"}]},{"cell_type":"code","source":"import shutil\nshutil.make_archive(\"roberta-fine-tuned-2\",\"zip\",'/kaggle/working/roberta-fine-tuned-2')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:53:15.505782Z","iopub.execute_input":"2023-05-12T09:53:15.506262Z","iopub.status.idle":"2023-05-12T10:00:10.288432Z","shell.execute_reply.started":"2023-05-12T09:53:15.506226Z","shell.execute_reply":"2023-05-12T10:00:10.287215Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/roberta-fine-tuned-2.zip'"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2-best\", save_format='h5')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:06:24.426264Z","iopub.execute_input":"2023-05-12T10:06:24.426727Z","iopub.status.idle":"2023-05-12T10:06:40.506795Z","shell.execute_reply.started":"2023-05-12T10:06:24.426692Z","shell.execute_reply":"2023-05-12T10:06:40.505341Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"model = tf.keras.models.load_model('/kaggle/working/roberta-fine-tuned-2-best')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:07:36.737706Z","iopub.execute_input":"2023-05-12T10:07:36.738837Z","iopub.status.idle":"2023-05-12T10:07:59.902966Z","shell.execute_reply.started":"2023-05-12T10:07:36.738795Z","shell.execute_reply":"2023-05-12T10:07:59.901400Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"# Read the access token from the HF_TOKEN environment variable; never hard-code it in the notebook.\n# A token committed in an earlier revision of this cell is compromised and must be revoked on the Hub.\n!huggingface-cli login --token 
$HF_TOKEN","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:12:13.025974Z","iopub.execute_input":"2023-05-12T10:12:13.026917Z","iopub.status.idle":"2023-05-12T10:12:15.351277Z","shell.execute_reply.started":"2023-05-12T10:12:13.026877Z","shell.execute_reply":"2023-05-12T10:12:15.349659Z"},"trusted":true},"execution_count":55,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nToken will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\nToken is valid.\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta-best')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import HfApi\napi = HfApi()\napi.upload_folder(\n folder_path=\"/kaggle/working/\",\n repo_id=\"shivansh-ka/Toxic-Comment-Classifier-Multi\",\n repo_type=\"space\",\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = 
from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T06:59:23.928089Z","iopub.execute_input":"2023-05-12T06:59:23.928495Z","iopub.status.idle":"2023-05-12T06:59:56.375479Z","shell.execute_reply.started":"2023-05-12T06:59:23.928466Z","shell.execute_reply":"2023-05-12T06:59:56.374295Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\nconfig.json not found in HuggingFace Hub.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"84f3f3229b3e42668708162e27df3168"}},"metadata":{}}]},{"cell_type":"code","source":"preds = m.predict(test_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:06:50.246933Z","iopub.execute_input":"2023-05-12T07:06:50.247789Z","iopub.status.idle":"2023-05-12T07:29:11.940923Z","shell.execute_reply.started":"2023-05-12T07:06:50.247752Z","shell.execute_reply":"2023-05-12T07:29:11.939745Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"499/499 [==============================] - 1341s 3s/step\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}