{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install nltk scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:13:06.118200Z",
     "iopub.status.busy": "2023-05-03T14:13:06.117322Z",
     "iopub.status.idle": "2023-05-03T14:13:36.869507Z",
     "shell.execute_reply": "2023-05-03T14:13:36.868619Z",
     "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "#import seaborn as sns\n",
    "import tensorflow as tf\n",
    "#import tensorflow_gpu\n",
    "import urllib\n",
    "from tensorflow.keras.layers import TextVectorization\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
    "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import re\n",
    "import string\n",
    "nltk.download('stopwords')\n",
    "nltk.download('omw-1.4')\n",
    "nltk.download('wordnet')\n",
    "nltk.download('wordnet2022')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tf_tpu_or_gpu(device: str='gpu'):\n",
    "    if device.lower() == 'gpu':\n",
    "        print(\"Setting up GPU.....\")\n",
    "        device_name = tf.test.gpu_device_name()\n",
    "        if \"GPU\" not in device_name:\n",
    "            print(\"GPU device not found\")\n",
    "        print('Found GPU at: {}'.format(device_name))\n",
    "        \n",
    "        config = tf.compat.v1.ConfigProto() \n",
    "        config.gpu_options.allow_growth = True \n",
    "        sess = tf.compat.v1.Session(config=config) \n",
    "        tf.compat.v1.keras.backend.set_session(sess)\n",
    "        \n",
    "        print(config)\n",
    "    \n",
    "    elif device.lower() == 'tpu':\n",
    "        print(\"Setting up TPU.....\")\n",
    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
    "        print('Running on TPU ', tpu.master())\n",
    "        tf.config.experimental_connect_to_cluster(tpu)\n",
    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
    "        \n",
    "    else:\n",
    "        raise Exception(\"Wrong Device Paramter Passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_tpu_or_gpu(device='tpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:10.072253Z",
     "iopub.status.busy": "2023-05-03T14:16:10.071138Z",
     "iopub.status.idle": "2023-05-03T14:16:19.830833Z",
     "shell.execute_reply": "2023-05-03T14:16:19.829780Z",
     "shell.execute_reply.started": "2023-05-03T14:16:10.072215Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on TPU  \n",
      "INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\n",
      "INFO:tensorflow:Initializing the TPU system: local\n",
      "INFO:tensorflow:Finished initializing TPU system.\n",
      "INFO:tensorflow:Found TPU system:\n",
      "INFO:tensorflow:*** Num TPU Cores: 8\n",
      "INFO:tensorflow:*** Num TPU Workers: 1\n",
      "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n",
      "REPLICAS:  8\n"
     ]
    }
   ],
   "source": [
    "tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
    "print('Running on TPU ', tpu.master())\n",
    "tf.config.experimental_connect_to_cluster(tpu)\n",
    "tf.tpu.experimental.initialize_tpu_system(tpu)\n",
    "tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
    "print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "device_name = tf.test.gpu_device_name()\n",
    "if \"GPU\" not in device_name:\n",
    "    print(\"GPU device not found\")\n",
    "print('Found GPU at: {}'.format(device_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "config = tf.compat.v1.ConfigProto() \n",
    "config.gpu_options.allow_growth = True \n",
    "sess = tf.compat.v1.Session(config=config) \n",
    "tf.compat.v1.keras.backend.set_session(sess)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:24.940878Z",
     "iopub.status.busy": "2023-05-03T14:16:24.940140Z",
     "iopub.status.idle": "2023-05-03T14:16:24.946837Z",
     "shell.execute_reply": "2023-05-03T14:16:24.945707Z",
     "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z"
    }
   },
   "outputs": [],
   "source": [
    "class Config:\n",
    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
    "    VOCAB_SIZE = 200000\n",
    "    OUTPUT_DIM = 1800\n",
    "    BUFFER_SIZE = 160000\n",
    "    BATCH_SIZE = 16*8\n",
    "    EPOCHS = 10\n",
    "    BASE_LOG_DIR = \"log_dir\"\n",
    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:29.171506Z",
     "iopub.status.busy": "2023-05-03T14:16:29.170711Z",
     "iopub.status.idle": "2023-05-03T14:16:30.613189Z",
     "shell.execute_reply": "2023-05-03T14:16:30.612012Z",
     "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "      <th>toxic</th>\n",
       "      <th>severe_toxic</th>\n",
       "      <th>obscene</th>\n",
       "      <th>threat</th>\n",
       "      <th>insult</th>\n",
       "      <th>identity_hate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000997932d777bf</td>\n",
       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000103f0d9cfb60f</td>\n",
       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000113f07ec002fd</td>\n",
       "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0001b41b1c6bb37e</td>\n",
       "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0001d958c54c6e35</td>\n",
       "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                       comment_text  toxic   \n",
       "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0  \\\n",
       "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n",
       "2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n",
       "3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n",
       "4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n",
       "\n",
       "   severe_toxic  obscene  threat  insult  identity_hate  \n",
       "0             0        0       0       0              0  \n",
       "1             0        0       0       0              0  \n",
       "2             0        0       0       0              0  \n",
       "3             0        0       0       0              0  \n",
       "4             0        0       0       0              0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
    "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:37.492444Z",
     "iopub.status.busy": "2023-05-03T14:16:37.491342Z",
     "iopub.status.idle": "2023-05-03T14:16:37.533400Z",
     "shell.execute_reply": "2023-05-03T14:16:37.532235Z",
     "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 159571 entries, 0 to 159570\n",
      "Data columns (total 8 columns):\n",
      " #   Column         Non-Null Count   Dtype \n",
      "---  ------         --------------   ----- \n",
      " 0   id             159571 non-null  object\n",
      " 1   comment_text   159571 non-null  object\n",
      " 2   toxic          159571 non-null  int64 \n",
      " 3   severe_toxic   159571 non-null  int64 \n",
      " 4   obscene        159571 non-null  int64 \n",
      " 5   threat         159571 non-null  int64 \n",
      " 6   insult         159571 non-null  int64 \n",
      " 7   identity_hate  159571 non-null  int64 \n",
      "dtypes: int64(6), object(2)\n",
      "memory usage: 9.7+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:41.586932Z",
     "iopub.status.busy": "2023-05-03T14:16:41.585997Z",
     "iopub.status.idle": "2023-05-03T14:16:41.618902Z",
     "shell.execute_reply": "2023-05-03T14:16:41.617979Z",
     "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id               0\n",
       "comment_text     0\n",
       "toxic            0\n",
       "severe_toxic     0\n",
       "obscene          0\n",
       "threat           0\n",
       "insult           0\n",
       "identity_hate    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:44.561198Z",
     "iopub.status.busy": "2023-05-03T14:16:44.560414Z",
     "iopub.status.idle": "2023-05-03T14:16:44.586487Z",
     "shell.execute_reply": "2023-05-03T14:16:44.585582Z",
     "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>toxic</th>\n",
       "      <td>144277</td>\n",
       "      <td>15294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>severe_toxic</th>\n",
       "      <td>157976</td>\n",
       "      <td>1595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>obscene</th>\n",
       "      <td>151122</td>\n",
       "      <td>8449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>threat</th>\n",
       "      <td>159093</td>\n",
       "      <td>478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insult</th>\n",
       "      <td>151694</td>\n",
       "      <td>7877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>identity_hate</th>\n",
       "      <td>158166</td>\n",
       "      <td>1405</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    0      1\n",
       "toxic          144277  15294\n",
       "severe_toxic   157976   1595\n",
       "obscene        151122   8449\n",
       "threat         159093    478\n",
       "insult         151694   7877\n",
       "identity_hate  158166   1405"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:51.639830Z",
     "iopub.status.busy": "2023-05-03T14:16:51.639059Z",
     "iopub.status.idle": "2023-05-03T14:16:51.658065Z",
     "shell.execute_reply": "2023-05-03T14:16:51.657049Z",
     "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "toxic value count\n",
      "--------------------\n",
      "0: 144277 | 90.42 %\n",
      "1: 15294 | 9.58 %\n",
      "\n",
      "severe_toxic value count\n",
      "--------------------\n",
      "0: 157976 | 99.0 %\n",
      "1: 1595 | 1.0 %\n",
      "\n",
      "obscene value count\n",
      "--------------------\n",
      "0: 151122 | 94.71 %\n",
      "1: 8449 | 5.29 %\n",
      "\n",
      "threat value count\n",
      "--------------------\n",
      "0: 159093 | 99.7 %\n",
      "1: 478 | 0.3 %\n",
      "\n",
      "insult value count\n",
      "--------------------\n",
      "0: 151694 | 95.06 %\n",
      "1: 7877 | 4.94 %\n",
      "\n",
      "identity_hate value count\n",
      "--------------------\n",
      "0: 158166 | 99.12 %\n",
      "1: 1405 | 0.88 %\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for column in data.columns:\n",
    "    if data[column].dtype != 'O':\n",
    "        value_count = data[column].value_counts()\n",
    "        print(f\"{column} value count\\n{'--'*10}\")\n",
    "        print(f\"0: {value_count[0]} | {round((value_count[0]/data.shape[0])*100,2)} %\\n\"\n",
    "              f\"1: {value_count[1]} | {round((value_count[1]/data.shape[0])*100,2)} %\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"text_len\"] = data[\"comment_text\"].apply(lambda x: len(x.split()))\n",
    "data[data[\"text_len\"]==data[\"text_len\"].max()]['comment_text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:16:58.642154Z",
     "iopub.status.busy": "2023-05-03T14:16:58.641279Z",
     "iopub.status.idle": "2023-05-03T14:16:58.648851Z",
     "shell.execute_reply": "2023-05-03T14:16:58.647773Z",
     "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z"
    }
   },
   "outputs": [],
   "source": [
    "X = data['comment_text']\n",
    "y = data[data.columns[2:]].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:17:02.919383Z",
     "iopub.status.busy": "2023-05-03T14:17:02.918865Z",
     "iopub.status.idle": "2023-05-03T14:17:02.927191Z",
     "shell.execute_reply": "2023-05-03T14:17:02.926293Z",
     "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         Explanation\\nWhy the edits made under my usern...\n",
       "1         D'aww! He matches this background colour I'm s...\n",
       "2         Hey man, I'm really not trying to edit war. It...\n",
       "3         \"\\nMore\\nI can't make any real suggestions on ...\n",
       "4         You, sir, are my hero. Any chance you remember...\n",
       "                                ...                        \n",
       "159566    \":::::And for the second time of asking, when ...\n",
       "159567    You should be ashamed of yourself \\n\\nThat is ...\n",
       "159568    Spitzer \\n\\nUmm, theres no actual article for ...\n",
       "159569    And it looks like it was actually you who put ...\n",
       "159570    \"\\nAnd ... I really don't think you understand...\n",
       "Name: comment_text, Length: 159571, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:17:08.246451Z",
     "iopub.status.busy": "2023-05-03T14:17:08.245491Z",
     "iopub.status.idle": "2023-05-03T14:17:08.252604Z",
     "shell.execute_reply": "2023-05-03T14:17:08.251608Z",
     "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 0, 0]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Text Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:17:25.208007Z",
     "iopub.status.busy": "2023-05-03T14:17:25.207157Z",
     "iopub.status.idle": "2023-05-03T14:17:25.220446Z",
     "shell.execute_reply": "2023-05-03T14:17:25.219390Z",
     "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z"
    }
   },
   "outputs": [],
   "source": [
    "class Text_Cleaner:\n",
    "    def __init__(self, data):\n",
    "        self.data = data\n",
    "        self.STOPWORDS = stopwords.words('english')\n",
    "        self.wordnet = WordNetLemmatizer()\n",
    "        \n",
    "    def new_line_code(self, x:str)->str:\n",
    "        pattern = \"\\n\"\n",
    "        x = re.sub(pattern,' ', x).strip().lower()\n",
    "        return x\n",
    "\n",
    "    def remove_punctuations(self, x:str)->str:\n",
    "        x = x.translate(str.maketrans('','',string.punctuation))\n",
    "        return x\n",
    "\n",
    "    def remove_stopwords(self, x:str)->str:\n",
    "        sent=[]\n",
    "        for word in x.split():\n",
    "            if word not in self.STOPWORDS:\n",
    "                sent.append(word)\n",
    "        return ' '.join(sent)\n",
    "\n",
    "    def lemmatization(self, x:str)->str:\n",
    "        sent=[]\n",
    "        for word in x.split():\n",
    "            sent.append(self.wordnet.lemmatize(word))\n",
    "        return ' '.join(sent)\n",
    "    \n",
    "    def clean_text(self):\n",
    "        self.data = self.data.apply(self.new_line_code)\n",
    "        self.data = self.data.apply(self.remove_punctuations)\n",
    "        self.data = self.data.apply(self.remove_stopwords)\n",
    "        self.data = self.data.apply(self.lemmatization)\n",
    "        self.data = self.data.apply(lambda x: x.strip())\n",
    "        return self.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:17:28.812213Z",
     "iopub.status.busy": "2023-05-03T14:17:28.811115Z",
     "iopub.status.idle": "2023-05-03T14:18:45.134664Z",
     "shell.execute_reply": "2023-05-03T14:18:45.133093Z",
     "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z"
    }
   },
   "outputs": [],
   "source": [
    "X = Text_Cleaner(X).clean_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:19:08.971107Z",
     "iopub.status.busy": "2023-05-03T14:19:08.969951Z",
     "iopub.status.idle": "2023-05-03T14:19:08.979371Z",
     "shell.execute_reply": "2023-05-03T14:19:08.978320Z",
     "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         explanation edits made username hardcore metal...\n",
       "1         daww match background colour im seemingly stuc...\n",
       "2         hey man im really trying edit war guy constant...\n",
       "3         cant make real suggestion improvement wondered...\n",
       "4                       sir hero chance remember page thats\n",
       "                                ...                        \n",
       "159566    second time asking view completely contradicts...\n",
       "159567       ashamed horrible thing put talk page 128611993\n",
       "159568    spitzer umm there actual article prostitution ...\n",
       "159569    look like actually put speedy first version de...\n",
       "159570    really dont think understand came idea bad rig...\n",
       "Name: comment_text, Length: 159571, dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Building"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
    "                               output_mode='int')\n",
    "vectorizer.adapt(X.values)\n",
    "vectorized_text = vectorizer(X.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:42:24.692312Z",
     "iopub.status.busy": "2023-05-03T14:42:24.691267Z",
     "iopub.status.idle": "2023-05-03T14:42:24.709520Z",
     "shell.execute_reply": "2023-05-03T14:42:24.708295Z",
     "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z"
    }
   },
   "outputs": [],
   "source": [
    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
    "dataset = dataset.cache()\n",
    "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n",
    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:42:27.187117Z",
     "iopub.status.busy": "2023-05-03T14:42:27.185929Z",
     "iopub.status.idle": "2023-05-03T14:42:27.196570Z",
     "shell.execute_reply": "2023-05-03T14:42:27.195443Z",
     "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z"
    }
   },
   "outputs": [],
   "source": [
    "train = dataset.take(int(len(dataset)*0.8))\n",
    "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:41:54.920944Z",
     "iopub.status.busy": "2023-05-03T14:41:54.920085Z",
     "iopub.status.idle": "2023-05-03T14:41:54.928526Z",
     "shell.execute_reply": "2023-05-03T14:41:54.927502Z",
     "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z"
    }
   },
   "outputs": [],
   "source": [
    "def create_model():\n",
    "    \n",
    "    LAYERS = [\n",
    "              Embedding(Config.VOCAB_SIZE+1, 32),\n",
    "              Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
    "              Bidirectional(LSTM(32)),\n",
    "              Dense(128, activation='relu'),\n",
    "              Dropout(0.1),\n",
    "              Dense(256, activation='relu'),\n",
    "              Dropout(0.1),\n",
    "              Dense(128, activation='relu'),\n",
    "              Dense(6, activation='sigmoid')]\n",
    "    \n",
    "    model = Sequential(LAYERS)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:41:41.900942Z",
     "iopub.status.busy": "2023-05-03T14:41:41.900504Z",
     "iopub.status.idle": "2023-05-03T14:41:41.908480Z",
     "shell.execute_reply": "2023-05-03T14:41:41.907187Z",
     "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z"
    }
   },
   "outputs": [],
   "source": [
    "def callbacks(base_dir=\".\"):\n",
    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
    "    ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
    "    os.makedirs(ckpt_file,exist_ok=True)\n",
    "\n",
    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
    "      filepath = ckpt_file,\n",
    "      save_best_only = True)\n",
    "\n",
    "    callback_list = [early_stopping,\n",
    "                     ckpt_cb]\n",
    "    return callback_list\n",
    "callbacks_list = callbacks()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:42:07.719948Z",
     "iopub.status.busy": "2023-05-03T14:42:07.719137Z",
     "iopub.status.idle": "2023-05-03T14:42:09.288990Z",
     "shell.execute_reply": "2023-05-03T14:42:09.287682Z",
     "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z"
    }
   },
   "outputs": [],
   "source": [
    "with tpu_strategy.scope():\n",
    "    model = create_model()\n",
    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
    "                  loss=tf.keras.losses.binary_crossentropy,\n",
    "                  metrics=AUC(multi_label=True, num_labels=6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:42:34.084064Z",
     "iopub.status.busy": "2023-05-03T14:42:34.083255Z",
     "iopub.status.idle": "2023-05-03T14:42:34.110375Z",
     "shell.execute_reply": "2023-05-03T14:42:34.109380Z",
     "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential_2\"\n",
      "_________________________________________________________________\n",
      " Layer (type)                Output Shape              Param #   \n",
      "=================================================================\n",
      " embedding_2 (Embedding)     (None, None, 32)          6400032   \n",
      "                                                                 \n",
      " bidirectional_4 (Bidirectio  (None, None, 128)        49664     \n",
      " nal)                                                            \n",
      "                                                                 \n",
      " bidirectional_5 (Bidirectio  (None, 64)               41216     \n",
      " nal)                                                            \n",
      "                                                                 \n",
      " dense_8 (Dense)             (None, 128)               8320      \n",
      "                                                                 \n",
      " dropout_4 (Dropout)         (None, 128)               0         \n",
      "                                                                 \n",
      " dense_9 (Dense)             (None, 256)               33024     \n",
      "                                                                 \n",
      " dropout_5 (Dropout)         (None, 256)               0         \n",
      "                                                                 \n",
      " dense_10 (Dense)            (None, 128)               32896     \n",
      "                                                                 \n",
      " dense_11 (Dense)            (None, 6)                 774       \n",
      "                                                                 \n",
      "=================================================================\n",
      "Total params: 6,565,926\n",
      "Trainable params: 6,565,926\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:20:19.051437Z",
     "iopub.status.busy": "2023-05-03T14:20:19.050592Z",
     "iopub.status.idle": "2023-05-03T14:20:19.057744Z",
     "shell.execute_reply": "2023-05-03T14:20:19.056746Z",
     "shell.execute_reply.started": "2023-05-03T14:20:19.051377Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T14:42:42.306143Z",
     "iopub.status.busy": "2023-05-03T14:42:42.305188Z",
     "iopub.status.idle": "2023-05-03T18:36:14.400588Z",
     "shell.execute_reply": "2023-05-03T18:36:14.399250Z",
     "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-03 14:42:57.854226: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n",
      "2023-05-03 14:42:58.165317: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - ETA: 0s - loss: 0.1688 - auc_2: 0.5909"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-03 15:05:36.690047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
      "2023-05-03 15:05:36.851778: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n",
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1425s 1s/step - loss: 0.1688 - auc_2: 0.5909 - val_loss: 0.0750 - val_auc_2: 0.9196\n",
      "Epoch 2/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0640 - auc_2: 0.9400"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1395s 1s/step - loss: 0.0640 - auc_2: 0.9400 - val_loss: 0.0548 - val_auc_2: 0.9532\n",
      "Epoch 3/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0524 - auc_2: 0.9594"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0524 - auc_2: 0.9594 - val_loss: 0.0484 - val_auc_2: 0.9597\n",
      "Epoch 4/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0466 - auc_2: 0.9672"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0466 - auc_2: 0.9672 - val_loss: 0.0426 - val_auc_2: 0.9729\n",
      "Epoch 5/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0440 - auc_2: 0.9715"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1395s 1s/step - loss: 0.0440 - auc_2: 0.9715 - val_loss: 0.0406 - val_auc_2: 0.9761\n",
      "Epoch 6/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0416 - auc_2: 0.9725"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0416 - auc_2: 0.9725 - val_loss: 0.0382 - val_auc_2: 0.9787\n",
      "Epoch 7/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0394 - auc_2: 0.9762"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0394 - auc_2: 0.9762 - val_loss: 0.0359 - val_auc_2: 0.9819\n",
      "Epoch 8/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0379 - auc_2: 0.9773"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0379 - auc_2: 0.9773 - val_loss: 0.0346 - val_auc_2: 0.9821\n",
      "Epoch 9/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0367 - auc_2: 0.9776"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1396s 1s/step - loss: 0.0367 - auc_2: 0.9776 - val_loss: 0.0336 - val_auc_2: 0.9827\n",
      "Epoch 10/10\n",
      "997/997 [==============================] - ETA: 0s - loss: 0.0357 - auc_2: 0.9782"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "997/997 [==============================] - 1395s 1s/step - loss: 0.0357 - auc_2: 0.9782 - val_loss: 0.0328 - val_auc_2: 0.9819\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(train, \n",
    "                    epochs=Config.EPOCHS,\n",
    "                    steps_per_epoch=len(train),\n",
    "                    validation_data=val,\n",
    "                    callbacks=callbacks_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T18:36:42.693133Z",
     "iopub.status.busy": "2023-05-03T18:36:42.692246Z",
     "iopub.status.idle": "2023-05-03T18:36:42.702544Z",
     "shell.execute_reply": "2023-05-03T18:36:42.701196Z",
     "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z"
    }
   },
   "outputs": [],
   "source": [
    "def model_evaluation(model, vectorizer: TextVectorization, pred_data: pd.Series, y_true):\n",
    "    #pred_data = Text_Cleaner(pred_data).clean_text()\n",
    "    #vectorized_text = vectorizer(pred_data)\n",
    "    y_pred = model.predict(pred_data)\n",
    "    try:\n",
    "        precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        \n",
    "    print(f\"Precision: {precision}\\n\"\n",
    "          f\"Recall: {recall}\\n\"\n",
    "          f\"F1-Score: {f1}\\n\"\n",
    "          f\"ROC-AUC-Score: {auc}\")\n",
    "    return (precision, recall, f1, auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.evaluate(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T18:36:28.884733Z",
     "iopub.status.busy": "2023-05-03T18:36:28.883953Z",
     "iopub.status.idle": "2023-05-03T18:36:29.233282Z",
     "shell.execute_reply": "2023-05-03T18:36:29.231964Z",
     "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z"
    }
   },
   "outputs": [],
   "source": [
    "model.save(\"model_4.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T18:51:24.530412Z",
     "iopub.status.busy": "2023-05-03T18:51:24.529307Z",
     "iopub.status.idle": "2023-05-03T19:20:36.675080Z",
     "shell.execute_reply": "2023-05-03T19:20:36.673739Z",
     "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3988/3988 [==============================] - 1747s 438ms/step\n",
      "Precision: 0.034067329786671804\n",
      "Recall: 0.03396435372259718\n",
      "F1-Score: 0.03375883387877523\n",
      "ROC-AUC-Score: 0.4963643308231378\n"
     ]
    }
   ],
   "source": [
    "x_train = np.concatenate([x for x, y in train])\n",
    "y_train = np.concatenate([y for x, y in train])\n",
    "result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_train, y_true=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-05-03T18:49:02.718178Z",
     "iopub.status.busy": "2023-05-03T18:49:02.717234Z",
     "iopub.status.idle": "2023-05-03T18:49:50.438077Z",
     "shell.execute_reply": "2023-05-03T18:49:50.436458Z",
     "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "996/996 [==============================] - 43s 42ms/step\n",
      "Precision: 0.03615509646190422\n",
      "Recall: 0.03674059129986899\n",
      "F1-Score: 0.03625622443975915\n",
      "ROC-AUC-Score: 0.4868083116383068\n"
     ]
    }
   ],
   "source": [
    "x_val = np.concatenate([x for x, y in val])\n",
    "y_val = np.concatenate([y for x, y in val])\n",
    "result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_val, y_true=y_val)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}