{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install nltk scikit-learn" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:13:06.118200Z", "iopub.status.busy": "2023-05-03T14:13:06.117322Z", "iopub.status.idle": "2023-05-03T14:13:36.869507Z", "shell.execute_reply": "2023-05-03T14:13:36.868619Z", "shell.execute_reply.started": "2023-05-03T14:13:06.118149Z" }, "scrolled": true }, "outputs": [], "source": [ "import os\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "#import seaborn as sns\n", "import tensorflow as tf\n", "#import tensorflow_gpu\n", "import urllib\n", "from tensorflow.keras.layers import TextVectorization\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n", "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n", "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n", "\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "import re\n", "import string\n", "nltk.download('stopwords')\n", "nltk.download('omw-1.4')\n", "nltk.download('wordnet')\n", "nltk.download('wordnet2022')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tf_tpu_or_gpu(device: str='gpu'):\n", " if device.lower() == 'gpu':\n", " print(\"Setting up GPU.....\")\n", " device_name = tf.test.gpu_device_name()\n", " if \"GPU\" not in device_name:\n", " print(\"GPU device not found\")\n", " print('Found GPU at: {}'.format(device_name))\n", " \n", " config = tf.compat.v1.ConfigProto() \n", " config.gpu_options.allow_growth = True \n", " sess = tf.compat.v1.Session(config=config) \n", " tf.compat.v1.keras.backend.set_session(sess)\n", " \n", " print(config)\n", " \n", " elif device.lower() == 'tpu':\n", " print(\"Setting up TPU.....\")\n", " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n", " print('Running on TPU ', tpu.master())\n", " tf.config.experimental_connect_to_cluster(tpu)\n", " tf.tpu.experimental.initialize_tpu_system(tpu)\n", " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n", " print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n", " \n", " else:\n", " raise Exception(\"Wrong Device Paramter Passed\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_tpu_or_gpu(device='tpu')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:10.072253Z", "iopub.status.busy": "2023-05-03T14:16:10.071138Z", "iopub.status.idle": "2023-05-03T14:16:19.830833Z", "shell.execute_reply": "2023-05-03T14:16:19.829780Z", "shell.execute_reply.started": "2023-05-03T14:16:10.072215Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on TPU \n", "INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\n", "INFO:tensorflow:Initializing the TPU system: local\n", "INFO:tensorflow:Finished initializing TPU system.\n", "INFO:tensorflow:Found TPU system:\n", "INFO:tensorflow:*** Num TPU Cores: 8\n", "INFO:tensorflow:*** Num TPU Workers: 1\n", "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n", "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n", "REPLICAS: 8\n" ] } ], "source": [ "tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n", "print('Running on TPU ', tpu.master())\n", "tf.config.experimental_connect_to_cluster(tpu)\n", "tf.tpu.experimental.initialize_tpu_system(tpu)\n", "tpu_strategy = tf.distribute.TPUStrategy(tpu)\n", "print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "device_name = tf.test.gpu_device_name()\n", "if \"GPU\" not in device_name:\n", " print(\"GPU device not found\")\n", "print('Found GPU at: {}'.format(device_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "config = tf.compat.v1.ConfigProto() \n", "config.gpu_options.allow_growth = True \n", "sess = tf.compat.v1.Session(config=config) \n", "tf.compat.v1.keras.backend.set_session(sess)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:24.940878Z", "iopub.status.busy": "2023-05-03T14:16:24.940140Z", "iopub.status.idle": "2023-05-03T14:16:24.946837Z", "shell.execute_reply": "2023-05-03T14:16:24.945707Z", "shell.execute_reply.started": "2023-05-03T14:16:24.940845Z" } }, "outputs": [], "source": [ "class Config:\n", " URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n", " FILE_NAME = \"toxic_comment_data.csv\"\n", " VOCAB_SIZE = 200000\n", " OUTPUT_DIM = 1800\n", " BUFFER_SIZE = 160000\n", " BATCH_SIZE = 16*8\n", " EPOCHS = 10\n", " BASE_LOG_DIR = \"log_dir\"\n", " CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:29.171506Z", "iopub.status.busy": "2023-05-03T14:16:29.170711Z", "iopub.status.idle": "2023-05-03T14:16:30.613189Z", "shell.execute_reply": "2023-05-03T14:16:30.612012Z", "shell.execute_reply.started": "2023-05-03T14:16:29.171466Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
\n", "
" ], "text/plain": [ " id comment_text toxic \n", "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n", "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n", "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n", "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n", "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n", "\n", " severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n", "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:37.492444Z", "iopub.status.busy": "2023-05-03T14:16:37.491342Z", "iopub.status.idle": "2023-05-03T14:16:37.533400Z", "shell.execute_reply": "2023-05-03T14:16:37.532235Z", "shell.execute_reply.started": "2023-05-03T14:16:37.492404Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 159571 entries, 0 to 159570\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 159571 non-null object\n", " 1 comment_text 159571 non-null object\n", " 2 toxic 159571 non-null int64 \n", " 3 severe_toxic 159571 non-null int64 \n", " 4 obscene 159571 non-null int64 \n", " 5 threat 159571 non-null int64 \n", " 6 insult 159571 non-null int64 \n", " 7 identity_hate 159571 non-null int64 \n", "dtypes: int64(6), object(2)\n", "memory usage: 9.7+ MB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:41.586932Z", "iopub.status.busy": "2023-05-03T14:16:41.585997Z", "iopub.status.idle": "2023-05-03T14:16:41.618902Z", "shell.execute_reply": "2023-05-03T14:16:41.617979Z", "shell.execute_reply.started": "2023-05-03T14:16:41.586895Z" } }, "outputs": [ { "data": { "text/plain": [ "id 0\n", "comment_text 0\n", "toxic 0\n", "severe_toxic 0\n", "obscene 0\n", "threat 0\n", "insult 0\n", "identity_hate 0\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:44.561198Z", "iopub.status.busy": "2023-05-03T14:16:44.560414Z", "iopub.status.idle": "2023-05-03T14:16:44.586487Z", "shell.execute_reply": "2023-05-03T14:16:44.585582Z", "shell.execute_reply.started": "2023-05-03T14:16:44.561152Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
toxic14427715294
severe_toxic1579761595
obscene1511228449
threat159093478
insult1516947877
identity_hate1581661405
\n", "
" ], "text/plain": [ " 0 1\n", "toxic 144277 15294\n", "severe_toxic 157976 1595\n", "obscene 151122 8449\n", "threat 159093 478\n", "insult 151694 7877\n", "identity_hate 158166 1405" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:51.639830Z", "iopub.status.busy": "2023-05-03T14:16:51.639059Z", "iopub.status.idle": "2023-05-03T14:16:51.658065Z", "shell.execute_reply": "2023-05-03T14:16:51.657049Z", "shell.execute_reply.started": "2023-05-03T14:16:51.639796Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "toxic value count\n", "--------------------\n", "0: 144277 | 90.42 %\n", "1: 15294 | 9.58 %\n", "\n", "severe_toxic value count\n", "--------------------\n", "0: 157976 | 99.0 %\n", "1: 1595 | 1.0 %\n", "\n", "obscene value count\n", "--------------------\n", "0: 151122 | 94.71 %\n", "1: 8449 | 5.29 %\n", "\n", "threat value count\n", "--------------------\n", "0: 159093 | 99.7 %\n", "1: 478 | 0.3 %\n", "\n", "insult value count\n", "--------------------\n", "0: 151694 | 95.06 %\n", "1: 7877 | 4.94 %\n", "\n", "identity_hate value count\n", "--------------------\n", "0: 158166 | 99.12 %\n", "1: 1405 | 0.88 %\n", "\n" ] } ], "source": [ "for column in data.columns:\n", " if data[column].dtype != 'O':\n", " value_count = data[column].value_counts()\n", " print(f\"{column} value count\\n{'--'*10}\")\n", " print(f\"0: {value_count[0]} | {round((value_count[0]/data.shape[0])*100,2)} %\\n\"\n", " f\"1: {value_count[1]} | {round((value_count[1]/data.shape[0])*100,2)} %\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data[\"text_len\"] = data[\"comment_text\"].apply(lambda x: len(x.split()))\n", "data[data[\"text_len\"]==data[\"text_len\"].max()]['comment_text']" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:16:58.642154Z", "iopub.status.busy": "2023-05-03T14:16:58.641279Z", "iopub.status.idle": "2023-05-03T14:16:58.648851Z", "shell.execute_reply": "2023-05-03T14:16:58.647773Z", "shell.execute_reply.started": "2023-05-03T14:16:58.642119Z" } }, "outputs": [], "source": [ "X = data['comment_text']\n", "y = data[data.columns[2:]].values" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:17:02.919383Z", "iopub.status.busy": "2023-05-03T14:17:02.918865Z", "iopub.status.idle": "2023-05-03T14:17:02.927191Z", "shell.execute_reply": "2023-05-03T14:17:02.926293Z", "shell.execute_reply.started": "2023-05-03T14:17:02.919350Z" } }, "outputs": [ { "data": { "text/plain": [ "0 Explanation\\nWhy the edits made under my usern...\n", "1 D'aww! He matches this background colour I'm s...\n", "2 Hey man, I'm really not trying to edit war. It...\n", "3 \"\\nMore\\nI can't make any real suggestions on ...\n", "4 You, sir, are my hero. Any chance you remember...\n", " ... \n", "159566 \":::::And for the second time of asking, when ...\n", "159567 You should be ashamed of yourself \\n\\nThat is ...\n", "159568 Spitzer \\n\\nUmm, theres no actual article for ...\n", "159569 And it looks like it was actually you who put ...\n", "159570 \"\\nAnd ... I really don't think you understand...\n", "Name: comment_text, Length: 159571, dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:17:08.246451Z", "iopub.status.busy": "2023-05-03T14:17:08.245491Z", "iopub.status.idle": "2023-05-03T14:17:08.252604Z", "shell.execute_reply": "2023-05-03T14:17:08.251608Z", "shell.execute_reply.started": "2023-05-03T14:17:08.246414Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " ...,\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Text Preprocessing" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:17:25.208007Z", "iopub.status.busy": "2023-05-03T14:17:25.207157Z", "iopub.status.idle": "2023-05-03T14:17:25.220446Z", "shell.execute_reply": "2023-05-03T14:17:25.219390Z", "shell.execute_reply.started": "2023-05-03T14:17:25.207968Z" } }, "outputs": [], "source": [ "class Text_Cleaner:\n", " def __init__(self, data):\n", " self.data = data\n", " self.STOPWORDS = stopwords.words('english')\n", " self.wordnet = WordNetLemmatizer()\n", " \n", " def new_line_code(self, x:str)->str:\n", " pattern = \"\\n\"\n", " x = re.sub(pattern,' ', x).strip().lower()\n", " return x\n", "\n", " def remove_punctuations(self, x:str)->str:\n", " x = x.translate(str.maketrans('','',string.punctuation))\n", " return x\n", "\n", " def remove_stopwords(self, x:str)->str:\n", " sent=[]\n", " for word in x.split():\n", " if word not in self.STOPWORDS:\n", " sent.append(word)\n", " return ' '.join(sent)\n", "\n", " def lemmatization(self, x:str)->str:\n", " sent=[]\n", " for word in x.split():\n", " sent.append(self.wordnet.lemmatize(word))\n", " return ' '.join(sent)\n", " \n", " def clean_text(self):\n", " self.data = self.data.apply(self.new_line_code)\n", " self.data = self.data.apply(self.remove_punctuations)\n", " self.data = self.data.apply(self.remove_stopwords)\n", " self.data = self.data.apply(self.lemmatization)\n", " self.data = self.data.apply(lambda x: x.strip())\n", " return self.data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:17:28.812213Z", "iopub.status.busy": "2023-05-03T14:17:28.811115Z", "iopub.status.idle": "2023-05-03T14:18:45.134664Z", "shell.execute_reply": "2023-05-03T14:18:45.133093Z", "shell.execute_reply.started": "2023-05-03T14:17:28.812159Z" } }, "outputs": [], "source": [ "X = Text_Cleaner(X).clean_text()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:19:08.971107Z", "iopub.status.busy": "2023-05-03T14:19:08.969951Z", "iopub.status.idle": "2023-05-03T14:19:08.979371Z", "shell.execute_reply": "2023-05-03T14:19:08.978320Z", "shell.execute_reply.started": "2023-05-03T14:19:08.971065Z" } }, "outputs": [ { "data": { "text/plain": [ "0 explanation edits made username hardcore metal...\n", "1 daww match background colour im seemingly stuc...\n", "2 hey man im really trying edit war guy constant...\n", "3 cant make real suggestion improvement wondered...\n", "4 sir hero chance remember page thats\n", " ... \n", "159566 second time asking view completely contradicts...\n", "159567 ashamed horrible thing put talk page 128611993\n", "159568 spitzer umm there actual article prostitution ...\n", "159569 look like actually put speedy first version de...\n", "159570 really dont think understand came idea bad rig...\n", "Name: comment_text, Length: 159571, dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model Building" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n", " output_sequence_length=Config.OUTPUT_DIM,\n", " output_mode='int')\n", "vectorizer.adapt(X.values)\n", "vectorized_text = vectorizer(X.values)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:42:24.692312Z", "iopub.status.busy": "2023-05-03T14:42:24.691267Z", "iopub.status.idle": "2023-05-03T14:42:24.709520Z", "shell.execute_reply": "2023-05-03T14:42:24.708295Z", "shell.execute_reply.started": "2023-05-03T14:42:24.692272Z" } }, "outputs": [], "source": [ "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n", "dataset = dataset.cache()\n", "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n", "dataset = dataset.batch(Config.BATCH_SIZE)\n", "dataset = dataset.prefetch(tf.data.AUTOTUNE)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:42:27.187117Z", "iopub.status.busy": "2023-05-03T14:42:27.185929Z", "iopub.status.idle": "2023-05-03T14:42:27.196570Z", "shell.execute_reply": "2023-05-03T14:42:27.195443Z", "shell.execute_reply.started": "2023-05-03T14:42:27.187074Z" } }, "outputs": [], "source": [ "train = dataset.take(int(len(dataset)*0.8))\n", "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n", "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:41:54.920944Z", "iopub.status.busy": "2023-05-03T14:41:54.920085Z", "iopub.status.idle": "2023-05-03T14:41:54.928526Z", "shell.execute_reply": "2023-05-03T14:41:54.927502Z", "shell.execute_reply.started": "2023-05-03T14:41:54.920907Z" } }, "outputs": [], "source": [ "def create_model():\n", " \n", " LAYERS = [\n", " Embedding(Config.VOCAB_SIZE+1, 32),\n", " Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n", " Bidirectional(LSTM(32)),\n", " Dense(128, activation='relu'),\n", " Dropout(0.1),\n", " Dense(256, activation='relu'),\n", " Dropout(0.1),\n", " Dense(128, activation='relu'),\n", " Dense(6, activation='sigmoid')]\n", " \n", " model = Sequential(LAYERS)\n", " return model" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:41:41.900942Z", "iopub.status.busy": "2023-05-03T14:41:41.900504Z", "iopub.status.idle": "2023-05-03T14:41:41.908480Z", "shell.execute_reply": "2023-05-03T14:41:41.907187Z", "shell.execute_reply.started": "2023-05-03T14:41:41.900911Z" } }, "outputs": [], "source": [ "def callbacks(base_dir=\".\"):\n", " early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n", " ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n", " os.makedirs(ckpt_file,exist_ok=True)\n", "\n", " ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n", " filepath = ckpt_file,\n", " save_best_only = True)\n", "\n", " callback_list = [early_stopping,\n", " ckpt_cb]\n", " return callback_list\n", "callbacks_list = callbacks()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:42:07.719948Z", "iopub.status.busy": "2023-05-03T14:42:07.719137Z", "iopub.status.idle": "2023-05-03T14:42:09.288990Z", "shell.execute_reply": "2023-05-03T14:42:09.287682Z", "shell.execute_reply.started": "2023-05-03T14:42:07.719910Z" } }, "outputs": [], "source": [ "with tpu_strategy.scope():\n", " model = create_model()\n", " model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n", " loss=tf.keras.losses.binary_crossentropy,\n", " metrics=AUC(multi_label=True, num_labels=6))" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:42:34.084064Z", "iopub.status.busy": "2023-05-03T14:42:34.083255Z", "iopub.status.idle": "2023-05-03T14:42:34.110375Z", "shell.execute_reply": "2023-05-03T14:42:34.109380Z", "shell.execute_reply.started": "2023-05-03T14:42:34.084025Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_2\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding_2 (Embedding) (None, None, 32) 6400032 \n", " \n", " bidirectional_4 (Bidirectio (None, None, 128) 49664 \n", " nal) \n", " \n", " bidirectional_5 (Bidirectio (None, 64) 41216 \n", " nal) \n", " \n", " dense_8 (Dense) (None, 128) 8320 \n", " \n", " dropout_4 (Dropout) (None, 128) 0 \n", " \n", " dense_9 (Dense) (None, 256) 33024 \n", " \n", " dropout_5 (Dropout) (None, 256) 0 \n", " \n", " dense_10 (Dense) (None, 128) 32896 \n", " \n", " dense_11 (Dense) (None, 6) 774 \n", " \n", "=================================================================\n", "Total params: 6,565,926\n", "Trainable params: 6,565,926\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:20:19.051437Z", "iopub.status.busy": "2023-05-03T14:20:19.050592Z", "iopub.status.idle": "2023-05-03T14:20:19.057744Z", "shell.execute_reply": "2023-05-03T14:20:19.056746Z", "shell.execute_reply.started": "2023-05-03T14:20:19.051377Z" } }, "outputs": [ { "data": { "text/plain": [ "997" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T14:42:42.306143Z", "iopub.status.busy": "2023-05-03T14:42:42.305188Z", "iopub.status.idle": "2023-05-03T18:36:14.400588Z", "shell.execute_reply": "2023-05-03T18:36:14.399250Z", "shell.execute_reply.started": "2023-05-03T14:42:42.306107Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2023-05-03 14:42:57.854226: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n", "2023-05-03 14:42:58.165317: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - ETA: 0s - loss: 0.1688 - auc_2: 0.5909" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2023-05-03 15:05:36.690047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n", "2023-05-03 15:05:36.851778: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n", "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1425s 1s/step - loss: 0.1688 - auc_2: 0.5909 - val_loss: 0.0750 - val_auc_2: 0.9196\n", "Epoch 2/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0640 - auc_2: 0.9400" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1395s 1s/step - loss: 0.0640 - auc_2: 0.9400 - val_loss: 0.0548 - val_auc_2: 0.9532\n", "Epoch 3/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0524 - auc_2: 0.9594" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0524 - auc_2: 0.9594 - val_loss: 0.0484 - val_auc_2: 0.9597\n", "Epoch 4/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0466 - auc_2: 0.9672" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0466 - auc_2: 0.9672 - val_loss: 0.0426 - val_auc_2: 0.9729\n", "Epoch 5/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0440 - auc_2: 0.9715" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1395s 1s/step - loss: 0.0440 - auc_2: 0.9715 - val_loss: 0.0406 - val_auc_2: 0.9761\n", "Epoch 6/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0416 - auc_2: 0.9725" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0416 - auc_2: 0.9725 - val_loss: 0.0382 - val_auc_2: 0.9787\n", "Epoch 7/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0394 - auc_2: 0.9762" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0394 - auc_2: 0.9762 - val_loss: 0.0359 - val_auc_2: 0.9819\n", "Epoch 8/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0379 - auc_2: 0.9773" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0379 - auc_2: 0.9773 - val_loss: 0.0346 - val_auc_2: 0.9821\n", "Epoch 9/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0367 - auc_2: 0.9776" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1396s 1s/step - loss: 0.0367 - auc_2: 0.9776 - val_loss: 0.0336 - val_auc_2: 0.9827\n", "Epoch 10/10\n", "997/997 [==============================] - ETA: 0s - loss: 0.0357 - auc_2: 0.9782" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_16_layer_call_fn, lstm_cell_16_layer_call_and_return_conditional_losses, lstm_cell_17_layer_call_fn, lstm_cell_17_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: log_dir/models/model/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "997/997 [==============================] - 1395s 1s/step - loss: 0.0357 - auc_2: 0.9782 - val_loss: 0.0328 - val_auc_2: 0.9819\n" ] } ], "source": [ "history = model.fit(train, \n", " epochs=Config.EPOCHS,\n", " steps_per_epoch=len(train),\n", " validation_data=val,\n", " callbacks=callbacks_list)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T18:36:42.693133Z", "iopub.status.busy": "2023-05-03T18:36:42.692246Z", "iopub.status.idle": "2023-05-03T18:36:42.702544Z", "shell.execute_reply": "2023-05-03T18:36:42.701196Z", "shell.execute_reply.started": "2023-05-03T18:36:42.693095Z" } }, "outputs": [], "source": [ "def model_evaluation(model, vectorizer: TextVectorization, pred_data: pd.Series, y_true):\n", " #pred_data = Text_Cleaner(pred_data).clean_text()\n", " #vectorized_text = vectorizer(pred_data)\n", " y_pred = model.predict(pred_data)\n", " try:\n", " precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n", " recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n", " f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n", " auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n", " except Exception as e:\n", " print(e)\n", " \n", " print(f\"Precision: {precision}\\n\"\n", " f\"Recall: {recall}\\n\"\n", " f\"F1-Score: {f1}\\n\"\n", " f\"ROC-AUC-Score: {auc}\")\n", " return (precision, recall, f1, auc)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.evaluate(test)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T18:36:28.884733Z", "iopub.status.busy": "2023-05-03T18:36:28.883953Z", "iopub.status.idle": "2023-05-03T18:36:29.233282Z", "shell.execute_reply": "2023-05-03T18:36:29.231964Z", "shell.execute_reply.started": "2023-05-03T18:36:28.884694Z" } }, "outputs": [], "source": [ "model.save(\"model_4.h5\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T18:51:24.530412Z", "iopub.status.busy": "2023-05-03T18:51:24.529307Z", "iopub.status.idle": "2023-05-03T19:20:36.675080Z", "shell.execute_reply": "2023-05-03T19:20:36.673739Z", "shell.execute_reply.started": "2023-05-03T18:51:24.530375Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3988/3988 [==============================] - 1747s 438ms/step\n", "Precision: 0.034067329786671804\n", "Recall: 0.03396435372259718\n", "F1-Score: 0.03375883387877523\n", "ROC-AUC-Score: 0.4963643308231378\n" ] } ], "source": [ "x_train = np.concatenate([x for x, y in train])\n", "y_train = np.concatenate([y for x, y in train])\n", "result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_train, y_true=y_train)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "execution": { "iopub.execute_input": "2023-05-03T18:49:02.718178Z", "iopub.status.busy": "2023-05-03T18:49:02.717234Z", "iopub.status.idle": "2023-05-03T18:49:50.438077Z", "shell.execute_reply": "2023-05-03T18:49:50.436458Z", "shell.execute_reply.started": "2023-05-03T18:49:02.718132Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "996/996 [==============================] - 43s 42ms/step\n", "Precision: 0.03615509646190422\n", "Recall: 0.03674059129986899\n", "F1-Score: 0.03625622443975915\n", "ROC-AUC-Score: 0.4868083116383068\n" ] } ], "source": [ "x_val = np.concatenate([x for x, y in val])\n", "y_val = np.concatenate([y for x, y in val])\n", "result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_val, y_true=y_val)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }