{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# Toxic comment classification with a bidirectional LSTM\n",
    "\n",
    "This notebook downloads the Jigsaw toxic-comment training CSV, cleans and vectorises the comment text, trains a multi-label bidirectional LSTM on the selected accelerator, and reports macro precision / recall / F1 / ROC-AUC on the training and validation splits.\n",
    "\n",
    "Run it top to bottom on a fresh kernel (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c30c254",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import tensorflow as tf\n",
    "import urllib\n",
    "import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded\n",
    "from tensorflow.keras.layers import TextVectorization\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC\n",
    "from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import re\n",
    "import string\n",
    "\n",
    "# Corpora read by Text_Cleaner (stop-word list and WordNet lemmatizer data).\n",
    "nltk.download('stopwords')\n",
    "nltk.download('omw-1.4')\n",
    "nltk.download('wordnet')\n",
    "nltk.download('wordnet2022')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-accelerator",
   "metadata": {},
   "source": [
    "## Accelerator setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2487874b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tf_tpu_or_gpu(device: str = 'gpu'):\n",
    "    \"\"\"Initialise the requested accelerator and return a distribution strategy.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    device : str\n",
    "        'gpu' or 'tpu' (case-insensitive).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tf.distribute.Strategy\n",
    "        A TPUStrategy for 'tpu'. For 'gpu' (also when no GPU is found) the\n",
    "        current default strategy, so callers can always write\n",
    "        `with strategy.scope():`.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `device` is neither 'gpu' nor 'tpu'.\n",
    "    \"\"\"\n",
    "    device = device.lower()\n",
    "\n",
    "    if device == 'gpu':\n",
    "        print(\"Setting up GPU.....\")\n",
    "        device_name = tf.test.gpu_device_name()\n",
    "        if \"GPU\" not in device_name:\n",
    "            # Stop here: there is nothing to configure, and falling through\n",
    "            # would report an empty device name as \"found\".\n",
    "            print(\"GPU device not found\")\n",
    "            return tf.distribute.get_strategy()\n",
    "        print('Found GPU at: {}'.format(device_name))\n",
    "        # Let TensorFlow grow GPU memory on demand instead of reserving it all.\n",
    "        config = tf.compat.v1.ConfigProto()\n",
    "        config.gpu_options.allow_growth = True\n",
    "        sess = tf.compat.v1.Session(config=config)\n",
    "        tf.compat.v1.keras.backend.set_session(sess)\n",
    "        print(config)\n",
    "        return tf.distribute.get_strategy()\n",
    "\n",
    "    if device == 'tpu':\n",
    "        print(\"Setting up TPU.....\")\n",
    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
    "        print('Running on TPU ', tpu.master())\n",
    "        tf.config.experimental_connect_to_cluster(tpu)\n",
    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
    "        # Returned so later cells can build the model inside its scope.\n",
    "        return tpu_strategy\n",
    "\n",
    "    raise ValueError(\"Wrong Device Parameter Passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fb1df02",
   "metadata": {},
   "outputs": [],
   "source": [
    "strategy = tf_tpu_or_gpu(device='tpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-config",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3377596d",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Config:\n",
    "    \"\"\"Settings a reader may want to tune; every later cell reads from here.\"\"\"\n",
    "    URL = \"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
    "    FILE_NAME = \"toxic_comment_data.csv\"  # local copy of URL\n",
    "    VOCAB_SIZE = 200000       # max_tokens of the TextVectorization layer\n",
    "    OUTPUT_DIM = 1800         # output_sequence_length: token ids kept per comment\n",
    "    NUM_LABELS = 6            # label columns in the CSV == sigmoid output units\n",
    "    BUFFER_SIZE = 160000      # shuffle buffer, in examples\n",
    "    BATCH_SIZE = 16*8         # presumably 16 per replica x 8 TPU replicas -- confirm\n",
    "    TRAIN_FRACTION = 0.8      # share of batches used for training; the rest is validation\n",
    "    SHUFFLE_BATCHES = 64      # per-epoch shuffle window over training batches\n",
    "    EPOCHS = 10\n",
    "    SEED = 42\n",
    "    BASE_LOG_DIR = \"log_dir\"\n",
    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, \"models\")\n",
    "\n",
    "\n",
    "# Seed NumPy and TensorFlow so shuffling and weight initialisation are repeatable.\n",
    "np.random.seed(Config.SEED)\n",
    "tf.random.set_seed(Config.SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ca4db64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download once; re-runs reuse the local copy.\n",
    "if not os.path.exists(Config.FILE_NAME):\n",
    "    urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
    "\n",
    "# Read the file that was just written instead of a hard-coded absolute path.\n",
    "data = pd.read_csv(Config.FILE_NAME)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f687273",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data['comment_text']\n",
    "# Every column after the first two is treated as a binary label.\n",
    "y = data[data.columns[2:]].values\n",
    "assert y.shape[1] == Config.NUM_LABELS, \"unexpected number of label columns\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "403cbd7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e012a53e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-clean",
   "metadata": {},
   "source": [
    "## Clean and vectorise the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d383e72a",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Text_Cleaner:\n",
    "    \"\"\"Normalise raw comment text before vectorisation.\n",
    "\n",
    "    `clean_text` applies, in order: newline removal + lower-casing,\n",
    "    punctuation removal, English stop-word removal, WordNet lemmatization\n",
    "    and a final strip.\n",
    "\n",
    "    NOTE(review): punctuation is stripped before stop words are matched, so\n",
    "    stop words containing an apostrophe can no longer match -- confirm this\n",
    "    ordering is intended.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, data):\n",
    "        # data: object supporting .apply(func) over strings (a pandas Series here).\n",
    "        self.data = data\n",
    "        self.STOPWORDS = stopwords.words('english')\n",
    "        # Set copy of STOPWORDS: constant-time membership test per token.\n",
    "        self._stopword_set = frozenset(self.STOPWORDS)\n",
    "        self.wordnet = WordNetLemmatizer()\n",
    "        # Translation table built once instead of on every call.\n",
    "        self._punct_table = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "    def new_line_code(self, x: str) -> str:\n",
    "        \"\"\"Replace newlines with spaces, trim both ends and lower-case.\"\"\"\n",
    "        return x.replace(\"\\n\", ' ').strip().lower()\n",
    "\n",
    "    def remove_punctuations(self, x: str) -> str:\n",
    "        \"\"\"Delete every character listed in `string.punctuation`.\"\"\"\n",
    "        return x.translate(self._punct_table)\n",
    "\n",
    "    def remove_stopwords(self, x: str) -> str:\n",
    "        \"\"\"Drop whitespace-separated tokens that are English stop words.\"\"\"\n",
    "        return ' '.join(word for word in x.split() if word not in self._stopword_set)\n",
    "\n",
    "    def lemmatization(self, x: str) -> str:\n",
    "        \"\"\"Lemmatize each whitespace-separated token with WordNet.\"\"\"\n",
    "        return ' '.join(self.wordnet.lemmatize(word) for word in x.split())\n",
    "\n",
    "    def clean_text(self):\n",
    "        \"\"\"Run the full pipeline over `self.data`, store and return the result.\"\"\"\n",
    "        steps = (self.new_line_code,\n",
    "                 self.remove_punctuations,\n",
    "                 self.remove_stopwords,\n",
    "                 self.lemmatization,\n",
    "                 str.strip)\n",
    "        for step in steps:\n",
    "            self.data = self.data.apply(step)\n",
    "        return self.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b121fd12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# New name so re-running this cell never re-cleans already cleaned text.\n",
    "X_clean = Text_Cleaner(X).clean_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81c860cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5b374af",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
    "                               output_mode='int')\n",
    "vectorizer.adapt(X_clean.values)\n",
    "vectorized_text = vectorizer(X_clean.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5b25ecc",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
    "dataset = dataset.cache()\n",
    "# Shuffle once, in a fixed seeded order. With the default per-iteration\n",
    "# reshuffle, take()/skip() below would pick different examples every epoch\n",
    "# (validation rows leak into training) and two passes over the same split\n",
    "# would not return rows in the same order.\n",
    "dataset = dataset.shuffle(Config.BUFFER_SIZE,\n",
    "                          seed=Config.SEED,\n",
    "                          reshuffle_each_iteration=False)\n",
    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a60be072",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_batches = len(dataset)\n",
    "n_train_batches = int(n_batches * Config.TRAIN_FRACTION)\n",
    "\n",
    "train = dataset.take(n_train_batches)\n",
    "# Everything after the training batches, including the final partial batch.\n",
    "val = dataset.skip(n_train_batches)\n",
    "\n",
    "assert len(train) + len(val) == n_batches"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-model",
   "metadata": {},
   "source": [
    "## Model and training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d4c3d18",
   "metadata": {},
   "outputs": [],
   "source": [
    "def callbacks(base_dir=\".\"):\n",
    "    \"\"\"Build the Keras callbacks used during training.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    base_dir : str\n",
    "        Directory under which `Config.CHECKPOINT_DIR` is created.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        `[EarlyStopping, ModelCheckpoint]`.\n",
    "    \"\"\"\n",
    "    # Stop after 2 epochs without val_loss improvement and keep the best weights.\n",
    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\",\n",
    "                                                      patience=2,\n",
    "                                                      restore_best_weights=True)\n",
    "\n",
    "    # Create only the parent directory; Keras writes the checkpoint path itself.\n",
    "    ckpt_dir = os.path.join(base_dir, Config.CHECKPOINT_DIR)\n",
    "    os.makedirs(ckpt_dir, exist_ok=True)\n",
    "    ckpt_file = os.path.join(ckpt_dir, \"model\")\n",
    "\n",
    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
    "        filepath=ckpt_file,\n",
    "        save_best_only=True)\n",
    "\n",
    "    return [early_stopping, ckpt_cb]\n",
    "\n",
    "\n",
    "callbacks_list = callbacks()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cf70d04",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_model():\n",
    "    \"\"\"Return the uncompiled multi-label classifier.\n",
    "\n",
    "    Architecture: token embedding -> two stacked bidirectional LSTMs ->\n",
    "    three ReLU dense layers -> `Config.NUM_LABELS` independent sigmoid outputs.\n",
    "    \"\"\"\n",
    "    LAYERS = [\n",
    "        # Explicit input spec so the model is built and summary() works before fit().\n",
    "        tf.keras.Input(shape=(Config.OUTPUT_DIM,), dtype=\"int64\"),\n",
    "        Embedding(Config.VOCAB_SIZE+1, 32),\n",
    "        Bidirectional(LSTM(64, return_sequences=True, activation='tanh')),\n",
    "        Bidirectional(LSTM(32)),\n",
    "        Dense(128, activation='relu'),\n",
    "        Dense(256, activation='relu'),\n",
    "        Dense(128, activation='relu'),\n",
    "        Dense(Config.NUM_LABELS, activation='sigmoid')]\n",
    "\n",
    "    model = Sequential(LAYERS)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26a56966",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Variables (model, optimizer, metrics) must be created inside the strategy scope.\n",
    "with strategy.scope():\n",
    "    model = create_model()\n",
    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
    "                  loss=tf.keras.losses.binary_crossentropy,\n",
    "                  metrics=[AUC(multi_label=True, num_labels=Config.NUM_LABELS)])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "891727f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-order training batches every epoch. `train` itself stays deterministic\n",
    "# so it can be re-read for evaluation below.\n",
    "history = model.fit(train.shuffle(Config.SHUFFLE_BATCHES),\n",
    "                    epochs=Config.EPOCHS,\n",
    "                    validation_data=val,\n",
    "                    callbacks=callbacks_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-evaluate",
   "metadata": {},
   "source": [
    "## Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "533cd762",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset_to_numpy(batched_ds):\n",
    "    \"\"\"Materialise a batched (features, labels) dataset as two NumPy arrays.\n",
    "\n",
    "    Both arrays are filled in a single pass so row i of the features always\n",
    "    belongs to row i of the labels.\n",
    "    \"\"\"\n",
    "    features, labels = [], []\n",
    "    for x_batch, y_batch in batched_ds.as_numpy_iterator():\n",
    "        features.append(x_batch)\n",
    "        labels.append(y_batch)\n",
    "    return np.concatenate(features), np.concatenate(labels)\n",
    "\n",
    "\n",
    "def model_evaluation(model, pred_data, y_true, threshold: float = 0.5):\n",
    "    \"\"\"Print and return macro-averaged precision, recall, F1 and ROC-AUC.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : object with a `predict` method returning per-label scores.\n",
    "    pred_data : input accepted by `model.predict`.\n",
    "    y_true : array of binary ground-truth labels, one column per label.\n",
    "    threshold : float\n",
    "        Scores strictly above this value count as positive predictions.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        `(precision, recall, f1, auc)`; `auc` is NaN when it cannot be computed.\n",
    "    \"\"\"\n",
    "    y_pred = model.predict(pred_data)\n",
    "    y_label = (y_pred > threshold).astype(int)\n",
    "\n",
    "    precision = precision_score(y_true, y_label, average=\"macro\")\n",
    "    recall = recall_score(y_true, y_label, average=\"macro\")\n",
    "    f1 = f1_score(y_true, y_label, average=\"macro\")\n",
    "    try:\n",
    "        auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
    "    except ValueError as e:\n",
    "        # Report the failure and still return the other metrics instead of\n",
    "        # failing later on an unassigned variable.\n",
    "        print(e)\n",
    "        auc = float(\"nan\")\n",
    "\n",
    "    print(f\"Precision: {precision}\\n\"\n",
    "          f\"Recall: {recall}\\n\"\n",
    "          f\"F1-Score: {f1}\\n\"\n",
    "          f\"ROC-AUC-Score: {auc}\")\n",
    "    return (precision, recall, f1, auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2f19754",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save(\"model_3.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "314be9bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train = dataset_to_numpy(train)\n",
    "result_train = model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec45f5ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_val, y_val = dataset_to_numpy(val)\n",
    "result_val = model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}