Spaces:

shivansh-ka
/

Toxic-Comment-Classifier

Sleeping

File size: 9,672 Bytes

0734f8e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c30c254",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import tensorflow as tf\n",
    "#import tensorflow_gpu\n",
    "import urllib\n",
    "from tensorflow.keras.layers import TextVectorization\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding\n",
    "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy\n",
    "from sklearn.metrics import roc_auc_score, f1_score\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import re\n",
    "import string\n",
    "nltk.download('stopwords')\n",
    "nltk.download('omw-1.4')\n",
    "nltk.download('wordnet')\n",
    "nltk.download('wordnet2022')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2487874b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tf_tpu_or_gpu(device: str='gpu'):\n",
    "    if device.lower() == 'gpu':\n",
    "        print(\"Setting up GPU.....\")\n",
    "        device_name = tf.test.gpu_device_name()\n",
    "        if \"GPU\" not in device_name:\n",
    "            print(\"GPU device not found\")\n",
    "        print('Found GPU at: {}'.format(device_name))\n",
    "        config = tf.compat.v1.ConfigProto() \n",
    "        config.gpu_options.allow_growth = True \n",
    "        sess = tf.compat.v1.Session(config=config) \n",
    "        tf.compat.v1.keras.backend.set_session(sess)\n",
    "        print(config)\n",
    "    \n",
    "    elif device.lower() == 'tpu':\n",
    "        print(\"Setting up TPU.....\")\n",
    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
    "        print('Running on TPU ', tpu.master())\n",
    "        tf.config.experimental_connect_to_cluster(tpu)\n",
    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
    "        tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
    "        print(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)\n",
    "\n",
    "    else:\n",
    "        raise Exception(\"Wrong Device Paramter Passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fb1df02",
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_tpu_or_gpu(device='tpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3377596d",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Config:\n",
    "    URL = f\"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv\"\n",
    "    FILE_NAME = \"toxic_comment_data.csv\"\n",
    "    VOCAB_SIZE = 200000\n",
    "    OUTPUT_DIM = 1800\n",
    "    BUFFER_SIZE = 160000\n",
    "    BATCH_SIZE = 16*8\n",
    "    EPOCHS = 10\n",
    "    BASE_LOG_DIR = \"log_dir\"\n",
    "    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,\"models\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ca4db64",
   "metadata": {},
   "outputs": [],
   "source": [
    "data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)\n",
    "data = pd.read_csv(\"/kaggle/working/toxic_comment_data.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f687273",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data['comment_text']\n",
    "y = data[data.columns[2:]].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "403cbd7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e012a53e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6db618c",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,\n",
    "                               output_sequence_length=Config.OUTPUT_DIM,\n",
    "                               output_mode='int')\n",
    "vectorizer.adapt(X.values)\n",
    "vectorized_text = vectorizer(X.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5b25ecc",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
    "dataset = dataset.cache()\n",
    "dataset = dataset.shuffle(Config.BUFFER_SIZE)\n",
    "dataset = dataset.batch(Config.BATCH_SIZE)\n",
    "dataset = dataset.prefetch(tf.data.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a60be072",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = dataset.take(int(len(dataset)*0.8))\n",
    "val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))\n",
    "#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d4c3d18",
   "metadata": {},
   "outputs": [],
   "source": [
    "def callbacks(base_dir=\".\"):\n",
    "    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=\"val_loss\", patience=2)\n",
    "    ckpt_file = os.path.join(Config.CHECKPOINT_DIR,\"model\")\n",
    "    os.makedirs(ckpt_file,exist_ok=True)\n",
    "\n",
    "    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(\n",
    "      filepath = ckpt_file,\n",
    "      save_best_only = True)\n",
    "\n",
    "    callback_list = [early_stopping,\n",
    "                     ckpt_cb]\n",
    "    return callback_list\n",
    "callbacks_list = callbacks()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cf70d04",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_model():\n",
    "    LAYERS = [\n",
    "              Embedding(Config.VOCAB_SIZE+1, 32),,\n",
    "              Bidirectional(LSTM(64, activation='tanh')),\n",
    "              Dense(128, activation='relu'),\n",
    "              Dense(256, activation='relu'),\n",
    "              Dense(128, activation='relu'),\n",
    "              Dense(6, activation='sigmoid')]\n",
    "    \n",
    "    model = Sequential(LAYERS)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26a56966",
   "metadata": {},
   "outputs": [],
   "source": [
    "with tpu_strategy.scope():\n",
    "    model = create_model()\n",
    "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
    "                  loss=tf.keras.losses.binary_crossentropy,\n",
    "                  metrics=AUC(multi_label=True, num_labels=6))\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "891727f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit(train, \n",
    "                    epochs=Config.EPOCHS,\n",
    "                    steps_per_epoch=len(train),\n",
    "                    validation_data=val,\n",
    "                    callbacks=callbacks_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "533cd762",
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_evaluation(model, pred_data: pd.Series, y_true):\n",
    "    y_pred = model.predict(pred_data)\n",
    "    try:\n",
    "        precision = precision_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        recall = recall_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        f1 = f1_score(y_true, (y_pred>0.5).astype(int), average=\"macro\")\n",
    "        auc = roc_auc_score(y_true, y_pred, average=\"macro\")\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        \n",
    "    print(f\"Precision: {precision}\\n\"\n",
    "          f\"Recall: {recall}\\n\"\n",
    "          f\"F1-Score: {f1}\\n\"\n",
    "          f\"ROC-AUC-Score: {auc}\")\n",
    "    return (precision, recall, f1, auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2f19754",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save(\"baseline_model_1.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "314be9bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = np.concatenate([x for x, y in train])\n",
    "y_train = np.concatenate([y for x, y in train])\n",
    "result_train=model_evaluation(model=model, pred_data=x_train, y_true=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec45f5ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_val = np.concatenate([x for x, y in val])\n",
    "y_val = np.concatenate([y for x, y in val])\n",
    "result_train=model_evaluation(model=model, pred_data=x_val, y_true=y_val)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}