# Load the Jigsaw toxic-comment training data (path points into a mounted Google Drive folder).
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Data-for-ML-projects/comment_toxicity/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv',
)

# Preview six randomly sampled rows; no seed is passed, so the rows differ between runs.
df.sample(n=6)

id comment_text toxic severe_toxic obscene threat insult identity_hate
88124 ebbccab2b7350f91 I have removed your 3 unsourced edits. Feel fr... 0 0 0 0 0 0
72052 c0e58fe6aaf5efe6 . Back to edits. In the coming days I will wor... 0 0 0 0 0 0
13275 231c58ea2f2ff9bf " \n\nDave1185 is just an insulting, rude edit... 1 0 0 0 0 0
46559 7c66f44d1ae93fc5 Alright, and also \n\nI want to make Mileena a... 0 0 0 0 0 0
15342 288bbd602abedf63 "\n\n My talk page \nI kind of feel silly, I'v... 0 0 0 0 0 0
115375 68ffaeaeb144964b (Presumably still active, last edit was in mid... 0 0 0 0 0 0 The more words we store, the larger our model is effectively going to be.\n", "\n", "If we keep a massive vocabulary, then we will need one word embedding for every single word in it." ], "metadata": { "id": "tQh2As2lYQ6h" } }, { "cell_type": "code", "source": [ "MAX_FEATURES = 200000 # maximum number of words kept in the vocabulary (vocab)" ], "metadata": { "id": "JIm2iAzhYHhn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer = TextVectorization(max_tokens= MAX_FEATURES, output_sequence_length=1800, output_mode='int')\n", "# every comment from our dataset is padded or truncated to exactly 1800 tokens (output_sequence_length). If we increase this limit that would also mean increasing the computational load\n" ], "metadata": { "id": "qUUmc2s3dFnR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer.adapt(X.values)" ], "metadata": { "id": "Du9O7d3WdgDv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# print(type(vectorizer.get_vocabulary())) # vectorizer.get_vocabulary() -> returns a python 'list' containing the words learned by the vectorizer\n", "# vectorizer.get_vocabulary() # taking a look at some of the words learned by the vectorizer\n", "print(vectorizer.get_vocabulary()[:25])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pqmfuNqEfDP1", "outputId": "498d43d6-9939-4567-cdaa-2699f05cd359" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is', 'that', 'in', 'it', 'for', 'this', 'not', 'on', 'be', 'as', 'have', 'are', 'your', 'with', 'if', 'article']\n" ] } ] }, { "cell_type": "code", "source": [ "# vectorizer(\"hello everyone how's it going\")" ], "metadata": { "id": "MHzsP-3NfW1v" }, "execution_count": null, "outputs": [] }, { 
"cell_type": "code", "source": [ "vectorized_text = vectorizer(X.values)" ], "metadata": { "id": "WQp7kXamrqxT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(vectorized_text)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xswHvLKutPYT", "outputId": "545cb3cd-def0-410d-8b25-cbd362693a67" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "tf.Tensor(\n", "[[ 645 76 2 ... 0 0 0]\n", " [ 1 54 2489 ... 0 0 0]\n", " [ 425 441 70 ... 0 0 0]\n", " ...\n", " [32445 7392 383 ... 0 0 0]\n", " [ 5 12 534 ... 0 0 0]\n", " [ 5 8 130 ... 0 0 0]], shape=(159571, 1800), dtype=int64)\n" ] } ] }, { "cell_type": "code", "source": [ "# Steps to creating a pipeline -> MCSHBAP - map, cache, shuffle, batch, prefetch\n", "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n", "dataset = dataset.cache()\n", "dataset = dataset.shuffle(160000)\n", "dataset = dataset.batch(16)\n", "dataset = dataset.prefetch(8) # helps prevent bottlenecks" ], "metadata": { "id": "kebPi3X0gX1k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "batch_X , batch_y = dataset.as_numpy_iterator().next()" ], "metadata": { "id": "DugdUJQFrg7E" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# batch_X will be our vectorized text examples\n", "batch_X.shape # note that we've got 16 examples, of 1800 words at max in each" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MSxyQ430sF_Y", "outputId": "fa0b322a-a177-4945-f722-49cf131b84f7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(16, 1800)" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "batch_y.shape # note we got 16 samples, and a vector of size 6 corresponding to each" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": 
"_zG1QQtDxo7V", "outputId": "cf28b30d-0a09-445f-d917-46ff00c978be" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(16, 6)" ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "markdown", "source": [ "## Creating our training, validation and test partitions" ], "metadata": { "id": "gZgMwlFsyR8v" } }, { "cell_type": "code", "source": [ "print(f\"No of batches in the dataset : {len(dataset)}\")\n", "print(f\"No of samples in the whole dataset : {len(dataset) * 16}\") # since each batch contains 16 samples" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nB8TcEzTykW-", "outputId": "62179c94-a210-4aa3-908b-ac7aa9afed0a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "No of batches in the dataset : 9974\n", "No of samples in the whole dataset : 159584\n" ] } ] }, { "cell_type": "code", "source": [ "# we'll use 70% of the length of our dataset for training\n", "train = dataset.take(int(len(dataset) * 0.7))\n", "\n", "# skip 70% of the datset and take the next 20% for the validation set\n", "val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))\n", "\n", "# skip 90% of the dataset and take the next 10% for the test set\n", "test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))\n" ], "metadata": { "id": "p34ekUDWxyBO" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(f\"Length of the train set : {len(train)}. 'Type' of the training set : {type(train)}\")\n", "print(f\"Length of the validation set : {len(val)}. 'Type' of the training set : {type(val)}\")\n", "print(f\"Length of the test set : {len(test)}. 
'Type' of the training set : {type(test)}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MorPEljt2u7d", "outputId": "9cf155af-e778-42f0-a788-e992e6690bc0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Length of the train set : 6981. 'Type' of the training set : \n", "Length of the validation set : 1994. 'Type' of the training set : \n", "Length of the test set : 997. 'Type' of the training set : \n" ] } ] }, { "cell_type": "markdown", "source": [ "## Building our neural network" ], "metadata": { "id": "vG2TaCvS31HY" } }, { "cell_type": "code", "source": [ "# Create a sequential model (import the necessary dependencies for the same)\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding" ], "metadata": { "id": "lPqW8SrQ3H2p" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# print(y[0])\n", "print(y.shape) # (159571, 6) => 159571 samples and 6 labels (a vector of 6 values) corresponding to each\n", "y[0] # taking a look at one of the samples" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JR_D26MB93b6", "outputId": "f4b90cc9-379d-4335-8945-4747b3818709" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(159571, 6)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "array([0, 0, 0, 0, 0, 0])" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "code", "source": [ "model = Sequential()\n", "# First - Create the embedding layer\n", "model.add(Embedding(MAX_FEATURES+1, 32))\n", "# Bidirectional LSTM Layer\n", "# 41:00 - 42:00 -> 'Bidirectional' is required because we want the neural network to read each sentence in both directions (forwards and backwards), so that a word is interpreted using the context on either side of it\n", "# e.g. 
-> The sentence - \"I don't hate you\" - the presence of \"don't\" before the word \"hate\" significantly affects the meaning of the sentence\n", "model.add(Bidirectional(LSTM(32, activation='tanh')))\n", "# 41:00 -> the reason that we're using 'tanh' as the activation function, instead of the immensely popular 'relu', is that TensorFlow's GPU-accelerated (cuDNN)\n", "# LSTM implementation is only used when the activation is tanh (this is something that is dictated by TensorFlow)\n", "\n", "# Feature extractor: fully connected layers\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dense(256, activation='relu'))\n", "model.add(Dense(128, activation='relu'))\n", "\n", "# Final layer\n", "# By having 6 units in the final output layer, the model outputs a vector of 6 values, i.e. the same shape as our target labels (each of which is a vector of 6 values)\n", "model.add(Dense(6, activation='sigmoid'))" ], "metadata": { "id": "LN89EAY64Hlf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 45:00 onwards - explanation of why we're using 'BinaryCrossentropy'\n", "\n", "model.compile(loss='BinaryCrossentropy', optimizer='Adam')" ], "metadata": { "id": "22yxOAlK-nns" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_jvnynfL-2N1", "outputId": "8b81832b-f790-4490-e1a0-72d8d1dfdc81" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding (Embedding) (None, None, 32) 6400032 \n", " \n", " bidirectional (Bidirectiona (None, 64) 16640 \n", " l) \n", " \n", " dense (Dense) (None, 128) 8320 \n", " \n", " dense_1 (Dense) (None, 256) 33024 \n", " \n", " dense_2 (Dense) (None, 128) 32896 
\n", " \n", " dense_3 (Dense) (None, 6) 774 \n", " \n", "=================================================================\n", "Total params: 6,491,686\n", "Trainable params: 6,491,686\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "# Assuming you have defined and compiled your model\n", "# model.save('/content/drive/MyDrive/Data-for-ML-projects/comment_toxicity')\n" ], "metadata": { "id": "kTIP5RFWGryj" }, "execution_count": null, "metadata": { "id": "XGF5aRzQ3edS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pred_res = model.predict(np.expand_dims(input_str, 0))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gnZY3AYW5ZJ9", "outputId": "68c3433f-391f-41e2-fd25-fd62d3096646" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1/1 [==============================] - 1s 1s/step\n" ] } ] }, { "cell_type": "code", "source": [ "pred_res" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "06aL99Gg5jQG", "outputId": "1870511d-2486-44fa-d7e2-d5760672d14a" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.993094 , 0.34385905, 0.9560297 , 0.0484327 , 0.8572675 ,\n", " 0.22735107]], dtype=float32)" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "source": [ "df.columns[2:]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6fn0SZkf5ylg", "outputId": "1ad5c0e8-067d-4373-9577-6c942bbc343e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n", " 'identity_hate'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "def score_comment(comment): # a function that we're 
going to hook into our Gradio model\n", " \"\"\"\n", " This function will take in a comment and then pass it through a prediction pipeline.\n", " \"\"\"\n", " vectorized_comment = vectorizer([comment]) # First, we pass the comment through a vectorizer to convert the text into a vector of numbers\n", " results = model.predict(vectorized_comment) # Second, we then run the vectorized text (i.e. the comment) through our model to get the predictions\n", "\n", " # We then unpack all the results. The loop below goes through all of the 6 columns corresponding to any comment which describe its characteristics, viz. 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'\n", " text = ''\n", " for idx, col in enumerate(df.columns[2:]):\n", " text += '{}: {}\\n'.format(col, results[0][idx] > 0.40)\n", "\n", " return text\n" ], "metadata": { "id": "1YmsVJfn53M_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "interface = gr.Interface(fn=score_comment,\n", " inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),\n", " outputs='text')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5-naT_gm7eGY", "outputId": "70879b8a-9562-4438-93ef-1297ae8d8ce6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":2: GradioDeprecationWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n", " inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),\n", ":2: GradioDeprecationWarning: `optional` parameter is deprecated, and it has no effect\n", " inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),\n", ":2: GradioDeprecationWarning: `numeric` parameter is deprecated, and it has no effect\n", " inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),\n" ] } ] }, { "cell_type": "code", "source": [ "interface.launch(share=True)" ], "metadata": { 
"colab": { "base_uri": "https://localhost:8080/", "height": 591 }, "id": "c-QSTB4y78Ds", "outputId": "ccf7a71a-6720-477c-ddc4-a3303aaea4f9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Colab notebook detected. 