{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8DfEKlbt_TMI", "outputId": "79666846-0691-490a-88b0-5f56f4769772" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive/\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive/')" ], "id": "8DfEKlbt_TMI" }, { "cell_type": "markdown", "metadata": { "id": "8c25705b" }, "source": [ "# 1. Import libraries and load data" ], "id": "8c25705b" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5b07ecd3" }, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import tensorflow as tf\n", "import numpy as np" ], "id": "5b07ecd3" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "91d7e1f0" }, "outputs": [], "source": [ "df = pd.read_csv(os.path.join(\"/content/drive/MyDrive/ColabNotebooks/data\", \"train.csv\"))" ], "id": "91d7e1f0" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 815 }, "id": "1be479a4", "outputId": "88d487c7-8f13-43fe-e866-3c472a6f03d9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id comment_text \\\n", "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n", "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n", "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n", "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n", "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n", "... ... ... \n", "159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n", "159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n", "159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... 
\n", "159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n", "159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n", "\n", " toxic severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", "... ... ... ... ... ... ... \n", "159566 0 0 0 0 0 0 \n", "159567 0 0 0 0 0 0 \n", "159568 0 0 0 0 0 0 \n", "159569 0 0 0 0 0 0 \n", "159570 0 0 0 0 0 0 \n", "\n", "[159571 rows x 8 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
...........................
159566ffe987279560d7ff\":::::And for the second time of asking, when ...000000
159567ffea4adeee384e90You should be ashamed of yourself \\n\\nThat is ...000000
159568ffee36eab5c267c9Spitzer \\n\\nUmm, theres no actual article for ...000000
159569fff125370e4aaaf3And it looks like it was actually you who put ...000000
159570fff46fc426af1f9a\"\\nAnd ... I really don't think you understand...000000
\n", "

159571 rows × 8 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 4 } ], "source": [ "df" ], "id": "1be479a4" }, { "cell_type": "markdown", "metadata": { "id": "e352d92f" }, "source": [ "# 2. Preprocessing" ], "id": "e352d92f" }, { "cell_type": "markdown", "metadata": { "id": "dc5fe893" }, "source": [ "## 2.1. Data overview" ], "id": "dc5fe893" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "ea6fd11e", "outputId": "adb8a890-565d-4e5b-da14-d7f11db89735" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " toxic severe_toxic obscene threat insult identity_hate\n", "0 0 0 0 0 0 0\n", "1 0 0 0 0 0 0\n", "2 0 0 0 0 0 0\n", "3 0 0 0 0 0 0\n", "4 0 0 0 0 0 0\n", "... ... ... ... ... ... ...\n", "159566 0 0 0 0 0 0\n", "159567 0 0 0 0 0 0\n", "159568 0 0 0 0 0 0\n", "159569 0 0 0 0 0 0\n", "159570 0 0 0 0 0 0\n", "\n", "[159571 rows x 6 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toxicsevere_toxicobscenethreatinsultidentity_hate
0000000
1000000
2000000
3000000
4000000
.....................
159566000000
159567000000
159568000000
159569000000
159570000000
\n", "

159571 rows × 6 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 5 } ], "source": [ "df[df.columns[2:]]" ], "id": "ea6fd11e" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 389 }, "id": "7eb94a81", "outputId": "c765800b-02a7-4e91-fe92-ae13b8d943ba" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id comment_text \\\n", "6 0002bcb3da6cb337 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK \n", "12 0005c987bdfc9d4b Hey... what is it..\\n@ | talk .\\nWhat is it...... \n", "16 0007e25b2121310b Bye! \\n\\nDon't look, come or think of comming ... \n", "42 001810bf8c45bf5f You are gay or antisemmitian? \\n\\nArchangel WH... \n", "43 00190820581d90ce FUCK YOUR FILTHY MOTHER IN THE ASS, DRY! \n", "\n", " toxic severe_toxic obscene threat insult identity_hate \n", "6 1 1 1 0 1 0 \n", "12 1 0 0 0 0 0 \n", "16 1 0 0 0 0 0 \n", "42 1 0 1 0 1 1 \n", "43 1 0 1 0 1 0 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
60002bcb3da6cb337COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK111010
120005c987bdfc9d4bHey... what is it..\\n@ | talk .\\nWhat is it......100000
160007e25b2121310bBye! \\n\\nDon't look, come or think of comming ...100000
42001810bf8c45bf5fYou are gay or antisemmitian? \\n\\nArchangel WH...101011
4300190820581d90ceFUCK YOUR FILTHY MOTHER IN THE ASS, DRY!101010
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 6 } ], "source": [ "df.loc[df.iloc[:, 2]==1].head()" ], "id": "7eb94a81" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 87 }, "id": "2bb35d57", "outputId": "c9531968-a5f4-4348-a833-4a366ee59010" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'Hey... what is it..\\n@ | talk .\\nWhat is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?\\n\\nAsk Sityush to clean up his behavior than issue me nonsensical warnings...'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 7 } ], "source": [ "df.iloc[12].comment_text" ], "id": "2bb35d57" }, { "cell_type": "markdown", "metadata": { "id": "1fdd25c4" }, "source": [ "## 2.2. Data preprocessing" ], "id": "1fdd25c4" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c8bd9d59" }, "outputs": [], "source": [ "from tensorflow.keras.layers import TextVectorization" ], "id": "c8bd9d59" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b8c03840", "outputId": "d16ec2c2-b1d1-4956-a11b-6f8e1960a5b8" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',\n", " 'insult', 'identity_hate'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 9 } ], "source": [ "df.columns" ], "id": "b8c03840" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2e64c456" }, "outputs": [], "source": [ "X = df.comment_text\n", "y = df.iloc[:,2:].values" ], "id": "2e64c456" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c924ed65" }, "outputs": [], 
"source": [ "# number of words in vocab\n", "MAX_VOCAB = 200000" ], "id": "c924ed65" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d9e74b26" }, "outputs": [], "source": [ "vectorizer = TextVectorization(max_tokens=MAX_VOCAB, \n", " output_sequence_length=1800, \n", " output_mode='int')" ], "id": "d9e74b26" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b89a019a" }, "outputs": [], "source": [ "vectorizer.adapt(X.values)" ], "id": "b89a019a" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "832c78b5", "outputId": "c5d8489f-1b6e-4bcc-e4e8-42359d85c4ce" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 14 } ], "source": [ "vectorizer('Hello world, welcome to this project')[:6]" ], "id": "832c78b5" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d90fea8a" }, "outputs": [], "source": [ "processed_text = vectorizer(X.values)" ], "id": "d90fea8a" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9891f1b3", "outputId": "16715f82-cc03-4bc9-bc4c-0d964111d0d3" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 16 } ], "source": [ "processed_text" ], "id": "9891f1b3" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9176a3c0" }, "outputs": [], "source": [ "# MCSHBAP - map, cache, shuffle, batch, prefetch\n", "# from_tensor_slices OR list_file\n", "data = tf.data.Dataset.from_tensor_slices((processed_text, y))\n", "data = data.cache()\n", "data = data.shuffle(160000)\n", "data = data.batch(16)\n", "data = data.prefetch(8) # prevent bottleneck" ], "id": "9176a3c0" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "042126d5" }, "outputs": [], "source": [ "batch_X, batch_y = 
data.as_numpy_iterator().next()" ], "id": "042126d5" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5b73aea2", "outputId": "be6a586c-2d6e-459a-8748-ae4b4ec03125" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(16, 1800)" ] }, "metadata": {}, "execution_count": 19 } ], "source": [ "batch_X.shape" ], "id": "5b73aea2" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8286ce71" }, "outputs": [], "source": [ "train = data.take(int(len(data) * .7))\n", "val = data.skip(int(len(data) * .7)).take(int(len(data)*.2))\n", "test = data.take(int(len(data) * .9)).take(int(len(data)*.1))" ], "id": "8286ce71" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f06e8067", "outputId": "8dadd560-5bfb-4d58-8301-d9f56f30a0b0" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "6981" ] }, "metadata": {}, "execution_count": 21 } ], "source": [ "len(train)" ], "id": "f06e8067" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "74d5fb4e", "outputId": "7ddc0d55-360e-4283-a955-ce3dab49fc07" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(array([[ 5495, 51, 29, ..., 0, 0, 0],\n", " [ 33, 7, 69, ..., 0, 0, 0],\n", " [ 24, 1805, 2256, ..., 0, 0, 0],\n", " ...,\n", " [ 46, 1377, 31, ..., 0, 0, 0],\n", " [ 4354, 41514, 8, ..., 0, 0, 0],\n", " [ 215, 8, 477, ..., 0, 0, 0]]),\n", " array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " 
[0, 0, 0, 0, 0, 0]]))" ] }, "metadata": {}, "execution_count": 22 } ], "source": [ "train.as_numpy_iterator().next()" ], "id": "74d5fb4e" }, { "cell_type": "markdown", "metadata": { "id": "-8f_Bi-OAc03" }, "source": [ "# 3. Building model" ], "id": "-8f_Bi-OAc03" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ItiVy4-S1pK5" }, "outputs": [], "source": [ "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding" ], "id": "ItiVy4-S1pK5" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8U9TmSbxAvEw" }, "outputs": [], "source": [ "model = Sequential()\n", "model.add(Embedding(MAX_VOCAB + 1, 32))\n", "model.add(Bidirectional(LSTM(32, activation='tanh')))\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dense(256, activation='relu'))\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dense(64, activation='relu'))\n", "model.add(Dense(32, activation='relu'))\n", "model.add(Dense(6, activation='sigmoid'))" ], "id": "8U9TmSbxAvEw" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pF_pooL4CY91" }, "outputs": [], "source": [ "model.compile(loss='BinaryCrossentropy', optimizer='Adam')" ], "id": "pF_pooL4CY91" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZtPm1Gp2GJza", "outputId": "222a35f6-4ad4-4c16-a240-3a18c0392525" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential_2\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding_2 (Embedding) (None, None, 32) 6400032 \n", " \n", " bidirectional_2 (Bidirectio (None, 64) 16640 \n", " nal) \n", " \n", " dense_12 (Dense) (None, 128) 8320 \n", " \n", " dense_13 (Dense) (None, 256) 33024 \n", " \n", " dense_14 
(Dense) (None, 128) 32896 \n", " \n", " dense_15 (Dense) (None, 64) 8256 \n", " \n", " dense_16 (Dense) (None, 32) 2080 \n", " \n", " dense_17 (Dense) (None, 6) 198 \n", " \n", "=================================================================\n", "Total params: 6,501,446\n", "Trainable params: 6,501,446\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ], "id": "ZtPm1Gp2GJza" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Cu-uCQaEJpjK", "outputId": "dd00becf-d085-47d2-ad04-fc121471ebef" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10\n", "6981/6981 [==============================] - 642s 92ms/step - loss: 0.0645 - val_loss: 0.0441\n", "Epoch 2/10\n", "6981/6981 [==============================] - 639s 91ms/step - loss: 0.0458 - val_loss: 0.0398\n", "Epoch 3/10\n", "6981/6981 [==============================] - 660s 94ms/step - loss: 0.0412 - val_loss: 0.0366\n", "Epoch 4/10\n", "6981/6981 [==============================] - 639s 91ms/step - loss: 0.0371 - val_loss: 0.0335\n", "Epoch 5/10\n", "6981/6981 [==============================] - 648s 93ms/step - loss: 0.0335 - val_loss: 0.0297\n", "Epoch 6/10\n", "6981/6981 [==============================] - 634s 91ms/step - loss: 0.0307 - val_loss: 0.0261\n", "Epoch 7/10\n", "6981/6981 [==============================] - 634s 91ms/step - loss: 0.0278 - val_loss: 0.0254\n", "Epoch 8/10\n", "6981/6981 [==============================] - 634s 91ms/step - loss: 0.0252 - val_loss: 0.0231\n", "Epoch 9/10\n", "6981/6981 [==============================] - 623s 89ms/step - loss: 0.0234 - val_loss: 0.0193\n", "Epoch 10/10\n", "6981/6981 [==============================] - 627s 90ms/step - loss: 0.0214 - val_loss: 0.0197\n" ] } ], "source": [ "history = model.fit(train, epochs=10, validation_data=val)" ], "id": "Cu-uCQaEJpjK" }, { 
"cell_type": "code", "execution_count": null, "metadata": { "id": "Ylqg0nwFGPBL" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ], "id": "Ylqg0nwFGPBL" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cD_u8JR4OYFL", "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "outputId": "61bab891-c2a2-44f4-afa4-c17e7aa37f05" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ], "source": [ "plt.figure(figsize=(8, 5))\n", "pd.DataFrame(history.history).plot()\n", "plt.show()" ], "id": "cD_u8JR4OYFL" }, { "cell_type": "markdown", "metadata": { "id": "OJxNheOEVGoD" }, "source": [ "# 4. Make predictions" ], "id": "OJxNheOEVGoD" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qAlM31wVVFIx", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "86d60e93-348e-478b-991c-d5e86693157a" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 64 } ], "source": [ "text = vectorizer(\"you shit\")\n", "text" ], "id": "qAlM31wVVFIx" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5Nlk_v_Da-Pi", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ad328b76-840f-44e9-d048-23d6d5443cd9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[ 7, 318, 0, ..., 0, 0, 0]])" ] }, "metadata": {}, "execution_count": 65 } ], "source": [ "np.expand_dims(text, 0)" ], "id": "5Nlk_v_Da-Pi" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ReideBKOVhAY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5e6e9aab-332b-4de0-a590-f55d2dc6bfdf" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.9876286 , 0.15251058, 0.9701179 , 0.0023339 , 0.33286613,\n", " 0.00344882]], dtype=float32)" ] }, "metadata": {}, "execution_count": 66 } ], "source": [ "res = model.predict(np.expand_dims(text, 0))\n", "res" ], "id": "ReideBKOVhAY" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-uAI_l6XVvMC", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "76562f4d-0884-4e9b-96e9-351bc933a66e" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n", " 
'identity_hate'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 67 } ], "source": [ "df.columns[2:]" ], "id": "-uAI_l6XVvMC" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ROi-r6MGVT1T" }, "outputs": [], "source": [ "batch_X, batch_y = test.as_numpy_iterator().next()" ], "id": "ROi-r6MGVT1T" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vcTgLwQjYehR", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8b25d0d8-bbe4-49ac-e67f-e8a929524bae" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [1, 0, 1, 0, 1, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]])" ] }, "metadata": {}, "execution_count": 69 } ], "source": [ "pred = (model.predict(batch_X) > 0.5).astype(int)\n", "pred" ], "id": "vcTgLwQjYehR" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kVWGgNWxc1LY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8be7ac60-e9d2-4007-9c06-358f1a58ab89" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0])" ] }, "metadata": {}, "execution_count": 70 } ], "source": [ "pred = pred.flatten()\n", "pred" ], "id": "kVWGgNWxc1LY" }, { "cell_type": "markdown", "metadata": { "id": "INW-U2pcaXHV" }, "source": [ "# 5. 
Evaluate model" ], "id": "INW-U2pcaXHV" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "6UfuO4WBaWre" }, "outputs": [], "source": [
 "from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, BinaryAccuracy"
], "id": "6UfuO4WBaWre" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "zJ-1rJDuaJCp" }, "outputs": [], "source": [
 "# Streaming metrics, accumulated batch by batch over the test split\n",
 "pre = Precision()\n",
 "re = Recall()\n",
 "# The six labels are independent 0/1 flags, so score every thresholded sigmoid\n",
 "# output on its own. CategoricalAccuracy compares argmax positions instead,\n",
 "# which collapses each flattened batch to a single hit-or-miss.\n",
 "acc = BinaryAccuracy()"
], "id": "zJ-1rJDuaJCp" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "sQFmLI5JbQJZ" }, "outputs": [], "source": [
 "for X_true, y_true in test.as_numpy_iterator():\n",
 "    # verbose=0: one progress bar per batch would flood the cell output\n",
 "    pred = model.predict(X_true, verbose=0)\n",
 "\n",
 "    # Flatten (batch, 6) -> (batch * 6,) so each label counts as one decision\n",
 "    y_true = y_true.flatten()\n",
 "    pred = pred.flatten()\n",
 "\n",
 "    pre.update_state(y_true, pred)\n",
 "    re.update_state(y_true, pred)\n",
 "    acc.update_state(y_true, pred)"
], "id": "sQFmLI5JbQJZ" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "TRs7GXOddNAw", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "95910681-d680-4272-94bd-c6a94b4bfcc0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Precision: 0.9102380275726318, Recall: 0.9139072895050049, Accuracy: 0.49949848651885986\n" ] } ], "source": [
 "print(f\"Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}\")"
], "id": "TRs7GXOddNAw" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "1oEUJDL5eymH" }, "outputs": [], "source": [
 "model.save('toxic-detect.h5')"
], "id": "1oEUJDL5eymH" },
{ "cell_type": "markdown", "metadata": { "id": "jFglatzteIXT" }, "source": [ "# 6. 
Test and Gradio" ], "id": "jFglatzteIXT" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Tg_jFNCOdC3V" }, "outputs": [], "source": [
 "%pip install -q gradio jinja2"
], "id": "Tg_jFNCOdC3V" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "dKH2Er6Eenim" }, "outputs": [], "source": [
 "import gradio as gr"
], "id": "dKH2Er6Eenim" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "JES3zWnRfHKt" }, "outputs": [], "source": [
 "model = tf.keras.models.load_model('toxic-detect.h5')"
], "id": "JES3zWnRfHKt" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "q_zuX1vVfYHq" }, "outputs": [], "source": [
 "def evaluate_comment(Comment):\n",
 "    \"\"\"Classify one comment and report each label on its own line.\n",
 "\n",
 "    Args:\n",
 "        Comment: raw comment text typed into the Gradio textbox.\n",
 "\n",
 "    Returns:\n",
 "        One '<label>: Violate' or '<label>: None' line per label column of\n",
 "        df; a label is reported as violated when its score exceeds 0.5.\n",
 "    \"\"\"\n",
 "    # Wrap in a list so the vectorizer yields a batch containing one sequence\n",
 "    processed_comment = vectorizer([Comment])\n",
 "    scores = model.predict(processed_comment, verbose=0)[0]\n",
 "\n",
 "    return ''.join(\n",
 "        '{}: {}\\n'.format(label, 'Violate' if score > 0.5 else 'None')\n",
 "        for label, score in zip(df.columns[2:], scores)\n",
 "    )"
], "id": "q_zuX1vVfYHq" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "TpJeqs__gsCh" }, "outputs": [], "source": [
 "# gr.Textbox replaces the gr.inputs.Textbox namespace that current Gradio\n",
 "# releases no longer ship\n",
 "interface = gr.Interface(\n",
 "    fn=evaluate_comment,\n",
 "    inputs=gr.Textbox(lines=4, placeholder='Comment to evaluate'),\n",
 "    outputs='text',\n",
 ")"
], "id": "TpJeqs__gsCh" },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "a3DOdPazhGuW" }, "outputs": [], "source": [
 "interface.launch(share=True)"
], "id": "a3DOdPazhGuW" }
], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }