Spaces:

Kartik17
/

TOXIC-COMMENT

Build error

App Files Files Community

Kartik17 committed on Apr 9

Commit

1b606c3

•

1 Parent(s): 368a008

Upload PROJECT TOXIC COMMENT ANALYZER.ipynb

Browse files

Files changed (1) hide show

PROJECT TOXIC COMMENT ANALYZER.ipynb +1486 -0

PROJECT TOXIC COMMENT ANALYZER.ipynb ADDED Viewed

	@@ -0,0 +1,1486 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a9a3a647",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import tensorflow as tf\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 167,
+   "id": "52960768",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>comment_text</th>\n",
+       "      <th>toxic</th>\n",
+       "      <th>severe_toxic</th>\n",
+       "      <th>obscene</th>\n",
+       "      <th>threat</th>\n",
+       "      <th>insult</th>\n",
+       "      <th>identity_hate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000997932d777bf</td>\n",
+       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>000103f0d9cfb60f</td>\n",
+       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>000113f07ec002fd</td>\n",
+       "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0001b41b1c6bb37e</td>\n",
+       "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0001d958c54c6e35</td>\n",
+       "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 id                                       comment_text  toxic  \\\n",
+       "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0   \n",
+       "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n",
+       "2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n",
+       "3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n",
+       "4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n",
+       "\n",
+       "   severe_toxic  obscene  threat  insult  identity_hate  \n",
+       "0             0        0       0       0              0  \n",
+       "1             0        0       0       0              0  \n",
+       "2             0        0       0       0              0  \n",
+       "3             0        0       0       0              0  \n",
+       "4             0        0       0       0              0  "
+      ]
+     },
+     "execution_count": 167,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 0s 327ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "data=pd.read_csv('train.csv')\n",
+    "data.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4bb87073",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 93.161.107.169\""
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['comment_text'][8]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c6e7509b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',\n",
+       "       'insult', 'identity_hate'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    " data.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2802af7a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(159571, 8)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "97449fcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "toxic            0\n",
+       "severe_toxic     0\n",
+       "obscene          0\n",
+       "threat           0\n",
+       "insult           0\n",
+       "identity_hate    0\n",
+       "Name: 9, dtype: int64"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[data.columns[2:]].iloc[9]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8844c1b7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bbd67b78",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "6d23f922",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.layers import TextVectorization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "a3d9e014",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x=data['comment_text']\n",
+    "y=data[data.columns[2:]].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "eb1eefc0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         Explanation\\nWhy the edits made under my usern...\n",
+       "1         D'aww! He matches this background colour I'm s...\n",
+       "2         Hey man, I'm really not trying to edit war. It...\n",
+       "3         \"\\nMore\\nI can't make any real suggestions on ...\n",
+       "4         You, sir, are my hero. Any chance you remember...\n",
+       "                                ...                        \n",
+       "159566    \":::::And for the second time of asking, when ...\n",
+       "159567    You should be ashamed of yourself \\n\\nThat is ...\n",
+       "159568    Spitzer \\n\\nUmm, theres no actual article for ...\n",
+       "159569    And it looks like it was actually you who put ...\n",
+       "159570    \"\\nAnd ... I really don't think you understand...\n",
+       "Name: comment_text, Length: 159571, dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "414f8a4c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       ...,\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0]], dtype=int64)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "70ec2244",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_features=200000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "b6a83b69",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "vectorizer=TextVectorization(max_tokens=max_features,\n",
+    "                            output_sequence_length=1800,\n",
+    "                            output_mode='int')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "ba246221",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['', '[UNK]']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vectorizer.get_vocabulary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "9648914d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\utils\\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "vectorizer.adapt(x.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "75b035a9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor: shape=(5,), dtype=int64, numpy=array([  19,    7, 3666, 2891,  338], dtype=int64)>"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vectorizer(\"have you watched breaking bad\")[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "8854984d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorized_text=vectorizer(x.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "9fb407a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=\n",
+       "array([[  645,    76,     2, ...,     0,     0,     0],\n",
+       "       [    1,    54,  2489, ...,     0,     0,     0],\n",
+       "       [  425,   441,    70, ...,     0,     0,     0],\n",
+       "       ...,\n",
+       "       [32445,  7392,   383, ...,     0,     0,     0],\n",
+       "       [    5,    12,   534, ...,     0,     0,     0],\n",
+       "       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vectorized_text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "0aa74efc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset=tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
+    "dataset=dataset.cache()\n",
+    "# Shuffle once, in a fixed order. The train/val/test splits are carved out of this\n",
+    "# stream with take()/skip(); with the default reshuffle_each_iteration=True the rows\n",
+    "# would be re-dealt on every pass, leaking training samples into validation and test.\n",
+    "dataset=dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)\n",
+    "dataset=dataset.batch(16)\n",
+    "dataset=dataset.prefetch(8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "ff040bf8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9973.1875"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "159571/16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "fd8b18f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_x, batch_y = dataset.as_numpy_iterator().next()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d81bb1af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(16, 1800)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "batch_x.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "2cfeca51",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(16, 6)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "batch_y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "9d8a90ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9974"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "5a111205",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6981"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "int(len(dataset)*.7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "34094209",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train=dataset.take(int(len(dataset)*.7))\n",
+    "val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))\n",
+    "test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "2e5369af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6981, 1994, 997)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(train),len(val),len(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "3bb32ca4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_generator=train.as_numpy_iterator()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "32f4500b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([[    73,      9,     12, ...,      0,      0,      0],\n",
+       "        [182862,     88,      7, ...,      0,      0,      0],\n",
+       "        [  4384,    274,    139, ...,      0,      0,      0],\n",
+       "        ...,\n",
+       "        [    14,      9,     21, ...,      0,      0,      0],\n",
+       "        [  1188,    399,    123, ...,      0,      0,      0],\n",
+       "        [ 46927,    175,    425, ...,      0,      0,      0]], dtype=int64),\n",
+       " array([[0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [1, 0, 1, 0, 1, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0],\n",
+       "        [0, 0, 0, 0, 0, 0]], dtype=int64))"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_generator.next()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "cbc9a9b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "6dd6bf3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model=Sequential()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "e33e5c86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.add(Embedding(max_features+1, 32))\n",
+    "model.add(Bidirectional(LSTM(32, activation='tanh')))\n",
+    "model.add(Dense(128, activation='relu'))\n",
+    "model.add(Dense(256, activation='relu'))\n",
+    "model.add(Dense(128, activation='relu'))\n",
+    "model.add(Dense(6, activation='sigmoid'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "6821b620",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\optimizers\\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The targets are six independent 0/1 labels. A bare 'accuracy' is resolved from the\n",
+    "# output shape to categorical (argmax) accuracy, which is not meaningful here;\n",
+    "# binary_accuracy thresholds every label separately.\n",
+    "model.compile(loss='BinaryCrossentropy', optimizer='adam', metrics=['binary_accuracy'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "f06f01e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential\"\n",
+      "_________________________________________________________________\n",
+      " Layer (type)                Output Shape              Param #   \n",
+      "=================================================================\n",
+      " embedding (Embedding)       (None, None, 32)          6400032   \n",
+      "                                                                 \n",
+      " bidirectional (Bidirection  (None, 64)                16640     \n",
+      " al)                                                             \n",
+      "                                                                 \n",
+      " dense (Dense)               (None, 128)               8320      \n",
+      "                                                                 \n",
+      " dense_1 (Dense)             (None, 256)               33024     \n",
+      "                                                                 \n",
+      " dense_2 (Dense)             (None, 128)               32896     \n",
+      "                                                                 \n",
+      " dense_3 (Dense)             (None, 6)                 774       \n",
+      "                                                                 \n",
+      "=================================================================\n",
+      "Total params: 6491686 (24.76 MB)\n",
+      "Trainable params: 6491686 (24.76 MB)\n",
+      "Non-trainable params: 0 (0.00 Byte)\n",
+      "_________________________________________________________________\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "376ceed5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10\n",
+      "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n",
+      "\n",
+      "6981/6981 [==============================] - 5071s 726ms/step - loss: 0.0635 - accuracy: 0.9855 - val_loss: 0.0452 - val_accuracy: 0.9946\n",
+      "Epoch 2/10\n",
+      "6981/6981 [==============================] - 4516s 647ms/step - loss: 0.0454 - accuracy: 0.9942 - val_loss: 0.0399 - val_accuracy: 0.9938\n",
+      "Epoch 3/10\n",
+      "6981/6981 [==============================] - 4100s 587ms/step - loss: 0.0407 - accuracy: 0.9889 - val_loss: 0.0373 - val_accuracy: 0.9941\n",
+      "Epoch 4/10\n",
+      "6981/6981 [==============================] - 4111s 589ms/step - loss: 0.0371 - accuracy: 0.9920 - val_loss: 0.0327 - val_accuracy: 0.9948\n",
+      "Epoch 5/10\n",
+      "6981/6981 [==============================] - 4691s 672ms/step - loss: 0.0334 - accuracy: 0.9941 - val_loss: 0.0302 - val_accuracy: 0.9940\n",
+      "Epoch 6/10\n",
+      "6981/6981 [==============================] - 5055s 724ms/step - loss: 0.0311 - accuracy: 0.9841 - val_loss: 0.0275 - val_accuracy: 0.9944\n",
+      "Epoch 7/10\n",
+      "6981/6981 [==============================] - 4508s 646ms/step - loss: 0.0277 - accuracy: 0.9937 - val_loss: 0.0245 - val_accuracy: 0.9930\n",
+      "Epoch 8/10\n",
+      "6981/6981 [==============================] - 4479s 642ms/step - loss: 0.0254 - accuracy: 0.9907 - val_loss: 0.0228 - val_accuracy: 0.9940\n",
+      "Epoch 9/10\n",
+      "6981/6981 [==============================] - 4501s 645ms/step - loss: 0.0228 - accuracy: 0.9892 - val_loss: 0.0193 - val_accuracy: 0.9950\n",
+      "Epoch 10/10\n",
+      "6981/6981 [==============================] - 4523s 648ms/step - loss: 0.0209 - accuracy: 0.9200 - val_loss: 0.0192 - val_accuracy: 0.9943\n"
+     ]
+    }
+   ],
+   "source": [
+    "history=model.fit(train, epochs=10, validation_data=val)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "cb6501e6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "997/997 [==============================] - 158s 146ms/step - loss: 0.0188 - accuracy: 0.9940\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[0.018809018656611443, 0.9939819574356079]"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.evaluate(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "92408998",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_batch, y_batch = test.as_numpy_iterator().next()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "1c555107",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 2s 2s/step\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [1, 0, 1, 0, 1, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0]])"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(model.predict(x_batch) > 0.5).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "26a06914",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [1, 0, 1, 0, 1, 0],\n",
+       "       [0, 0, 0, 0, 0, 0],\n",
+       "       [0, 0, 0, 0, 0, 0]], dtype=int64)"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_batch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "0ef7c06b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_text=vectorizer('I am coming to kill you pal')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "5bb057fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor: shape=(7,), dtype=int64, numpy=array([   8,   74,  939,    3,  950,    7, 5762], dtype=int64)>"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "input_text[:7]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "7ab223e7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch=test.as_numpy_iterator().next()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "3986d97b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 0s 78ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=model.predict(np.expand_dims(input_text,0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "5df2d7da",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n",
+       "       'identity_hate'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.columns[2:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "ee22bb73",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0.54140395, 0.00114176, 0.01782109, 0.10045966, 0.0319472 ,\n",
+       "        0.02094165]], dtype=float32)"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "res"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa7378c8",
+   "metadata": {},
+   "source": [
+    "## Save the Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "c2b08a8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save('finalproject.keras')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "71e114bc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\training.py:3103: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n",
+      "  saving_api.save_model(\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.save('finalprojecttoxic.h5')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6abdcdb8",
+   "metadata": {},
+   "source": [
+    "## Translate Non-English Text to English"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "id": "442cd16b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "95b31788",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "translator_german=pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-de-en\", tokenizer=\"Helsinki-NLP/opus-mt-de-en\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "7e882490",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "german=\"Hallo, wie heißt du?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "dcfefba8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"Hello, what's your name?\""
+      ]
+     },
+     "execution_count": 126,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "en_to_german=translator_german(german)\n",
+    "en_to_german[0]['translation_text']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "id": "ea54de34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "translator_spanish = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-es-en\", tokenizer=\"Helsinki-NLP/opus-mt-es-en\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "07f1c640",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spanish_text = \"hola como estas\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "76b5f447",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Hello, how are you?'"
+      ]
+     },
+     "execution_count": 124,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Run the Spanish -> English pipeline (translator_spanish) on the sample sentence and\n",
+    "# show the translated string from the first result.\n",
+    "en_to_spanish = translator_spanish(spanish_text)\n",
+    "en_to_spanish[0]['translation_text']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e08fc4e7",
+   "metadata": {},
+   "source": [
+    "## Test and Gradio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "7d5cdcb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "560ec8e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model=tf.keras.models.load_model('finalprojecttoxic.h5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "aaf4a3cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_str=vectorizer('Hey i freaking hate you!. I\\'m going to hurt you!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "id": "54761270",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 0s 88ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=model.predict(np.expand_dims(input_str,0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "ba15136b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0.9133858 , 0.00198671, 0.0333592 , 0.00411558, 0.71037763,\n",
+       "        0.00563182]], dtype=float32)"
+      ]
+     },
+     "execution_count": 75,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "c189f6c9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n",
+       "       'identity_hate'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 72,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.columns[2:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "8c1fbac0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "translator_hindi = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-hi-en\", tokenizer=\"Helsinki-NLP/opus-mt-hi-en\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "c8db9d6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hindi_text = \"नमस्ते, आप कैसे हैं?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "9c95d205",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Hello, how are you?'"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "en_to_hin = translator_hindi(hindi_text)\n",
+    "en_to_hin[0]['translation_text']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "3d25803f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def translate_hindi(from_text):\n",
+    "    \"\"\"Run `from_text` through `translator_hindi` and return the first translation string.\"\"\"\n",
+    "    translations = translator_hindi(from_text)\n",
+    "    first_translation = translations[0]\n",
+    "    return first_translation['translation_text']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "id": "52108859",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Hello, how are you?'"
+      ]
+     },
+     "execution_count": 133,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "translate_hindi('नमस्ते, आप कैसे हैं?')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "id": "837c3093",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def score_comment(comment):\n",
+    "    \"\"\"Score one comment and return a `label: flag` line per label column.\n",
+    "\n",
+    "    The comment is vectorized as a one-item batch and passed to `model.predict`.\n",
+    "    Each label name comes from `data.columns[2:]`; its flag is whether the\n",
+    "    matching score in the first prediction row is greater than 0.5.\n",
+    "    \"\"\"\n",
+    "    batch = vectorizer([comment])\n",
+    "    predictions = model.predict(batch)\n",
+    "    label_names = data.columns[2:]\n",
+    "    verdict_lines = [\n",
+    "        '{}: {}\\n'.format(label, predictions[0][pos] > 0.5)\n",
+    "        for pos, label in enumerate(label_names)\n",
+    "    ]\n",
+    "    return ''.join(verdict_lines)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 163,
+   "id": "21ea015f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def combined_models(input):\n",
+    "    \"\"\"Translate `input`, then score the translated text.\n",
+    "\n",
+    "    Returns a `(translation, scores)` tuple: the string from `translate_hindi`\n",
+    "    and the per-label report string from `score_comment`.\n",
+    "    \"\"\"\n",
+    "    output1 = translate_hindi(input)\n",
+    "    # Score the translation rather than the raw input, so the classifier sees the\n",
+    "    # text produced by the translation step (presumably it expects English - TODO confirm).\n",
+    "    output2 = score_comment(output1)\n",
+    "    return output1, output2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 166,
+   "id": "ca5d14a9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 0s 109ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "interface = gr.Interface(fn=combined_models, inputs=\"text\", outputs=[\"text\",\"text\"],title=\"Toxic Comment Analyzer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 168,
+   "id": "cb485bb9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7871\n",
+      "Running on public URL: https://27f88e54e3177749fa.gradio.live\n",
+      "\n",
+      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"https://27f88e54e3177749fa.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 168,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1/1 [==============================] - 0s 426ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "interface.launch(share=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e30aa7aa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}