ksvmuralidhar commited on
Commit
7ddd561
1 Parent(s): 86866f1

Upload Jupyter notebook

Browse files
Files changed (1) hide show
  1. toxic_comment_classifier.ipynb +1 -0
toxic_comment_classifier.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7296872,"sourceType":"datasetVersion","datasetId":4232592},{"sourceId":7296911,"sourceType":"datasetVersion","datasetId":4232619}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# !pip install tensorflow==2.10","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport string\nfrom unidecode import unidecode\nimport tensorflow as tf \nfrom sklearn.utils import class_weight\nfrom tensorflow.keras.utils import to_categorical\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nimport cloudpickle\nimport os\nfrom transformers import DistilBertTokenizerFast\nfrom transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.callbacks import EarlyStopping","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TextPreprocessor:\n def __init__(self, remove_punct: bool = True, remove_digits: bool = True,\n remove_stop_words: bool = True,\n remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,\n bottom_p: float = None):\n self.remove_punct = remove_punct\n self.remove_digits = remove_digits\n self.remove_stop_words = remove_stop_words\n self.remove_short_words = remove_short_words\n self.minlen = minlen\n self.maxlen = maxlen\n self.top_p = top_p\n self.bottom_p = bottom_p\n self.words_to_remove = []\n self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',\n 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',\n 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',\n 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',\n 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',\n 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',\n 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',\n 'into', 'through', 'during', 'before', 'after', 'to', 'from',\n 'in', 'out', 'on', 'off', 'further', 'then', 'once',\n 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',\n 'other', 'such', 'own', 'same', 'so', 'than', 'can', 'will', 'should','now']\n\n self.contraction_to_expansion = {\"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"cannot\",\n \"can't've\": \"cannot have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it had\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so is\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there had\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we had\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'alls\": \"you alls\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you had\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you you will\",\n \"you'll've\": \"you you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n @staticmethod\n def __remove_double_whitespaces(string: str):\n return \" \".join(string.split())\n\n def __remove_url(self, string_series: pd.Series):\n \"\"\"\n Removes URLs m text\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(\n pat=r\"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})\",\n repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __expand(self, string_series: pd.Series):\n \"\"\"\n Replaces contractions with expansions. eg. don't wit do not.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n for c, e in self.contraction_to_expansion.items():\n clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_punct(self, string_series: pd.Series):\n \"\"\"\n Removes punctuations from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n puncts = [r'\\n', r'\\r', r'\\t']\n puncts.extend(list(string.punctuation))\n for i in puncts:\n clean_string_series = clean_string_series.str.replace(pat=i, repl=\" \", regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_digits(self, string_series: pd.Series):\n \"\"\"\n Removes digits from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(pat=r'\\d', repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n @staticmethod\n def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):\n \"\"\"\n Reomves words/tokens where minlen <= len <= maxlen.\n :param string_series: pd.Series, input string series\n :param minlen: int, minimum length of token to be removed.\n :param maxlen: int, maximum length of token to be removed.\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split() if\n (len(word) > maxlen) or (len(word) < minlen)]))\n return clean_string_series\n\n def __remove_stop_words(self, string_series: pd.Series):\n \"\"\"\n Removes stop words from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n def str_remove_stop_words(string: str):\n stops = self.stop_words\n return \" \".join([token for token in string.split() if token not in stops])\n\n return string_series.map(str_remove_stop_words)\n\n def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,\n bottom_p: int = None, dataset: str = 'train'):\n \"\"\"\n Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.\n :param string_series: pd.Series, input string series\n :param top_p: float, percent of frequent words to remove.\n :param bottom_p: float, percent of rare words to remove.\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n if dataset == 'train':\n if top_p is None:\n top_p = 0\n if bottom_p is None:\n bottom_p = 0\n\n if top_p > 0 or bottom_p > 0:\n word_freq = pd.Series(\" \".join(string_series).split()).value_counts()\n n_words = len(word_freq)\n\n if top_p > 0:\n self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])\n\n if bottom_p > 0:\n self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])\n\n if len(self.words_to_remove) == 0:\n return string_series\n else:\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split()\n if word not in self.words_to_remove]))\n return clean_string_series\n\n def preprocess(self, string_series: pd.Series, dataset: str = \"train\"):\n \"\"\"\n Entry point.\n :param string_series: pd.Series, input string series\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n string_series = string_series.str.lower()\n string_series = string_series.map(unidecode)\n string_series = self.__remove_url(string_series=string_series)\n string_series = self.__expand(string_series=string_series)\n\n if self.remove_punct:\n string_series = self.__remove_punct(string_series=string_series)\n if self.remove_digits:\n string_series = self.__remove_digits(string_series=string_series)\n if self.remove_stop_words:\n string_series = self.__remove_stop_words(string_series=string_series)\n if self.remove_short_words:\n string_series = self.__remove_short_words(string_series=string_series,\n minlen=self.minlen,\n maxlen=self.maxlen)\n string_series = self.__remove_top_bottom_words(string_series=string_series,\n top_p=self.top_p,\n bottom_p=self.bottom_p, dataset=dataset)\n\n string_series = string_series.str.strip()\n string_series.replace(to_replace=\"\", value=\"this is an empty message\", inplace=True)\n\n return string_series","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.635659Z","iopub.execute_input":"2023-12-31T07:30:59.636337Z","iopub.status.idle":"2023-12-31T07:30:59.678677Z","shell.execute_reply.started":"2023-12-31T07:30:59.636307Z","shell.execute_reply":"2023-12-31T07:30:59.677974Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"data = pd.read_csv('train.csv')","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.680601Z","iopub.execute_input":"2023-12-31T07:30:59.680867Z","iopub.status.idle":"2023-12-31T07:31:01.412865Z","shell.execute_reply.started":"2023-12-31T07:30:59.680843Z","shell.execute_reply":"2023-12-31T07:31:01.412060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.413911Z","iopub.execute_input":"2023-12-31T07:31:01.414204Z","iopub.status.idle":"2023-12-31T07:31:01.440183Z","shell.execute_reply.started":"2023-12-31T07:31:01.414177Z","shell.execute_reply":"2023-12-31T07:31:01.439351Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" id comment_text \\\n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n... ... ... \n159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n\n toxic severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 \n2 0 0 0 0 0 0 \n3 0 0 0 0 0 0 \n4 0 0 0 0 0 0 \n... ... ... ... ... ... ... \n159566 0 0 0 0 0 0 \n159567 0 0 0 0 0 0 \n159568 0 0 0 0 0 0 \n159569 0 0 0 0 0 0 \n159570 0 0 0 0 0 0 \n\n[159571 rows x 8 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>159566</th>\n <td>ffe987279560d7ff</td>\n <td>\":::::And for the second time of asking, when ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159567</th>\n <td>ffea4adeee384e90</td>\n <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159568</th>\n <td>ffee36eab5c267c9</td>\n <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159569</th>\n <td>fff125370e4aaaf3</td>\n <td>And it looks like it was actually you who put ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159570</th>\n <td>fff46fc426af1f9a</td>\n <td>\"\\nAnd ... I really don't think you understand...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>159571 rows × 8 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data.drop(columns='id', inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.441248Z","iopub.execute_input":"2023-12-31T07:31:01.441559Z","iopub.status.idle":"2023-12-31T07:31:01.471169Z","shell.execute_reply.started":"2023-12-31T07:31:01.441534Z","shell.execute_reply":"2023-12-31T07:31:01.470315Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.472666Z","iopub.execute_input":"2023-12-31T07:31:01.473306Z","iopub.status.idle":"2023-12-31T07:31:01.498256Z","shell.execute_reply.started":"2023-12-31T07:31:01.473272Z","shell.execute_reply":"2023-12-31T07:31:01.497046Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" comment_text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>159566</th>\n <td>\":::::And for the second time of asking, when ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159567</th>\n <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159568</th>\n <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159569</th>\n <td>And it looks like it was actually you who put ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159570</th>\n <td>\"\\nAnd ... I really don't think you understand...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>159571 rows × 7 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data.rename(columns={'comment_text': 'text'}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.499556Z","iopub.execute_input":"2023-12-31T07:31:01.500387Z","iopub.status.idle":"2023-12-31T07:31:01.548770Z","shell.execute_reply.started":"2023-12-31T07:31:01.500353Z","shell.execute_reply":"2023-12-31T07:31:01.547759Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"data.shape","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.550009Z","iopub.execute_input":"2023-12-31T07:31:01.550326Z","iopub.status.idle":"2023-12-31T07:31:01.596006Z","shell.execute_reply.started":"2023-12-31T07:31:01.550298Z","shell.execute_reply":"2023-12-31T07:31:01.594926Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(159571, 7)"},"metadata":{}}]},{"cell_type":"code","source":"data.dtypes","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.599806Z","iopub.execute_input":"2023-12-31T07:31:01.600108Z","iopub.status.idle":"2023-12-31T07:31:01.646405Z","shell.execute_reply.started":"2023-12-31T07:31:01.600085Z","shell.execute_reply":"2023-12-31T07:31:01.645539Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"text object\ntoxic int64\nsevere_toxic int64\nobscene int64\nthreat int64\ninsult int64\nidentity_hate int64\ndtype: object"},"metadata":{}}]},{"cell_type":"code","source":"# data.drop(columns='categories', inplace=True)\ndata.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.647555Z","iopub.execute_input":"2023-12-31T07:31:01.647856Z","iopub.status.idle":"2023-12-31T07:31:01.737114Z","shell.execute_reply.started":"2023-12-31T07:31:01.647831Z","shell.execute_reply":"2023-12-31T07:31:01.736103Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.739003Z","iopub.execute_input":"2023-12-31T07:31:01.739433Z","iopub.status.idle":"2023-12-31T07:31:01.854310Z","shell.execute_reply.started":"2023-12-31T07:31:01.739398Z","shell.execute_reply":"2023-12-31T07:31:01.853223Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>D'aww! He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>You, sir, are my hero. Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>159566</th>\n <td>\":::::And for the second time of asking, when ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159567</th>\n <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159568</th>\n <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159569</th>\n <td>And it looks like it was actually you who put ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>159570</th>\n <td>\"\\nAnd ... I really don't think you understand...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>159571 rows × 7 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.855770Z","iopub.execute_input":"2023-12-31T07:31:01.856752Z","iopub.status.idle":"2023-12-31T07:31:01.910184Z","shell.execute_reply.started":"2023-12-31T07:31:01.856715Z","shell.execute_reply":"2023-12-31T07:31:01.909374Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"\"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\""},"metadata":{}}]},{"cell_type":"code","source":"CLASS_NAMES = [*data.columns][1:]\nprint(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.911346Z","iopub.execute_input":"2023-12-31T07:31:01.911647Z","iopub.status.idle":"2023-12-31T07:31:01.921374Z","shell.execute_reply.started":"2023-12-31T07:31:01.911622Z","shell.execute_reply":"2023-12-31T07:31:01.920546Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n","output_type":"stream"}]},{"cell_type":"code","source":"tp = TextPreprocessor()\ndata['text'] = tp.preprocess(data['text'])","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.922648Z","iopub.execute_input":"2023-12-31T07:31:01.923004Z","iopub.status.idle":"2023-12-31T07:31:53.140912Z","shell.execute_reply.started":"2023-12-31T07:31:01.922973Z","shell.execute_reply":"2023-12-31T07:31:53.140089Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.142143Z","iopub.execute_input":"2023-12-31T07:31:53.142437Z","iopub.status.idle":"2023-12-31T07:31:53.148219Z","shell.execute_reply.started":"2023-12-31T07:31:53.142410Z","shell.execute_reply":"2023-12-31T07:31:53.147230Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"'hey man really not trying edit war just guy constantly removing relevant information talking edits instead talk page seems care more formatting actual info'"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"wb\") as model_file_obj:\n cloudpickle.dump((tp, CLASS_NAMES), model_file_obj)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.149417Z","iopub.execute_input":"2023-12-31T07:31:53.149706Z","iopub.status.idle":"2023-12-31T07:31:53.170819Z","shell.execute_reply.started":"2023-12-31T07:31:53.149682Z","shell.execute_reply":"2023-12-31T07:31:53.169945Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"x = data['text']\ny = data.drop(columns='text').values.copy()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.172088Z","iopub.execute_input":"2023-12-31T07:31:53.172429Z","iopub.status.idle":"2023-12-31T07:31:53.186905Z","shell.execute_reply.started":"2023-12-31T07:31:53.172396Z","shell.execute_reply":"2023-12-31T07:31:53.186117Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"x","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.188005Z","iopub.execute_input":"2023-12-31T07:31:53.188275Z","iopub.status.idle":"2023-12-31T07:31:53.198595Z","shell.execute_reply.started":"2023-12-31T07:31:53.188251Z","shell.execute_reply":"2023-12-31T07:31:53.197549Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"0 explanation edits made under username hardcore...\n1 aww matches background colour seemingly stuck ...\n2 hey man really not trying edit war just guy co...\n3 more cannot make real suggestions improvement ...\n4 sir hero chance remember page\n ... \n159566 second time asking view completely contradicts...\n159567 ashamed horrible thing put talk page\n159568 spitzer umm theres no actual article prostitut...\n159569 looks like actually put speedy first version d...\n159570 really not think understand came idea bad righ...\nName: text, Length: 159571, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"y","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.199909Z","iopub.execute_input":"2023-12-31T07:31:53.200274Z","iopub.status.idle":"2023-12-31T07:31:53.212733Z","shell.execute_reply.started":"2023-12-31T07:31:53.200239Z","shell.execute_reply":"2023-12-31T07:31:53.211770Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n ...,\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.213823Z","iopub.execute_input":"2023-12-31T07:31:53.214062Z","iopub.status.idle":"2023-12-31T07:31:53.249561Z","shell.execute_reply.started":"2023-12-31T07:31:53.214040Z","shell.execute_reply":"2023-12-31T07:31:53.248733Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"x_train.shape, x_test.shape, y_train.shape, y_test.shape","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.250622Z","iopub.execute_input":"2023-12-31T07:31:53.250874Z","iopub.status.idle":"2023-12-31T07:31:53.287390Z","shell.execute_reply.started":"2023-12-31T07:31:53.250851Z","shell.execute_reply":"2023-12-31T07:31:53.286544Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"((127656,), (31915,), (127656, 6), (31915, 6))"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test = x_train.to_list(), x_test.to_list()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.288810Z","iopub.execute_input":"2023-12-31T07:31:53.289151Z","iopub.status.idle":"2023-12-31T07:31:53.307507Z","shell.execute_reply.started":"2023-12-31T07:31:53.289119Z","shell.execute_reply":"2023-12-31T07:31:53.306916Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"from transformers import DistilBertTokenizerFast","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.308641Z","iopub.execute_input":"2023-12-31T07:31:53.308902Z","iopub.status.idle":"2023-12-31T07:31:53.319637Z","shell.execute_reply.started":"2023-12-31T07:31:53.308879Z","shell.execute_reply":"2023-12-31T07:31:53.318812Z"},"trusted":true},"execution_count":24,"outputs":[]},{"cell_type":"code","source":"model_checkpoint = \"distilbert-base-uncased\"\ntokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(x_train[0])\nprint(tokenizer.tokenize(x_train[0]))\nprint(tokenizer(x_train[0]))","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:54.158520Z","iopub.execute_input":"2023-12-31T07:31:54.158802Z","iopub.status.idle":"2023-12-31T07:31:54.170494Z","shell.execute_reply.started":"2023-12-31T07:31:54.158777Z","shell.execute_reply":"2023-12-31T07:31:54.169665Z"},"trusted":true},"execution_count":26,"outputs":[{"name":"stdout","text":"grandma terri burn trash grandma terri trash hate grandma terri hell\n['grandma', 'terri', 'burn', 'trash', 'grandma', 'terri', 'trash', 'hate', 'grandma', 'terri', 'hell']\n{'input_ids': [101, 13055, 26568, 6402, 11669, 13055, 26568, 11669, 5223, 13055, 26568, 3109, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n","output_type":"stream"}]},{"cell_type":"code","source":"strategy = tf.distribute.MirroredStrategy()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:54.171594Z","iopub.execute_input":"2023-12-31T07:31:54.171911Z","iopub.status.idle":"2023-12-31T07:32:02.081175Z","shell.execute_reply.started":"2023-12-31T07:31:54.171887Z","shell.execute_reply":"2023-12-31T07:32:02.080424Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"BATCH_SIZE = 16 * strategy.num_replicas_in_sync\nN_TOKENS = 512\nN_CLASSES = len(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:02.087842Z","iopub.execute_input":"2023-12-31T07:32:02.088137Z","iopub.status.idle":"2023-12-31T07:32:02.092108Z","shell.execute_reply.started":"2023-12-31T07:32:02.088110Z","shell.execute_reply":"2023-12-31T07:32:02.091404Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"train_tokens = tokenizer(x_train, max_length=N_TOKENS, padding=\"max_length\", truncation=True, return_tensors=\"tf\", return_attention_mask=True, return_token_type_ids=False)\ntest_tokens = tokenizer(x_test, max_length=N_TOKENS, padding=\"max_length\", truncation=True, return_tensors=\"tf\", return_attention_mask=True, return_token_type_ids=False)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:02.093570Z","iopub.execute_input":"2023-12-31T07:32:02.094195Z","iopub.status.idle":"2023-12-31T07:32:30.386628Z","shell.execute_reply.started":"2023-12-31T07:32:02.094161Z","shell.execute_reply":"2023-12-31T07:32:30.385835Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"train_tokens[:5]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.390616Z","iopub.execute_input":"2023-12-31T07:32:30.390916Z","iopub.status.idle":"2023-12-31T07:32:30.396763Z","shell.execute_reply.started":"2023-12-31T07:32:30.390891Z","shell.execute_reply":"2023-12-31T07:32:30.395947Z"},"trusted":true},"execution_count":30,"outputs":[{"execution_count":30,"output_type":"execute_result","data":{"text/plain":"[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]"},"metadata":{}}]},{"cell_type":"code","source":"sample_weight_param = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)\nsample_weight_param","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.397786Z","iopub.execute_input":"2023-12-31T07:32:30.398043Z","iopub.status.idle":"2023-12-31T07:32:30.588369Z","shell.execute_reply.started":"2023-12-31T07:32:30.398021Z","shell.execute_reply":"2023-12-31T07:32:30.587462Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"array([0.18495376, 0.01961102, 0.01961102, ..., 0.18495376, 0.01961102,\n 0.01961102])"},"metadata":{}}]},{"cell_type":"code","source":"len(sample_weight_param)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.589627Z","iopub.execute_input":"2023-12-31T07:32:30.589973Z","iopub.status.idle":"2023-12-31T07:32:30.595964Z","shell.execute_reply.started":"2023-12-31T07:32:30.589946Z","shell.execute_reply":"2023-12-31T07:32:30.594963Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":"127656"},"metadata":{}}]},{"cell_type":"code","source":"# train_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), y_train, sample_weight_param))\ntrain_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), y_train))\ntest_tf_data = tf.data.Dataset.from_tensor_slices((dict(test_tokens), y_test))","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.597196Z","iopub.execute_input":"2023-12-31T07:32:30.597885Z","iopub.status.idle":"2023-12-31T07:32:30.619144Z","shell.execute_reply.started":"2023-12-31T07:32:30.597852Z","shell.execute_reply":"2023-12-31T07:32:30.618351Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"del(data)\ndel(train_tokens)\ndel(test_tokens)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.620239Z","iopub.execute_input":"2023-12-31T07:32:30.620611Z","iopub.status.idle":"2023-12-31T07:32:34.275696Z","shell.execute_reply.started":"2023-12-31T07:32:30.620583Z","shell.execute_reply":"2023-12-31T07:32:34.274852Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"train_tf_data=train_tf_data.prefetch(tf.data.AUTOTUNE)\ntest_tf_data=test_tf_data.prefetch(tf.data.AUTOTUNE)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.276970Z","iopub.execute_input":"2023-12-31T07:32:34.277317Z","iopub.status.idle":"2023-12-31T07:32:34.289536Z","shell.execute_reply.started":"2023-12-31T07:32:34.277290Z","shell.execute_reply":"2023-12-31T07:32:34.288697Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"for i in train_tf_data.take(1):\n print(i)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.290687Z","iopub.execute_input":"2023-12-31T07:32:34.290966Z","iopub.status.idle":"2023-12-31T07:32:34.387248Z","shell.execute_reply.started":"2023-12-31T07:32:34.290942Z","shell.execute_reply":"2023-12-31T07:32:34.386228Z"},"trusted":true},"execution_count":36,"outputs":[{"name":"stdout","text":"({'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=\narray([ 101, 13055, 26568, 6402, 11669, 13055, 26568, 11669, 5223,\n 13055, 26568, 3109, 102, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0],\n dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(512,), dtype=int32, numpy=\narray([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0], dtype=int32)>}, <tf.Tensor: shape=(6,), dtype=int64, numpy=array([1, 0, 0, 0, 0, 0])>)\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.388735Z","iopub.execute_input":"2023-12-31T07:32:34.389134Z","iopub.status.idle":"2023-12-31T07:32:34.394189Z","shell.execute_reply.started":"2023-12-31T07:32:34.389098Z","shell.execute_reply":"2023-12-31T07:32:34.393217Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"config = DistilBertConfig.from_pretrained(model_checkpoint, output_hidden_states=False)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.395453Z","iopub.execute_input":"2023-12-31T07:32:34.395832Z","iopub.status.idle":"2023-12-31T07:32:34.465568Z","shell.execute_reply.started":"2023-12-31T07:32:34.395785Z","shell.execute_reply":"2023-12-31T07:32:34.464698Z"},"trusted":true},"execution_count":38,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.optimizers.schedules import PolynomialDecay\nwith strategy.scope():\n model = TFDistilBertModel.from_pretrained(model_checkpoint, config=config)\n learning_schedule = PolynomialDecay(initial_learning_rate=2e-5, decay_steps=len(train_tf_data) * 10, end_learning_rate=0)\n input_ids = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"input_ids\")\n attention_mask = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"attention_mask\")\n x = model([input_ids, attention_mask])[0][:,0,:] # [CLS] token of last hidden state\n x = Dropout(0.3)(x)\n x = BatchNormalization()(x)\n x = Dense(1024, activation=\"relu\")(x)\n x = Dropout(0.3)(x)\n x = BatchNormalization()(x)\n x = Dense(512, activation=\"relu\")(x)\n x = Dropout(0.3)(x)\n x = BatchNormalization()(x)\n output = Dense(N_CLASSES, activation=\"sigmoid\", name=\"output\")(x)\n model = tf.keras.Model(inputs=[input_ids, attention_mask],outputs=output)\n metric = [tf.keras.metrics.AUC(multi_label=True, num_labels=N_CLASSES)]\n model.compile(optimizer=tf.keras.optimizers.Adam(learning_schedule), metrics=metric, loss=tf.keras.losses.BinaryCrossentropy())","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.466800Z","iopub.execute_input":"2023-12-31T07:32:34.467131Z","iopub.status.idle":"2023-12-31T07:32:45.278768Z","shell.execute_reply.started":"2023-12-31T07:32:34.467100Z","shell.execute_reply":"2023-12-31T07:32:45.277811Z"},"trusted":true},"execution_count":39,"outputs":[{"output_type":"display_data","data":{"text/plain":"model.safetensors: 0%| | 0.00/268M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e5f53f73c4884c0bbe76f9739b7c82ba"}},"metadata":{}},{"name":"stderr","text":"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']\n- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\nAll the weights of TFDistilBertModel were initialized from the PyTorch model.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:45.279918Z","iopub.execute_input":"2023-12-31T07:32:45.280169Z","iopub.status.idle":"2023-12-31T07:32:45.332178Z","shell.execute_reply.started":"2023-12-31T07:32:45.280146Z","shell.execute_reply":"2023-12-31T07:32:45.331301Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 512)] 0 [] \n \n attention_mask (InputLayer) [(None, 512)] 0 [] \n \n tf_distil_bert_model (TFDistil TFBaseModelOutput(l 66362880 ['input_ids[0][0]', \n BertModel) ast_hidden_state=(N 'attention_mask[0][0]'] \n one, 512, 768), \n hidden_states=None \n , attentions=None) \n \n tf.__operators__.getitem (Slic (None, 768) 0 ['tf_distil_bert_model[0][0]'] \n ingOpLambda) \n \n dropout_19 (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0]'\n ] \n \n batch_normalization (BatchNorm (None, 768) 3072 ['dropout_19[0][0]'] \n alization) \n \n dense (Dense) (None, 1024) 787456 ['batch_normalization[0][0]'] \n \n dropout_20 (Dropout) (None, 1024) 0 ['dense[0][0]'] \n \n batch_normalization_1 (BatchNo (None, 1024) 4096 ['dropout_20[0][0]'] \n rmalization) \n \n dense_1 (Dense) (None, 512) 524800 ['batch_normalization_1[0][0]'] \n \n dropout_21 (Dropout) (None, 512) 0 ['dense_1[0][0]'] \n \n batch_normalization_2 (BatchNo (None, 512) 2048 ['dropout_21[0][0]'] \n rmalization) \n \n output (Dense) (None, 6) 3078 ['batch_normalization_2[0][0]'] \n \n==================================================================================================\nTotal params: 67,687,430\nTrainable params: 67,682,822\nNon-trainable params: 4,608\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"from tensorflow.keras.callbacks import EarlyStopping\nearly_stop = EarlyStopping(monitor=\"val_loss\",patience=1,mode=\"min\")","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:45.333227Z","iopub.execute_input":"2023-12-31T07:32:45.333520Z","iopub.status.idle":"2023-12-31T07:32:45.337904Z","shell.execute_reply.started":"2023-12-31T07:32:45.333470Z","shell.execute_reply":"2023-12-31T07:32:45.337070Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"model.fit(train_tf_data.shuffle(len(train_tf_data)).batch(BATCH_SIZE), validation_data=test_tf_data.shuffle(len(test_tf_data)).batch(BATCH_SIZE), \n epochs=10, callbacks=[early_stop])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.save(\"toxic_comment_classifier_hf_distilbert.h5\")","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:52:06.891428Z","iopub.execute_input":"2023-12-31T11:52:06.892293Z","iopub.status.idle":"2023-12-31T11:52:08.415774Z","shell.execute_reply.started":"2023-12-31T11:52:06.892257Z","shell.execute_reply":"2023-12-31T11:52:08.414793Z"},"trusted":true},"execution_count":43,"outputs":[]},{"cell_type":"code","source":"tf_model = tf.keras.models.load_model('toxic_comment_classifier_hf_distilbert.h5', custom_objects={\"TFDistilBertModel\": TFDistilBertModel})","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:52:11.625873Z","iopub.execute_input":"2023-12-31T11:52:11.626691Z","iopub.status.idle":"2023-12-31T11:52:15.566176Z","shell.execute_reply.started":"2023-12-31T11:52:11.626657Z","shell.execute_reply":"2023-12-31T11:52:15.565369Z"},"trusted":true},"execution_count":44,"outputs":[]},{"cell_type":"code","source":"import pathlib\nconverter = tf.lite.TFLiteConverter.from_keras_model(tf_model)\nconverter.optimizations = [tf.lite.Optimize.DEFAULT]\ntflite_model = converter.convert()\n\ntflite_models_dir = pathlib.Path(os.path.join(\"tflite_models\"))\ntflite_models_dir.mkdir(exist_ok=True, parents=True)\ntflite_model_file = tflite_models_dir/\"toxic_comment_classifier_hf_distilbert.tflite\"\ntflite_model_file.write_bytes(tflite_model)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:52:18.659943Z","iopub.execute_input":"2023-12-31T11:52:18.660325Z","iopub.status.idle":"2023-12-31T11:53:23.173578Z","shell.execute_reply.started":"2023-12-31T11:52:18.660286Z","shell.execute_reply":"2023-12-31T11:53:23.172498Z"},"trusted":true},"execution_count":45,"outputs":[{"execution_count":45,"output_type":"execute_result","data":{"text/plain":"69201064"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"rb\") as model_file_obj:\n text_preprocessor, class_names = cloudpickle.load(model_file_obj)\n \ninterpreter = tf.lite.Interpreter(model_path=os.path.join(\"tflite_models\", \"toxic_comment_classifier_hf_distilbert.tflite\"))\n","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:53:36.669385Z","iopub.execute_input":"2023-12-31T11:53:36.670249Z","iopub.status.idle":"2023-12-31T11:53:36.677289Z","shell.execute_reply.started":"2023-12-31T11:53:36.670213Z","shell.execute_reply":"2023-12-31T11:53:36.676443Z"},"trusted":true},"execution_count":46,"outputs":[]},{"cell_type":"code","source":"def inference(text):\n text = text_preprocessor.preprocess(pd.Series(text))[0]\n \n model_checkpoint = \"distilbert-base-uncased\"\n tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)\n tokens = tokenizer(text, max_length=512, padding=\"max_length\", truncation=True, return_tensors=\"tf\")\n \n # tflite model inference \n interpreter.allocate_tensors()\n input_details = interpreter.get_input_details()\n output_details = interpreter.get_output_details()[0]\n attention_mask, input_ids = tokens['attention_mask'], tokens['input_ids']\n interpreter.set_tensor(input_details[0][\"index\"], attention_mask)\n interpreter.set_tensor(input_details[1][\"index\"], input_ids)\n interpreter.invoke()\n tflite_pred = interpreter.get_tensor(output_details[\"index\"])[0]\n result_df = pd.DataFrame({'class': class_names, 'prob': tflite_pred})\n result_df.sort_values(by='prob', ascending=True, inplace=True)\n return result_df","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:53:39.408810Z","iopub.execute_input":"2023-12-31T11:53:39.409545Z","iopub.status.idle":"2023-12-31T11:53:39.417228Z","shell.execute_reply.started":"2023-12-31T11:53:39.409511Z","shell.execute_reply":"2023-12-31T11:53:39.416216Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"inference(\"Hello!! How are you?\")","metadata":{"execution":{"iopub.status.busy":"2023-12-31T11:55:34.056030Z","iopub.execute_input":"2023-12-31T11:55:34.056848Z","iopub.status.idle":"2023-12-31T11:55:36.356401Z","shell.execute_reply.started":"2023-12-31T11:55:34.056803Z","shell.execute_reply":"2023-12-31T11:55:36.355463Z"},"trusted":true},"execution_count":49,"outputs":[{"execution_count":49,"output_type":"execute_result","data":{"text/plain":" class prob\n3 threat 0.000621\n1 severe_toxic 0.000848\n5 identity_hate 0.000876\n2 obscene 0.001126\n4 insult 0.001540\n0 toxic 0.002890","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>class</th>\n <th>prob</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>3</th>\n <td>threat</td>\n <td>0.000621</td>\n </tr>\n <tr>\n <th>1</th>\n <td>severe_toxic</td>\n <td>0.000848</td>\n </tr>\n <tr>\n <th>5</th>\n <td>identity_hate</td>\n <td>0.000876</td>\n </tr>\n <tr>\n <th>2</th>\n <td>obscene</td>\n <td>0.001126</td>\n </tr>\n <tr>\n <th>4</th>\n <td>insult</td>\n <td>0.001540</td>\n </tr>\n <tr>\n <th>0</th>\n <td>toxic</td>\n <td>0.002890</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}