{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7296872,"sourceType":"datasetVersion","datasetId":4232592},{"sourceId":7296911,"sourceType":"datasetVersion","datasetId":4232619}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# !pip install tensorflow==2.10","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport string\nfrom unidecode import unidecode\nimport tensorflow as tf \nfrom sklearn.utils import class_weight\nfrom tensorflow.keras.utils import to_categorical\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nimport cloudpickle\nimport os\nfrom transformers import DistilBertTokenizerFast\nfrom transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.callbacks import EarlyStopping","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TextPreprocessor:\n def __init__(self, remove_punct: bool = True, remove_digits: bool = True,\n remove_stop_words: bool = True,\n remove_short_words: bool = True, minlen: int = 1, maxlen: 
int = 1, top_p: float = None,\n bottom_p: float = None):\n self.remove_punct = remove_punct\n self.remove_digits = remove_digits\n self.remove_stop_words = remove_stop_words\n self.remove_short_words = remove_short_words\n self.minlen = minlen\n self.maxlen = maxlen\n self.top_p = top_p\n self.bottom_p = bottom_p\n self.words_to_remove = []\n self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',\n 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',\n 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',\n 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',\n 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',\n 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',\n 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',\n 'into', 'through', 'during', 'before', 'after', 'to', 'from',\n 'in', 'out', 'on', 'off', 'further', 'then', 'once',\n 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',\n 'other', 'such', 'own', 'same', 'so', 'than', 'can', 'will', 'should','now']\n\n self.contraction_to_expansion = {\"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"cannot\",\n \"can't've\": \"cannot have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i 
will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it had\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so is\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there had\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we had\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": 
\"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'alls\": \"you alls\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you had\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you you will\",\n \"you'll've\": \"you you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n @staticmethod\n def __remove_double_whitespaces(string: str):\n return \" \".join(string.split())\n\n def __remove_url(self, string_series: pd.Series):\n \"\"\"\n Removes URLs m text\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(\n pat=r\"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})\",\n repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __expand(self, string_series: pd.Series):\n \"\"\"\n Replaces contractions with expansions. eg. 
don't wit do not.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n for c, e in self.contraction_to_expansion.items():\n clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_punct(self, string_series: pd.Series):\n \"\"\"\n Removes punctuations from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n puncts = [r'\\n', r'\\r', r'\\t']\n puncts.extend(list(string.punctuation))\n for i in puncts:\n clean_string_series = clean_string_series.str.replace(pat=i, repl=\" \", regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_digits(self, string_series: pd.Series):\n \"\"\"\n Removes digits from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(pat=r'\\d', repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n @staticmethod\n def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):\n \"\"\"\n Reomves words/tokens where minlen <= len <= maxlen.\n :param string_series: pd.Series, input string series\n :param minlen: int, minimum length of token to be removed.\n :param maxlen: int, maximum length of token to be removed.\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split() if\n (len(word) > maxlen) or (len(word) < minlen)]))\n return clean_string_series\n\n def __remove_stop_words(self, string_series: pd.Series):\n \"\"\"\n Removes stop words from the input string.\n :param string_series: pd.Series, input 
string series\n :return: pd.Series, cleaned string series\n \"\"\"\n def str_remove_stop_words(string: str):\n stops = self.stop_words\n return \" \".join([token for token in string.split() if token not in stops])\n\n return string_series.map(str_remove_stop_words)\n\n def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,\n bottom_p: int = None, dataset: str = 'train'):\n \"\"\"\n Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.\n :param string_series: pd.Series, input string series\n :param top_p: float, percent of frequent words to remove.\n :param bottom_p: float, percent of rare words to remove.\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n if dataset == 'train':\n if top_p is None:\n top_p = 0\n if bottom_p is None:\n bottom_p = 0\n\n if top_p > 0 or bottom_p > 0:\n word_freq = pd.Series(\" \".join(string_series).split()).value_counts()\n n_words = len(word_freq)\n\n if top_p > 0:\n self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])\n\n if bottom_p > 0:\n self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])\n\n if len(self.words_to_remove) == 0:\n return string_series\n else:\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split()\n if word not in self.words_to_remove]))\n return clean_string_series\n\n def preprocess(self, string_series: pd.Series, dataset: str = \"train\"):\n \"\"\"\n Entry point.\n :param string_series: pd.Series, input string series\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n string_series = string_series.str.lower()\n string_series = string_series.map(unidecode)\n string_series = self.__remove_url(string_series=string_series)\n string_series = self.__expand(string_series=string_series)\n\n if 
self.remove_punct:\n string_series = self.__remove_punct(string_series=string_series)\n if self.remove_digits:\n string_series = self.__remove_digits(string_series=string_series)\n if self.remove_stop_words:\n string_series = self.__remove_stop_words(string_series=string_series)\n if self.remove_short_words:\n string_series = self.__remove_short_words(string_series=string_series,\n minlen=self.minlen,\n maxlen=self.maxlen)\n string_series = self.__remove_top_bottom_words(string_series=string_series,\n top_p=self.top_p,\n bottom_p=self.bottom_p, dataset=dataset)\n\n string_series = string_series.str.strip()\n string_series.replace(to_replace=\"\", value=\"this is an empty message\", inplace=True)\n\n return string_series","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.635659Z","iopub.execute_input":"2023-12-31T07:30:59.636337Z","iopub.status.idle":"2023-12-31T07:30:59.678677Z","shell.execute_reply.started":"2023-12-31T07:30:59.636307Z","shell.execute_reply":"2023-12-31T07:30:59.677974Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"data = pd.read_csv('train.csv')","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.680601Z","iopub.execute_input":"2023-12-31T07:30:59.680867Z","iopub.status.idle":"2023-12-31T07:31:01.412865Z","shell.execute_reply.started":"2023-12-31T07:30:59.680843Z","shell.execute_reply":"2023-12-31T07:31:01.412060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.413911Z","iopub.execute_input":"2023-12-31T07:31:01.414204Z","iopub.status.idle":"2023-12-31T07:31:01.440183Z","shell.execute_reply.started":"2023-12-31T07:31:01.414177Z","shell.execute_reply":"2023-12-31T07:31:01.439351Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" id comment_text \\\n0 0000997932d777bf Explanation\\nWhy the edits made under 
my usern... \n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n... ... ... \n159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n\n toxic severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 \n2 0 0 0 0 0 0 \n3 0 0 0 0 0 0 \n4 0 0 0 0 0 0 \n... ... ... ... ... ... ... \n159566 0 0 0 0 0 0 \n159567 0 0 0 0 0 0 \n159568 0 0 0 0 0 0 \n159569 0 0 0 0 0 0 \n159570 0 0 0 0 0 0 \n\n[159571 rows x 8 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
...........................
159566ffe987279560d7ff\":::::And for the second time of asking, when ...000000
159567ffea4adeee384e90You should be ashamed of yourself \\n\\nThat is ...000000
159568ffee36eab5c267c9Spitzer \\n\\nUmm, theres no actual article for ...000000
159569fff125370e4aaaf3And it looks like it was actually you who put ...000000
159570fff46fc426af1f9a\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 8 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data.drop(columns='id', inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.441248Z","iopub.execute_input":"2023-12-31T07:31:01.441559Z","iopub.status.idle":"2023-12-31T07:31:01.471169Z","shell.execute_reply.started":"2023-12-31T07:31:01.441534Z","shell.execute_reply":"2023-12-31T07:31:01.470315Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.472666Z","iopub.execute_input":"2023-12-31T07:31:01.473306Z","iopub.status.idle":"2023-12-31T07:31:01.498256Z","shell.execute_reply.started":"2023-12-31T07:31:01.473272Z","shell.execute_reply":"2023-12-31T07:31:01.497046Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" comment_text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
comment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\\nWhy the edits made under my usern...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\\nMore\\nI can't make any real suggestions on ...000000
4You, sir, are my hero. Any chance you remember...000000
........................
159566\":::::And for the second time of asking, when ...000000
159567You should be ashamed of yourself \\n\\nThat is ...000000
159568Spitzer \\n\\nUmm, theres no actual article for ...000000
159569And it looks like it was actually you who put ...000000
159570\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 7 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data.rename(columns={'comment_text': 'text'}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.499556Z","iopub.execute_input":"2023-12-31T07:31:01.500387Z","iopub.status.idle":"2023-12-31T07:31:01.548770Z","shell.execute_reply.started":"2023-12-31T07:31:01.500353Z","shell.execute_reply":"2023-12-31T07:31:01.547759Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"data.shape","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.550009Z","iopub.execute_input":"2023-12-31T07:31:01.550326Z","iopub.status.idle":"2023-12-31T07:31:01.596006Z","shell.execute_reply.started":"2023-12-31T07:31:01.550298Z","shell.execute_reply":"2023-12-31T07:31:01.594926Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(159571, 7)"},"metadata":{}}]},{"cell_type":"code","source":"data.dtypes","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.599806Z","iopub.execute_input":"2023-12-31T07:31:01.600108Z","iopub.status.idle":"2023-12-31T07:31:01.646405Z","shell.execute_reply.started":"2023-12-31T07:31:01.600085Z","shell.execute_reply":"2023-12-31T07:31:01.645539Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"text object\ntoxic int64\nsevere_toxic int64\nobscene int64\nthreat int64\ninsult int64\nidentity_hate int64\ndtype: object"},"metadata":{}}]},{"cell_type":"code","source":"# data.drop(columns='categories', 
inplace=True)\ndata.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.647555Z","iopub.execute_input":"2023-12-31T07:31:01.647856Z","iopub.status.idle":"2023-12-31T07:31:01.737114Z","shell.execute_reply.started":"2023-12-31T07:31:01.647831Z","shell.execute_reply":"2023-12-31T07:31:01.736103Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.739003Z","iopub.execute_input":"2023-12-31T07:31:01.739433Z","iopub.status.idle":"2023-12-31T07:31:01.854310Z","shell.execute_reply.started":"2023-12-31T07:31:01.739398Z","shell.execute_reply":"2023-12-31T07:31:01.853223Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\\nWhy the edits made under my usern...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\\nMore\\nI can't make any real suggestions on ...000000
4You, sir, are my hero. Any chance you remember...000000
........................
159566\":::::And for the second time of asking, when ...000000
159567You should be ashamed of yourself \\n\\nThat is ...000000
159568Spitzer \\n\\nUmm, theres no actual article for ...000000
159569And it looks like it was actually you who put ...000000
159570\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 7 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.855770Z","iopub.execute_input":"2023-12-31T07:31:01.856752Z","iopub.status.idle":"2023-12-31T07:31:01.910184Z","shell.execute_reply.started":"2023-12-31T07:31:01.856715Z","shell.execute_reply":"2023-12-31T07:31:01.909374Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"\"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\""},"metadata":{}}]},{"cell_type":"code","source":"CLASS_NAMES = [*data.columns][1:]\nprint(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.911346Z","iopub.execute_input":"2023-12-31T07:31:01.911647Z","iopub.status.idle":"2023-12-31T07:31:01.921374Z","shell.execute_reply.started":"2023-12-31T07:31:01.911622Z","shell.execute_reply":"2023-12-31T07:31:01.920546Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n","output_type":"stream"}]},{"cell_type":"code","source":"tp = TextPreprocessor()\ndata['text'] = 
tp.preprocess(data['text'])","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.922648Z","iopub.execute_input":"2023-12-31T07:31:01.923004Z","iopub.status.idle":"2023-12-31T07:31:53.140912Z","shell.execute_reply.started":"2023-12-31T07:31:01.922973Z","shell.execute_reply":"2023-12-31T07:31:53.140089Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.142143Z","iopub.execute_input":"2023-12-31T07:31:53.142437Z","iopub.status.idle":"2023-12-31T07:31:53.148219Z","shell.execute_reply.started":"2023-12-31T07:31:53.142410Z","shell.execute_reply":"2023-12-31T07:31:53.147230Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"'hey man really not trying edit war just guy constantly removing relevant information talking edits instead talk page seems care more formatting actual info'"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"wb\") as model_file_obj:\n cloudpickle.dump((tp, CLASS_NAMES), model_file_obj)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.149417Z","iopub.execute_input":"2023-12-31T07:31:53.149706Z","iopub.status.idle":"2023-12-31T07:31:53.170819Z","shell.execute_reply.started":"2023-12-31T07:31:53.149682Z","shell.execute_reply":"2023-12-31T07:31:53.169945Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"x = data['text']\ny = 
data.drop(columns='text').values.copy()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.172088Z","iopub.execute_input":"2023-12-31T07:31:53.172429Z","iopub.status.idle":"2023-12-31T07:31:53.186905Z","shell.execute_reply.started":"2023-12-31T07:31:53.172396Z","shell.execute_reply":"2023-12-31T07:31:53.186117Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"x","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.188005Z","iopub.execute_input":"2023-12-31T07:31:53.188275Z","iopub.status.idle":"2023-12-31T07:31:53.198595Z","shell.execute_reply.started":"2023-12-31T07:31:53.188251Z","shell.execute_reply":"2023-12-31T07:31:53.197549Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"0 explanation edits made under username hardcore...\n1 aww matches background colour seemingly stuck ...\n2 hey man really not trying edit war just guy co...\n3 more cannot make real suggestions improvement ...\n4 sir hero chance remember page\n ... 
\n159566 second time asking view completely contradicts...\n159567 ashamed horrible thing put talk page\n159568 spitzer umm theres no actual article prostitut...\n159569 looks like actually put speedy first version d...\n159570 really not think understand came idea bad righ...\nName: text, Length: 159571, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"y","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.199909Z","iopub.execute_input":"2023-12-31T07:31:53.200274Z","iopub.status.idle":"2023-12-31T07:31:53.212733Z","shell.execute_reply.started":"2023-12-31T07:31:53.200239Z","shell.execute_reply":"2023-12-31T07:31:53.211770Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n ...,\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.213823Z","iopub.execute_input":"2023-12-31T07:31:53.214062Z","iopub.status.idle":"2023-12-31T07:31:53.249561Z","shell.execute_reply.started":"2023-12-31T07:31:53.214040Z","shell.execute_reply":"2023-12-31T07:31:53.248733Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"x_train.shape, x_test.shape, y_train.shape, y_test.shape","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.250622Z","iopub.execute_input":"2023-12-31T07:31:53.250874Z","iopub.status.idle":"2023-12-31T07:31:53.287390Z","shell.execute_reply.started":"2023-12-31T07:31:53.250851Z","shell.execute_reply":"2023-12-31T07:31:53.286544Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"((127656,), (31915,), (127656, 6), (31915, 
6))"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test = x_train.to_list(), x_test.to_list()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.288810Z","iopub.execute_input":"2023-12-31T07:31:53.289151Z","iopub.status.idle":"2023-12-31T07:31:53.307507Z","shell.execute_reply.started":"2023-12-31T07:31:53.289119Z","shell.execute_reply":"2023-12-31T07:31:53.306916Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"from transformers import DistilBertTokenizerFast","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:53.308641Z","iopub.execute_input":"2023-12-31T07:31:53.308902Z","iopub.status.idle":"2023-12-31T07:31:53.319637Z","shell.execute_reply.started":"2023-12-31T07:31:53.308879Z","shell.execute_reply":"2023-12-31T07:31:53.318812Z"},"trusted":true},"execution_count":24,"outputs":[]},{"cell_type":"code","source":"model_checkpoint = \"distilbert-base-uncased\"\ntokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(x_train[0])\nprint(tokenizer.tokenize(x_train[0]))\nprint(tokenizer(x_train[0]))","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:54.158520Z","iopub.execute_input":"2023-12-31T07:31:54.158802Z","iopub.status.idle":"2023-12-31T07:31:54.170494Z","shell.execute_reply.started":"2023-12-31T07:31:54.158777Z","shell.execute_reply":"2023-12-31T07:31:54.169665Z"},"trusted":true},"execution_count":26,"outputs":[{"name":"stdout","text":"grandma terri burn trash grandma terri trash hate grandma terri hell\n['grandma', 'terri', 'burn', 'trash', 'grandma', 'terri', 'trash', 'hate', 'grandma', 'terri', 'hell']\n{'input_ids': [101, 13055, 26568, 6402, 11669, 13055, 26568, 11669, 5223, 13055, 26568, 3109, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n","output_type":"stream"}]},{"cell_type":"code","source":"strategy = 
tf.distribute.MirroredStrategy()","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:54.171594Z","iopub.execute_input":"2023-12-31T07:31:54.171911Z","iopub.status.idle":"2023-12-31T07:32:02.081175Z","shell.execute_reply.started":"2023-12-31T07:31:54.171887Z","shell.execute_reply":"2023-12-31T07:32:02.080424Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"# Global batch size: 16 examples for each replica of the distribution strategy.\nBATCH_SIZE = 16 * strategy.num_replicas_in_sync\nN_TOKENS = 512\nN_CLASSES = len(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:02.087842Z","iopub.execute_input":"2023-12-31T07:32:02.088137Z","iopub.status.idle":"2023-12-31T07:32:02.092108Z","shell.execute_reply.started":"2023-12-31T07:32:02.088110Z","shell.execute_reply":"2023-12-31T07:32:02.091404Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"# One shared option set so the train and test splits are encoded identically:\n# every text is padded/truncated to N_TOKENS and returned as TensorFlow tensors.\ntokenizer_kwargs = dict(\n    max_length=N_TOKENS,\n    padding=\"max_length\",\n    truncation=True,\n    return_tensors=\"tf\",\n    return_attention_mask=True,\n    return_token_type_ids=False,\n)\ntrain_tokens = tokenizer(x_train, **tokenizer_kwargs)\ntest_tokens = tokenizer(x_test, 
**tokenizer_kwargs)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:02.093570Z","iopub.execute_input":"2023-12-31T07:32:02.094195Z","iopub.status.idle":"2023-12-31T07:32:30.386628Z","shell.execute_reply.started":"2023-12-31T07:32:02.094161Z","shell.execute_reply":"2023-12-31T07:32:30.385835Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"train_tokens[:5]","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.390616Z","iopub.execute_input":"2023-12-31T07:32:30.390916Z","iopub.status.idle":"2023-12-31T07:32:30.396763Z","shell.execute_reply.started":"2023-12-31T07:32:30.390891Z","shell.execute_reply":"2023-12-31T07:32:30.395947Z"},"trusted":true},"execution_count":30,"outputs":[{"execution_count":30,"output_type":"execute_result","data":{"text/plain":"[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]"},"metadata":{}}]},{"cell_type":"code","source":"sample_weight_param = class_weight.compute_sample_weight(class_weight='balanced', 
y=y_train)\nsample_weight_param","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.397786Z","iopub.execute_input":"2023-12-31T07:32:30.398043Z","iopub.status.idle":"2023-12-31T07:32:30.588369Z","shell.execute_reply.started":"2023-12-31T07:32:30.398021Z","shell.execute_reply":"2023-12-31T07:32:30.587462Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"array([0.18495376, 0.01961102, 0.01961102, ..., 0.18495376, 0.01961102,\n 0.01961102])"},"metadata":{}}]},{"cell_type":"code","source":"len(sample_weight_param)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.589627Z","iopub.execute_input":"2023-12-31T07:32:30.589973Z","iopub.status.idle":"2023-12-31T07:32:30.595964Z","shell.execute_reply.started":"2023-12-31T07:32:30.589946Z","shell.execute_reply":"2023-12-31T07:32:30.594963Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":"127656"},"metadata":{}}]},{"cell_type":"code","source":"# Each element is (features dict, labels). The sample weights computed above are\n# currently not part of the dataset; to train with them, pass sample_weight_param\n# as a third element of the train tuple.\ntrain_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), y_train))\ntest_tf_data = tf.data.Dataset.from_tensor_slices((dict(test_tokens), 
y_test))","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.597196Z","iopub.execute_input":"2023-12-31T07:32:30.597885Z","iopub.status.idle":"2023-12-31T07:32:30.619144Z","shell.execute_reply.started":"2023-12-31T07:32:30.597852Z","shell.execute_reply":"2023-12-31T07:32:30.618351Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"# Release the large intermediates now that the datasets have been built.\ndel data, train_tokens, test_tokens","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:30.620239Z","iopub.execute_input":"2023-12-31T07:32:30.620611Z","iopub.status.idle":"2023-12-31T07:32:34.275696Z","shell.execute_reply.started":"2023-12-31T07:32:30.620583Z","shell.execute_reply":"2023-12-31T07:32:34.274852Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"# Let tf.data pick the prefetch buffer size for both pipelines.\ntrain_tf_data = train_tf_data.prefetch(tf.data.AUTOTUNE)\ntest_tf_data = test_tf_data.prefetch(tf.data.AUTOTUNE)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.276970Z","iopub.execute_input":"2023-12-31T07:32:34.277317Z","iopub.status.idle":"2023-12-31T07:32:34.289536Z","shell.execute_reply.started":"2023-12-31T07:32:34.277290Z","shell.execute_reply":"2023-12-31T07:32:34.288697Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"# Peek at a single dataset element to check its structure.\nfor example in train_tf_data.take(1):\n    print(example)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.290687Z","iopub.execute_input":"2023-12-31T07:32:34.290966Z","iopub.status.idle":"2023-12-31T07:32:34.387248Z","shell.execute_reply.started":"2023-12-31T07:32:34.290942Z","shell.execute_reply":"2023-12-31T07:32:34.386228Z"},"trusted":true},"execution_count":36,"outputs":[{"name":"stdout","text":"({'input_ids': , 'attention_mask': }, )\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, 
BatchNormalization","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.388735Z","iopub.execute_input":"2023-12-31T07:32:34.389134Z","iopub.status.idle":"2023-12-31T07:32:34.394189Z","shell.execute_reply.started":"2023-12-31T07:32:34.389098Z","shell.execute_reply":"2023-12-31T07:32:34.393217Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"config = DistilBertConfig.from_pretrained(model_checkpoint, output_hidden_states=False)","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.395453Z","iopub.execute_input":"2023-12-31T07:32:34.395832Z","iopub.status.idle":"2023-12-31T07:32:34.465568Z","shell.execute_reply.started":"2023-12-31T07:32:34.395785Z","shell.execute_reply":"2023-12-31T07:32:34.464698Z"},"trusted":true},"execution_count":38,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.optimizers.schedules import PolynomialDecay\n\n# PolynomialDecay advances once per optimizer step (one step per batch), but\n# train_tf_data is still unbatched at this point, so len() is the number of\n# examples. Convert it to steps so the learning rate really reaches 0 by the end.\n# NOTE(review): assumes training batches with BATCH_SIZE and runs N_EPOCHS epochs - confirm against the fit call.\nN_EPOCHS = 10\nsteps_per_epoch = -(-len(train_tf_data) // BATCH_SIZE)  # ceiling division\n\nwith strategy.scope():\n    # The backbone gets its own name so the classifier built below does not shadow it.\n    distilbert = TFDistilBertModel.from_pretrained(model_checkpoint, config=config)\n    learning_schedule = PolynomialDecay(initial_learning_rate=2e-5, decay_steps=steps_per_epoch * N_EPOCHS, end_learning_rate=0)\n    input_ids = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"input_ids\")\n    attention_mask = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"attention_mask\")\n    x = distilbert([input_ids, attention_mask])[0][:, 0, :]  # [CLS] token of last hidden state\n    # Classification head: two Dropout -> BatchNorm -> Dense(relu) stages, then Dropout -> BatchNorm.\n    x = Dropout(0.3)(x)\n    x = BatchNormalization()(x)\n    x = Dense(1024, activation=\"relu\")(x)\n    x = Dropout(0.3)(x)\n    x = BatchNormalization()(x)\n    x = Dense(512, activation=\"relu\")(x)\n    x = Dropout(0.3)(x)\n    x = BatchNormalization()(x)\n    # Independent sigmoid per class, paired with binary cross-entropy in compile().\n    output = Dense(N_CLASSES, activation=\"sigmoid\", name=\"output\")(x)\n    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)\n    metric = [tf.keras.metrics.AUC(multi_label=True, num_labels=N_CLASSES)]\n    model.compile(optimizer=tf.keras.optimizers.Adam(learning_schedule), metrics=metric, 
loss=tf.keras.losses.BinaryCrossentropy())","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:32:34.466800Z","iopub.execute_input":"2023-12-31T07:32:34.467131Z","iopub.status.idle":"2023-12-31T07:32:45.278768Z","shell.execute_reply.started":"2023-12-31T07:32:34.467100Z","shell.execute_reply":"2023-12-31T07:32:45.277811Z"},"trusted":true},"execution_count":39,"outputs":[{"output_type":"display_data","data":{"text/plain":"model.safetensors: 0%| | 0.00/268M [00:00\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
classprob
3threat0.000621
1severe_toxic0.000848
5identity_hate0.000876
2obscene0.001126
4insult0.001540
0toxic0.002890
\n"},"metadata":{}}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}