{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7296872,"sourceType":"datasetVersion","datasetId":4232592},{"sourceId":7296911,"sourceType":"datasetVersion","datasetId":4232619},{"sourceId":7318087,"sourceType":"datasetVersion","datasetId":4246746}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install tensorflow==2.10","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport string\nfrom unidecode import unidecode\nimport tensorflow as tf \nfrom sklearn.utils import class_weight\nfrom tensorflow.keras.utils import to_categorical\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nimport cloudpickle\nimport os\nfrom transformers import DistilBertTokenizerFast\nfrom transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.callbacks import EarlyStopping","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TextPreprocessor:\n def __init__(self, remove_punct: bool = True, remove_digits: bool = True,\n remove_stop_words: bool = False,\n remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,\n bottom_p: float = None):\n self.remove_punct = remove_punct\n self.remove_digits = remove_digits\n self.remove_stop_words = remove_stop_words\n self.remove_short_words = remove_short_words\n self.minlen = minlen\n self.maxlen = maxlen\n self.top_p = top_p\n self.bottom_p = bottom_p\n self.words_to_remove = []\n self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',\n 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',\n 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',\n 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',\n 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',\n 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',\n 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',\n 'into', 'through', 'during', 'before', 'after', 'to', 'from',\n 'in', 'out', 'on', 'off', 'further', 'then', 'once',\n 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',\n 'other', 'such', 'own', 'same', 'so', 'than', 'can', 'will', 'should','now']\n\n self.contraction_to_expansion = {\"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"cannot\",\n \"can't've\": \"cannot have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it had\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so is\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there had\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we had\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'alls\": \"you alls\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you had\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you you will\",\n \"you'll've\": \"you you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n @staticmethod\n def __remove_double_whitespaces(string: str):\n return \" \".join(string.split())\n\n def __remove_url(self, string_series: pd.Series):\n \"\"\"\n Removes URLs m text\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(\n pat=r\"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})\",\n repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __expand(self, string_series: pd.Series):\n \"\"\"\n Replaces contractions with expansions. eg. don't wit do not.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n for c, e in self.contraction_to_expansion.items():\n clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_punct(self, string_series: pd.Series):\n \"\"\"\n Removes punctuations from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n puncts = [r'\\n', r'\\r', r'\\t']\n puncts.extend(list(string.punctuation))\n for i in puncts:\n clean_string_series = clean_string_series.str.replace(pat=i, repl=\" \", regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_digits(self, string_series: pd.Series):\n \"\"\"\n Removes digits from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(pat=r'\\d', repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n @staticmethod\n def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):\n \"\"\"\n Reomves words/tokens where minlen <= len <= maxlen.\n :param string_series: pd.Series, input string series\n :param minlen: int, minimum length of token to be removed.\n :param maxlen: int, maximum length of token to be removed.\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split() if\n (len(word) > maxlen) or (len(word) < minlen)]))\n return clean_string_series\n\n def __remove_stop_words(self, string_series: pd.Series):\n \"\"\"\n Removes stop words from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n def str_remove_stop_words(string: str):\n stops = self.stop_words\n return \" \".join([token for token in string.split() if token not in stops])\n\n return string_series.map(str_remove_stop_words)\n\n def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,\n bottom_p: int = None, dataset: str = 'train'):\n \"\"\"\n Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.\n :param string_series: pd.Series, input string series\n :param top_p: float, percent of frequent words to remove.\n :param bottom_p: float, percent of rare words to remove.\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n if dataset == 'train':\n if top_p is None:\n top_p = 0\n if bottom_p is None:\n bottom_p = 0\n\n if top_p > 0 or bottom_p > 0:\n word_freq = pd.Series(\" \".join(string_series).split()).value_counts()\n n_words = len(word_freq)\n\n if top_p > 0:\n self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])\n\n if bottom_p > 0:\n self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])\n\n if len(self.words_to_remove) == 0:\n return string_series\n else:\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split()\n if word not in self.words_to_remove]))\n return clean_string_series\n\n def preprocess(self, string_series: pd.Series, dataset: str = \"train\"):\n \"\"\"\n Entry point.\n :param string_series: pd.Series, input string series\n :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n string_series = string_series.str.lower()\n string_series = string_series.map(unidecode)\n string_series = self.__remove_url(string_series=string_series)\n string_series = self.__expand(string_series=string_series)\n\n if self.remove_punct:\n string_series = self.__remove_punct(string_series=string_series)\n if self.remove_digits:\n string_series = self.__remove_digits(string_series=string_series)\n if self.remove_stop_words:\n string_series = self.__remove_stop_words(string_series=string_series)\n if self.remove_short_words:\n string_series = self.__remove_short_words(string_series=string_series,\n minlen=self.minlen,\n maxlen=self.maxlen)\n string_series = self.__remove_top_bottom_words(string_series=string_series,\n top_p=self.top_p,\n bottom_p=self.bottom_p, dataset=dataset)\n\n string_series = string_series.str.strip()\n string_series.replace(to_replace=\"\", value=\"this is an empty message\", inplace=True)\n\n return string_series","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:08.561739Z","iopub.execute_input":"2024-01-01T15:47:08.562416Z","iopub.status.idle":"2024-01-01T15:47:08.604271Z","shell.execute_reply.started":"2024-01-01T15:47:08.562388Z","shell.execute_reply":"2024-01-01T15:47:08.603034Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"data = pd.read_csv('train.csv')","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:08.606763Z","iopub.execute_input":"2024-01-01T15:47:08.607049Z","iopub.status.idle":"2024-01-01T15:47:10.359700Z","shell.execute_reply.started":"2024-01-01T15:47:08.607024Z","shell.execute_reply":"2024-01-01T15:47:10.358885Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.360758Z","iopub.execute_input":"2024-01-01T15:47:10.361041Z","iopub.status.idle":"2024-01-01T15:47:10.387262Z","shell.execute_reply.started":"2024-01-01T15:47:10.361017Z","shell.execute_reply":"2024-01-01T15:47:10.386476Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" id comment_text \\\n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n... ... ... \n159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n\n toxic severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 \n2 0 0 0 0 0 0 \n3 0 0 0 0 0 0 \n4 0 0 0 0 0 0 \n... ... ... ... ... ... ... \n159566 0 0 0 0 0 0 \n159567 0 0 0 0 0 0 \n159568 0 0 0 0 0 0 \n159569 0 0 0 0 0 0 \n159570 0 0 0 0 0 0 \n\n[159571 rows x 8 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
...........................
159566ffe987279560d7ff\":::::And for the second time of asking, when ...000000
159567ffea4adeee384e90You should be ashamed of yourself \\n\\nThat is ...000000
159568ffee36eab5c267c9Spitzer \\n\\nUmm, theres no actual article for ...000000
159569fff125370e4aaaf3And it looks like it was actually you who put ...000000
159570fff46fc426af1f9a\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 8 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data.iloc[:, 2:].apply(np.mean)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.388359Z","iopub.execute_input":"2024-01-01T15:47:10.388620Z","iopub.status.idle":"2024-01-01T15:47:10.409686Z","shell.execute_reply.started":"2024-01-01T15:47:10.388597Z","shell.execute_reply":"2024-01-01T15:47:10.408904Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"toxic 0.095844\nsevere_toxic 0.009996\nobscene 0.052948\nthreat 0.002996\ninsult 0.049364\nidentity_hate 0.008805\ndtype: float64"},"metadata":{}}]},{"cell_type":"code","source":"data.loc[data['toxic']==1, 'comment_text']","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.410789Z","iopub.execute_input":"2024-01-01T15:47:10.411391Z","iopub.status.idle":"2024-01-01T15:47:10.420370Z","shell.execute_reply.started":"2024-01-01T15:47:10.411356Z","shell.execute_reply":"2024-01-01T15:47:10.419483Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"6 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK\n12 Hey... what is it..\\n@ | talk .\\nWhat is it......\n16 Bye! \\n\\nDon't look, come or think of comming ...\n42 You are gay or antisemmitian? \\n\\nArchangel WH...\n43 FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\n ... \n159494 \"\\n\\n our previous conversation \\n\\nyou fuckin...\n159514 YOU ARE A MISCHIEVIOUS PUBIC HAIR\n159541 Your absurd edits \\n\\nYour absurd edits on gre...\n159546 \"\\n\\nHey listen don't you ever!!!! Delete my e...\n159554 and i'm going to keep posting the stuff u dele...\nName: comment_text, Length: 15294, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"data.drop(columns='id', inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.421480Z","iopub.execute_input":"2024-01-01T15:47:10.421751Z","iopub.status.idle":"2024-01-01T15:47:10.446263Z","shell.execute_reply.started":"2024-01-01T15:47:10.421727Z","shell.execute_reply":"2024-01-01T15:47:10.445444Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.447467Z","iopub.execute_input":"2024-01-01T15:47:10.448193Z","iopub.status.idle":"2024-01-01T15:47:10.461011Z","shell.execute_reply.started":"2024-01-01T15:47:10.448136Z","shell.execute_reply":"2024-01-01T15:47:10.460160Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" comment_text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
comment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\\nWhy the edits made under my usern...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\\nMore\\nI can't make any real suggestions on ...000000
4You, sir, are my hero. Any chance you remember...000000
........................
159566\":::::And for the second time of asking, when ...000000
159567You should be ashamed of yourself \\n\\nThat is ...000000
159568Spitzer \\n\\nUmm, theres no actual article for ...000000
159569And it looks like it was actually you who put ...000000
159570\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 7 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data.rename(columns={'comment_text': 'text'}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.464669Z","iopub.execute_input":"2024-01-01T15:47:10.464933Z","iopub.status.idle":"2024-01-01T15:47:10.472614Z","shell.execute_reply.started":"2024-01-01T15:47:10.464902Z","shell.execute_reply":"2024-01-01T15:47:10.471928Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"data.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.473840Z","iopub.execute_input":"2024-01-01T15:47:10.474531Z","iopub.status.idle":"2024-01-01T15:47:10.486073Z","shell.execute_reply.started":"2024-01-01T15:47:10.474501Z","shell.execute_reply":"2024-01-01T15:47:10.485243Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"(159571, 7)"},"metadata":{}}]},{"cell_type":"code","source":"data.dtypes","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.486990Z","iopub.execute_input":"2024-01-01T15:47:10.487272Z","iopub.status.idle":"2024-01-01T15:47:10.499292Z","shell.execute_reply.started":"2024-01-01T15:47:10.487249Z","shell.execute_reply":"2024-01-01T15:47:10.498422Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"text object\ntoxic int64\nsevere_toxic int64\nobscene int64\nthreat int64\ninsult int64\nidentity_hate int64\ndtype: object"},"metadata":{}}]},{"cell_type":"code","source":"# data.drop(columns='categories', inplace=True)\ndata.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.500596Z","iopub.execute_input":"2024-01-01T15:47:10.500858Z","iopub.status.idle":"2024-01-01T15:47:10.534055Z","shell.execute_reply.started":"2024-01-01T15:47:10.500835Z","shell.execute_reply":"2024-01-01T15:47:10.533213Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.535327Z","iopub.execute_input":"2024-01-01T15:47:10.535998Z","iopub.status.idle":"2024-01-01T15:47:10.550373Z","shell.execute_reply.started":"2024-01-01T15:47:10.535965Z","shell.execute_reply":"2024-01-01T15:47:10.549243Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":" text toxic \\\n0 Explanation\\nWhy the edits made under my usern... 0 \n1 D'aww! He matches this background colour I'm s... 0 \n2 Hey man, I'm really not trying to edit war. It... 0 \n3 \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 You, sir, are my hero. Any chance you remember... 0 \n... ... ... \n159566 \":::::And for the second time of asking, when ... 0 \n159567 You should be ashamed of yourself \\n\\nThat is ... 0 \n159568 Spitzer \\n\\nUmm, theres no actual article for ... 0 \n159569 And it looks like it was actually you who put ... 0 \n159570 \"\\nAnd ... I really don't think you understand... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 \n... ... ... ... ... ... \n159566 0 0 0 0 0 \n159567 0 0 0 0 0 \n159568 0 0 0 0 0 \n159569 0 0 0 0 0 \n159570 0 0 0 0 0 \n\n[159571 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
texttoxicsevere_toxicobscenethreatinsultidentity_hate
0Explanation\\nWhy the edits made under my usern...000000
1D'aww! He matches this background colour I'm s...000000
2Hey man, I'm really not trying to edit war. It...000000
3\"\\nMore\\nI can't make any real suggestions on ...000000
4You, sir, are my hero. Any chance you remember...000000
........................
159566\":::::And for the second time of asking, when ...000000
159567You should be ashamed of yourself \\n\\nThat is ...000000
159568Spitzer \\n\\nUmm, theres no actual article for ...000000
159569And it looks like it was actually you who put ...000000
159570\"\\nAnd ... I really don't think you understand...000000
\n

159571 rows × 7 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.551372Z","iopub.execute_input":"2024-01-01T15:47:10.551662Z","iopub.status.idle":"2024-01-01T15:47:10.560442Z","shell.execute_reply.started":"2024-01-01T15:47:10.551631Z","shell.execute_reply":"2024-01-01T15:47:10.559694Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"\"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\""},"metadata":{}}]},{"cell_type":"code","source":"CLASS_NAMES = [*data.columns][1:]\nprint(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.561389Z","iopub.execute_input":"2024-01-01T15:47:10.561696Z","iopub.status.idle":"2024-01-01T15:47:10.570269Z","shell.execute_reply.started":"2024-01-01T15:47:10.561673Z","shell.execute_reply":"2024-01-01T15:47:10.569424Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stdout","text":"['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n","output_type":"stream"}]},{"cell_type":"code","source":"tp = TextPreprocessor()\ndata['text'] = tp.preprocess(data['text'])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.571497Z","iopub.execute_input":"2024-01-01T15:47:10.571829Z","iopub.status.idle":"2024-01-01T15:47:46.305794Z","shell.execute_reply.started":"2024-01-01T15:47:10.571799Z","shell.execute_reply":"2024-01-01T15:47:46.305002Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.306763Z","iopub.execute_input":"2024-01-01T15:47:46.307015Z","iopub.status.idle":"2024-01-01T15:47:46.312867Z","shell.execute_reply.started":"2024-01-01T15:47:46.306993Z","shell.execute_reply":"2024-01-01T15:47:46.311963Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"'hey man i am really not trying to edit war it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info'"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"wb\") as model_file_obj:\n cloudpickle.dump((tp, CLASS_NAMES), model_file_obj)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.314062Z","iopub.execute_input":"2024-01-01T15:47:46.314481Z","iopub.status.idle":"2024-01-01T15:47:46.332489Z","shell.execute_reply.started":"2024-01-01T15:47:46.314449Z","shell.execute_reply":"2024-01-01T15:47:46.331749Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"x = data['text']\ny = data.drop(columns='text').values.copy()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.333549Z","iopub.execute_input":"2024-01-01T15:47:46.333811Z","iopub.status.idle":"2024-01-01T15:47:46.440198Z","shell.execute_reply.started":"2024-01-01T15:47:46.333789Z","shell.execute_reply":"2024-01-01T15:47:46.439230Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"x","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.441552Z","iopub.execute_input":"2024-01-01T15:47:46.442022Z","iopub.status.idle":"2024-01-01T15:47:46.450997Z","shell.execute_reply.started":"2024-01-01T15:47:46.441987Z","shell.execute_reply":"2024-01-01T15:47:46.450118Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"0 explanation why the edits made under my userna...\n1 d aww he matches this background colour i am s...\n2 hey man i am really not trying to edit war it ...\n3 more i cannot make any real suggestions on imp...\n4 you sir are my hero any chance you remember wh...\n ... \n159566 and for the second time of asking when your vi...\n159567 you should be ashamed of yourself that is a ho...\n159568 spitzer umm theres no actual article for prost...\n159569 and it looks like it was actually you who put ...\n159570 and i really do not think you understand i cam...\nName: text, Length: 159571, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"y","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.452005Z","iopub.execute_input":"2024-01-01T15:47:46.452267Z","iopub.status.idle":"2024-01-01T15:47:46.464143Z","shell.execute_reply.started":"2024-01-01T15:47:46.452245Z","shell.execute_reply":"2024-01-01T15:47:46.463182Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n ...,\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.465279Z","iopub.execute_input":"2024-01-01T15:47:46.465582Z","iopub.status.idle":"2024-01-01T15:47:46.501516Z","shell.execute_reply.started":"2024-01-01T15:47:46.465556Z","shell.execute_reply":"2024-01-01T15:47:46.500757Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"x_train.shape, x_test.shape, y_train.shape, y_test.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.502491Z","iopub.execute_input":"2024-01-01T15:47:46.502756Z","iopub.status.idle":"2024-01-01T15:47:46.508855Z","shell.execute_reply.started":"2024-01-01T15:47:46.502733Z","shell.execute_reply":"2024-01-01T15:47:46.507998Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"((111699,), (47872,), (111699, 6), (47872, 6))"},"metadata":{}}]},{"cell_type":"code","source":"def compute_pos_weight(y_train):\n num_positives = np.sum(y_train, axis=0)\n total_examples = y_train.shape[0]\n class_weights = num_positives / total_examples\n pos_weight = 1.0 / class_weights \n return pos_weight","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.509931Z","iopub.execute_input":"2024-01-01T15:47:46.510222Z","iopub.status.idle":"2024-01-01T15:47:46.520489Z","shell.execute_reply.started":"2024-01-01T15:47:46.510193Z","shell.execute_reply":"2024-01-01T15:47:46.519690Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"POS_WEIGHT = compute_pos_weight(y_train)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.521479Z","iopub.execute_input":"2024-01-01T15:47:46.521709Z","iopub.status.idle":"2024-01-01T15:47:46.535983Z","shell.execute_reply.started":"2024-01-01T15:47:46.521689Z","shell.execute_reply":"2024-01-01T15:47:46.535146Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"POS_WEIGHT","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.536943Z","iopub.execute_input":"2024-01-01T15:47:46.537232Z","iopub.status.idle":"2024-01-01T15:47:46.547418Z","shell.execute_reply.started":"2024-01-01T15:47:46.537200Z","shell.execute_reply":"2024-01-01T15:47:46.546609Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"array([ 10.42746453, 100.72046889, 18.95452231, 326.60526316,\n 20.35331633, 114.79856115])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test = x_train.to_list(), x_test.to_list()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.553940Z","iopub.execute_input":"2024-01-01T15:47:46.554221Z","iopub.status.idle":"2024-01-01T15:47:46.566178Z","shell.execute_reply.started":"2024-01-01T15:47:46.554192Z","shell.execute_reply":"2024-01-01T15:47:46.565562Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"from transformers import DistilBertTokenizerFast","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.567076Z","iopub.execute_input":"2024-01-01T15:47:46.567346Z","iopub.status.idle":"2024-01-01T15:47:46.578372Z","shell.execute_reply.started":"2024-01-01T15:47:46.567324Z","shell.execute_reply":"2024-01-01T15:47:46.577596Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"model_checkpoint = \"distilbert-base-uncased\"\ntokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.579486Z","iopub.execute_input":"2024-01-01T15:47:46.579731Z","iopub.status.idle":"2024-01-01T15:47:47.602960Z","shell.execute_reply.started":"2024-01-01T15:47:46.579709Z","shell.execute_reply":"2024-01-01T15:47:47.602058Z"},"trusted":true},"execution_count":30,"outputs":[{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%| | 0.00/28.0 [00:00, 'attention_mask': }, )\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:42.885276Z","iopub.execute_input":"2024-01-01T15:48:42.886195Z","iopub.status.idle":"2024-01-01T15:48:42.890691Z","shell.execute_reply.started":"2024-01-01T15:48:42.886141Z","shell.execute_reply":"2024-01-01T15:48:42.889813Z"},"trusted":true},"execution_count":40,"outputs":[]},{"cell_type":"code","source":"config = DistilBertConfig.from_pretrained(model_checkpoint, output_hidden_states=False)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:43.619284Z","iopub.execute_input":"2024-01-01T15:48:43.620217Z","iopub.status.idle":"2024-01-01T15:48:43.698896Z","shell.execute_reply.started":"2024-01-01T15:48:43.620159Z","shell.execute_reply":"2024-01-01T15:48:43.697941Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"def weighted_binary_crossentropy(y_true, y_pred):\n # handle class imbalance\n y_true = tf.cast(y_true, tf.float32)\n loss = tf.nn.weighted_cross_entropy_with_logits(labels=y_true, logits=y_pred, pos_weight=POS_WEIGHT)\n return tf.reduce_mean(loss)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:44.492801Z","iopub.execute_input":"2024-01-01T15:48:44.493181Z","iopub.status.idle":"2024-01-01T15:48:44.500229Z","shell.execute_reply.started":"2024-01-01T15:48:44.493138Z","shell.execute_reply":"2024-01-01T15:48:44.499225Z"},"trusted":true},"execution_count":42,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.optimizers.schedules import PolynomialDecay\nwith strategy.scope():\n model = TFDistilBertModel.from_pretrained(model_checkpoint, config=config)\n learning_schedule = PolynomialDecay(initial_learning_rate=1e-4, decay_steps=len(train_tf_data) * 10, end_learning_rate=0)\n input_ids = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"input_ids\")\n attention_mask = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"attention_mask\")\n x = model([input_ids, attention_mask])[0][:,0,:] # [CLS] token of last hidden state\n x = BatchNormalization()(x)\n x = Dropout(0.3)(x)\n x = Dense(512, activation=\"relu\")(x)\n x = Dropout(0.3)(x)\n x = BatchNormalization()(x)\n x = Dense(512, activation=\"relu\")(x)\n x = Dropout(0.3)(x)\n x = BatchNormalization()(x)\n# x = Dense(256, activation=\"relu\")(x)\n# x = Dropout(0.3)(x)\n# x = BatchNormalization()(x)\n output = Dense(N_CLASSES, name=\"output\")(x) # no sigmoid activation since loss is computed using logits\n model = tf.keras.Model(inputs=[input_ids, attention_mask],outputs=output)\n metric = [tf.keras.metrics.AUC(multi_label=True, num_labels=N_CLASSES)]\n model.compile(optimizer=tf.keras.optimizers.Adam(learning_schedule), metrics=metric, loss=weighted_binary_crossentropy)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.245859Z","iopub.execute_input":"2024-01-01T15:48:57.246233Z","iopub.status.idle":"2024-01-01T15:48:57.304709Z","shell.execute_reply.started":"2024-01-01T15:48:57.246198Z","shell.execute_reply":"2024-01-01T15:48:57.303858Z"},"trusted":true},"execution_count":44,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to \n==================================================================================================\n input_ids (InputLayer) [(None, 512)] 0 [] \n \n attention_mask (InputLayer) [(None, 512)] 0 [] \n \n tf_distil_bert_model (TFDistil TFBaseModelOutput(l 66362880 ['input_ids[0][0]', \n BertModel) ast_hidden_state=(N 'attention_mask[0][0]'] \n one, 512, 768), \n hidden_states=None \n , attentions=None) \n \n tf.__operators__.getitem (Slic (None, 768) 0 ['tf_distil_bert_model[0][0]'] \n ingOpLambda) \n \n batch_normalization (BatchNorm (None, 768) 3072 ['tf.__operators__.getitem[0][0]'\n alization) ] \n \n dropout_19 (Dropout) (None, 768) 0 ['batch_normalization[0][0]'] \n \n dense (Dense) (None, 512) 393728 ['dropout_19[0][0]'] \n \n dropout_20 (Dropout) (None, 512) 0 ['dense[0][0]'] \n \n batch_normalization_1 (BatchNo (None, 512) 2048 ['dropout_20[0][0]'] \n rmalization) \n \n dense_1 (Dense) (None, 512) 262656 ['batch_normalization_1[0][0]'] \n \n dropout_21 (Dropout) (None, 512) 0 ['dense_1[0][0]'] \n \n batch_normalization_2 (BatchNo (None, 512) 2048 ['dropout_21[0][0]'] \n rmalization) \n \n output (Dense) (None, 6) 3078 ['batch_normalization_2[0][0]'] \n \n==================================================================================================\nTotal params: 67,029,510\nTrainable params: 67,025,926\nNon-trainable params: 3,584\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"from tensorflow.keras.callbacks import EarlyStopping\nearly_stop = EarlyStopping(monitor=\"val_loss\",patience=1,mode=\"min\")","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.305988Z","iopub.execute_input":"2024-01-01T15:48:57.306663Z","iopub.status.idle":"2024-01-01T15:48:57.311370Z","shell.execute_reply.started":"2024-01-01T15:48:57.306617Z","shell.execute_reply":"2024-01-01T15:48:57.310499Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"model.fit(train_tf_data.shuffle(len(train_tf_data)).batch(BATCH_SIZE), validation_data=test_tf_data.shuffle(len(test_tf_data)).batch(BATCH_SIZE), \n epochs=10, callbacks=[early_stop])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.313364Z","iopub.execute_input":"2024-01-01T15:48:57.313701Z","iopub.status.idle":"2024-01-01T17:50:28.473405Z","shell.execute_reply.started":"2024-01-01T15:48:57.313667Z","shell.execute_reply":"2024-01-01T17:50:28.472543Z"},"trusted":true},"execution_count":46,"outputs":[{"name":"stdout","text":"Epoch 1/10\n2234/2234 [==============================] - 3660s 2s/step - loss: 0.7597 - auc: 0.8941 - val_loss: 0.4980 - val_auc: 0.9317\nEpoch 2/10\n2234/2234 [==============================] - 3629s 2s/step - loss: 0.5204 - auc: 0.9265 - val_loss: 0.5717 - val_auc: 0.9456\n","output_type":"stream"},{"execution_count":46,"output_type":"execute_result","data":{"text/plain":""},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"toxic_comment_classifier_hf_distilbert.h5\")","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:34.269685Z","iopub.execute_input":"2024-01-01T17:50:34.270054Z","iopub.status.idle":"2024-01-01T17:50:35.987233Z","shell.execute_reply.started":"2024-01-01T17:50:34.270024Z","shell.execute_reply":"2024-01-01T17:50:35.986422Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"tf_model = tf.keras.models.load_model('toxic_comment_classifier_hf_distilbert.h5', custom_objects={\"TFDistilBertModel\": TFDistilBertModel, 'weighted_binary_crossentropy': weighted_binary_crossentropy})","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:38.175471Z","iopub.execute_input":"2024-01-01T17:50:38.175809Z","iopub.status.idle":"2024-01-01T17:50:42.024993Z","shell.execute_reply.started":"2024-01-01T17:50:38.175784Z","shell.execute_reply":"2024-01-01T17:50:42.024221Z"},"trusted":true},"execution_count":48,"outputs":[]},{"cell_type":"code","source":"def sigmoid(x):\n return 1 / (1 + np.exp(-x))","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:51:59.959469Z","iopub.execute_input":"2024-01-01T17:51:59.959766Z","iopub.status.idle":"2024-01-01T17:51:59.964272Z","shell.execute_reply.started":"2024-01-01T17:51:59.959742Z","shell.execute_reply":"2024-01-01T17:51:59.963375Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"markdown","source":"COMPUTING THRESHOLD ","metadata":{}},{"cell_type":"code","source":"def inference(text):\n model_checkpoint = \"distilbert-base-uncased\"\n tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)\n input_=tf.data.Dataset.from_tensor_slices((dict(tokenizer(text, \n max_length=512, padding=\"max_length\", \n truncation=True, return_tensors=\"tf\"))))\n pred = tf_model.predict(input_, verbose=0)\n \n return pred","metadata":{"execution":{"iopub.status.busy":"2024-01-01T14:58:41.888372Z","iopub.execute_input":"2024-01-01T14:58:41.888762Z","iopub.status.idle":"2024-01-01T14:58:41.894960Z","shell.execute_reply.started":"2024-01-01T14:58:41.888731Z","shell.execute_reply":"2024-01-01T14:58:41.894020Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"preds = inference([*x_test])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T14:59:08.532073Z","iopub.execute_input":"2024-01-01T14:59:08.532927Z","iopub.status.idle":"2024-01-01T15:06:56.850495Z","shell.execute_reply.started":"2024-01-01T14:59:08.532890Z","shell.execute_reply":"2024-01-01T15:06:56.849347Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"preds.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:08:43.065216Z","iopub.execute_input":"2024-01-01T15:08:43.065711Z","iopub.status.idle":"2024-01-01T15:08:43.071901Z","shell.execute_reply.started":"2024-01-01T15:08:43.065676Z","shell.execute_reply":"2024-01-01T15:08:43.071037Z"},"trusted":true},"execution_count":37,"outputs":[{"execution_count":37,"output_type":"execute_result","data":{"text/plain":"(31915, 6)"},"metadata":{}}]},{"cell_type":"code","source":"y_test","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:08:55.045644Z","iopub.execute_input":"2024-01-01T15:08:55.046555Z","iopub.status.idle":"2024-01-01T15:08:55.052833Z","shell.execute_reply.started":"2024-01-01T15:08:55.046521Z","shell.execute_reply":"2024-01-01T15:08:55.051817Z"},"trusted":true},"execution_count":38,"outputs":[{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0],\n ...,\n [0, 0, 0, 0, 0, 0],\n [1, 0, 1, 0, 1, 0],\n [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.metrics import roc_curve","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:09:16.517466Z","iopub.execute_input":"2024-01-01T15:09:16.518374Z","iopub.status.idle":"2024-01-01T15:09:16.522589Z","shell.execute_reply.started":"2024-01-01T15:09:16.518339Z","shell.execute_reply":"2024-01-01T15:09:16.521397Z"},"trusted":true},"execution_count":39,"outputs":[]},{"cell_type":"code","source":"label_threshold = []\nsigmoid_preds = sigmoid(preds)\nfor i in range(y_test.shape[1]):\n fpr, tpr, thresholds = roc_curve(y_test[:, i], sigmoid_preds[:, i])\n j = tpr - fpr\n idx = np.argmax(j)\n best_threshold = thresholds[idx]\n label_threshold.append(best_threshold)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:15:15.956147Z","iopub.execute_input":"2024-01-01T15:15:15.956576Z","iopub.status.idle":"2024-01-01T15:15:15.999900Z","shell.execute_reply.started":"2024-01-01T15:15:15.956550Z","shell.execute_reply":"2024-01-01T15:15:15.999028Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"label_threshold","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:15:17.282955Z","iopub.execute_input":"2024-01-01T15:15:17.283495Z","iopub.status.idle":"2024-01-01T15:15:17.289870Z","shell.execute_reply.started":"2024-01-01T15:15:17.283462Z","shell.execute_reply":"2024-01-01T15:15:17.288876Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"[0.5054522, 0.1555657, 0.5025445, 0.17308293, 0.48295122, 0.07352413]"},"metadata":{}}]},{"cell_type":"markdown","source":"QUANTIZATION","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:16:12.511941Z","iopub.execute_input":"2024-01-01T15:16:12.512377Z","iopub.status.idle":"2024-01-01T15:16:12.558722Z","shell.execute_reply.started":"2024-01-01T15:16:12.512345Z","shell.execute_reply":"2024-01-01T15:16:12.557405Z"}}},{"cell_type":"code","source":"import pathlib\nconverter = tf.lite.TFLiteConverter.from_keras_model(tf_model)\nconverter.optimizations = [tf.lite.Optimize.DEFAULT]\ntflite_model = converter.convert()\n\ntflite_models_dir = pathlib.Path(os.path.join(\"tflite_models\"))\ntflite_models_dir.mkdir(exist_ok=True, parents=True)\ntflite_model_file = tflite_models_dir/\"toxic_comment_classifier_hf_distilbert.tflite\"\ntflite_model_file.write_bytes(tflite_model)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:55.945400Z","iopub.execute_input":"2024-01-01T17:50:55.946058Z","iopub.status.idle":"2024-01-01T17:51:59.957942Z","shell.execute_reply.started":"2024-01-01T17:50:55.946024Z","shell.execute_reply":"2024-01-01T17:51:59.956990Z"},"trusted":true},"execution_count":49,"outputs":[{"execution_count":49,"output_type":"execute_result","data":{"text/plain":"68543400"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"rb\") as model_file_obj:\n text_preprocessor, class_names = cloudpickle.load(model_file_obj)\n \ninterpreter = tf.lite.Interpreter(model_path=os.path.join(\"tflite_models\", \"toxic_comment_classifier_hf_distilbert.tflite\"))\n","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:52:04.252505Z","iopub.execute_input":"2024-01-01T17:52:04.253263Z","iopub.status.idle":"2024-01-01T17:52:04.261732Z","shell.execute_reply.started":"2024-01-01T17:52:04.253214Z","shell.execute_reply":"2024-01-01T17:52:04.260705Z"},"trusted":true},"execution_count":51,"outputs":[]},{"cell_type":"code","source":"def inference(text):\n text = text_preprocessor.preprocess(pd.Series(text))[0]\n \n model_checkpoint = \"distilbert-base-uncased\"\n tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)\n tokens = tokenizer(text, max_length=512, padding=\"max_length\", truncation=True, return_tensors=\"tf\")\n \n # tflite model inference \n interpreter.allocate_tensors()\n input_details = interpreter.get_input_details()\n output_details = interpreter.get_output_details()[0]\n attention_mask, input_ids = tokens['attention_mask'], tokens['input_ids']\n interpreter.set_tensor(input_details[0][\"index\"], attention_mask)\n interpreter.set_tensor(input_details[1][\"index\"], input_ids)\n interpreter.invoke()\n tflite_logits = interpreter.get_tensor(output_details[\"index\"])[0]\n tflite_pred = sigmoid(tflite_logits)\n result_df = pd.DataFrame({'class': class_names, 'prob': tflite_pred})\n result_df.sort_values(by='prob', ascending=True, inplace=True)\n return result_df","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:52:06.735831Z","iopub.execute_input":"2024-01-01T17:52:06.736217Z","iopub.status.idle":"2024-01-01T17:52:06.744507Z","shell.execute_reply.started":"2024-01-01T17:52:06.736184Z","shell.execute_reply":"2024-01-01T17:52:06.743562Z"},"trusted":true},"execution_count":52,"outputs":[]},{"cell_type":"code","source":"","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:53:28.258034Z","iopub.execute_input":"2024-01-01T17:53:28.258950Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}