File size: 80,587 Bytes
162ccbe
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7296872,"sourceType":"datasetVersion","datasetId":4232592},{"sourceId":7296911,"sourceType":"datasetVersion","datasetId":4232619},{"sourceId":7318087,"sourceType":"datasetVersion","datasetId":4246746}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install tensorflow==2.10","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport string\nfrom unidecode import unidecode\nimport tensorflow as tf \nfrom sklearn.utils import class_weight\nfrom tensorflow.keras.utils import to_categorical\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nimport cloudpickle\nimport os\nfrom transformers import DistilBertTokenizerFast\nfrom transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.callbacks import EarlyStopping","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TextPreprocessor:\n    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,\n                 remove_stop_words: bool = False,\n                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,\n                 bottom_p: float = None):\n        self.remove_punct = remove_punct\n        self.remove_digits = remove_digits\n        self.remove_stop_words = remove_stop_words\n        self.remove_short_words = remove_short_words\n        self.minlen = minlen\n        self.maxlen = maxlen\n        self.top_p = top_p\n        self.bottom_p = bottom_p\n        self.words_to_remove = []\n        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',\n                           'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',\n                           'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',\n                           'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',\n                           'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',\n                           'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',\n                           'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',\n                           'into', 'through', 'during', 'before', 'after', 'to', 'from',\n                           'in', 'out', 'on', 'off', 'further', 'then', 'once',\n                           'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',\n                           'other', 'such', 'own', 'same', 'so', 'than', 'can', 'will', 'should','now']\n\n        self.contraction_to_expansion = {\"ain't\": \"am not\",\n                                         \"aren't\": \"are not\",\n                                         \"can't\": \"cannot\",\n                                         \"can't've\": \"cannot have\",\n                                         \"'cause\": \"because\",\n                                         \"could've\": \"could have\",\n                                         \"couldn't\": \"could not\",\n                                         \"couldn't've\": \"could not have\",\n                                         \"didn't\": \"did not\",\n                                         \"doesn't\": \"does not\",\n                                         \"don't\": \"do not\",\n                                         \"hadn't\": \"had not\",\n                                         \"hadn't've\": \"had not have\",\n                                         \"hasn't\": \"has not\",\n                                         \"haven't\": \"have not\",\n                                         \"he'd\": \"he would\",\n                                         \"he'd've\": \"he would have\",\n                                         \"he'll\": \"he will\",\n                                         \"he'll've\": \"he will have\",\n                                         \"he's\": \"he is\",\n                                         \"how'd\": \"how did\",\n                                         \"how'd'y\": \"how do you\",\n                                         \"how'll\": \"how will\",\n                                         \"how's\": \"how is\",\n                                         \"i'd\": \"i would\",\n                                         \"i'd've\": \"i would have\",\n                                         \"i'll\": \"i will\",\n                                         \"i'll've\": \"i will have\",\n                                         \"i'm\": \"i am\",\n                                         \"i've\": \"i have\",\n                                         \"isn't\": \"is not\",\n                                         \"it'd\": \"it had\",\n                                         \"it'd've\": \"it would have\",\n                                         \"it'll\": \"it will\",\n                                         \"it'll've\": \"it will have\",\n                                         \"it's\": \"it is\",\n                                         \"let's\": \"let us\",\n                                         \"ma'am\": \"madam\",\n                                         \"mayn't\": \"may not\",\n                                         \"might've\": \"might have\",\n                                         \"mightn't\": \"might not\",\n                                         \"mightn't've\": \"might not have\",\n                                         \"must've\": \"must have\",\n                                         \"mustn't\": \"must not\",\n                                         \"mustn't've\": \"must not have\",\n                                         \"needn't\": \"need not\",\n                                         \"needn't've\": \"need not have\",\n                                         \"o'clock\": \"of the clock\",\n                                         \"oughtn't\": \"ought not\",\n                                         \"oughtn't've\": \"ought not have\",\n                                         \"shan't\": \"shall not\",\n                                         \"sha'n't\": \"shall not\",\n                                         \"shan't've\": \"shall not have\",\n                                         \"she'd\": \"she would\",\n                                         \"she'd've\": \"she would have\",\n                                         \"she'll\": \"she will\",\n                                         \"she'll've\": \"she will have\",\n                                         \"she's\": \"she is\",\n                                         \"should've\": \"should have\",\n                                         \"shouldn't\": \"should not\",\n                                         \"shouldn't've\": \"should not have\",\n                                         \"so've\": \"so have\",\n                                         \"so's\": \"so is\",\n                                         \"that'd\": \"that would\",\n                                         \"that'd've\": \"that would have\",\n                                         \"that's\": \"that is\",\n                                         \"there'd\": \"there had\",\n                                         \"there'd've\": \"there would have\",\n                                         \"there's\": \"there is\",\n                                         \"they'd\": \"they would\",\n                                         \"they'd've\": \"they would have\",\n                                         \"they'll\": \"they will\",\n                                         \"they'll've\": \"they will have\",\n                                         \"they're\": \"they are\",\n                                         \"they've\": \"they have\",\n                                         \"to've\": \"to have\",\n                                         \"wasn't\": \"was not\",\n                                         \"we'd\": \"we had\",\n                                         \"we'd've\": \"we would have\",\n                                         \"we'll\": \"we will\",\n                                         \"we'll've\": \"we will have\",\n                                         \"we're\": \"we are\",\n                                         \"we've\": \"we have\",\n                                         \"weren't\": \"were not\",\n                                         \"what'll\": \"what will\",\n                                         \"what'll've\": \"what will have\",\n                                         \"what're\": \"what are\",\n                                         \"what's\": \"what is\",\n                                         \"what've\": \"what have\",\n                                         \"when's\": \"when is\",\n                                         \"when've\": \"when have\",\n                                         \"where'd\": \"where did\",\n                                         \"where's\": \"where is\",\n                                         \"where've\": \"where have\",\n                                         \"who'll\": \"who will\",\n                                         \"who'll've\": \"who will have\",\n                                         \"who's\": \"who is\",\n                                         \"who've\": \"who have\",\n                                         \"why's\": \"why is\",\n                                         \"why've\": \"why have\",\n                                         \"will've\": \"will have\",\n                                         \"won't\": \"will not\",\n                                         \"won't've\": \"will not have\",\n                                         \"would've\": \"would have\",\n                                         \"wouldn't\": \"would not\",\n                                         \"wouldn't've\": \"would not have\",\n                                         \"y'all\": \"you all\",\n                                         \"y'alls\": \"you alls\",\n                                         \"y'all'd\": \"you all would\",\n                                         \"y'all'd've\": \"you all would have\",\n                                         \"y'all're\": \"you all are\",\n                                         \"y'all've\": \"you all have\",\n                                         \"you'd\": \"you had\",\n                                         \"you'd've\": \"you would have\",\n                                         \"you'll\": \"you you will\",\n                                         \"you'll've\": \"you you will have\",\n                                         \"you're\": \"you are\",\n                                         \"you've\": \"you have\"\n                                         }\n\n    @staticmethod\n    def __remove_double_whitespaces(string: str):\n        return \" \".join(string.split())\n\n    def __remove_url(self, string_series: pd.Series):\n        \"\"\"\n        Removes URLs m text\n        :param string_series: pd.Series, input string series\n        :return: pd.Series, cleaned string series\n        \"\"\"\n        clean_string_series = string_series.str.replace(\n            pat=r\"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})\",\n            repl=\" \", regex=True)\n        return clean_string_series.map(self.__remove_double_whitespaces)\n\n    def __expand(self, string_series: pd.Series):\n        \"\"\"\n        Replaces contractions with expansions. eg. don't wit do not.\n        :param string_series: pd.Series, input string series\n        :return: pd.Series, cleaned string series\n        \"\"\"\n        clean_string_series = string_series.copy()\n        for c, e in self.contraction_to_expansion.items():\n            clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False)\n        return clean_string_series.map(self.__remove_double_whitespaces)\n\n    def __remove_punct(self, string_series: pd.Series):\n        \"\"\"\n       Removes punctuations from the input string.\n       :param string_series: pd.Series, input string series\n       :return: pd.Series, cleaned string series\n       \"\"\"\n        clean_string_series = string_series.copy()\n        puncts = [r'\\n', r'\\r', r'\\t']\n        puncts.extend(list(string.punctuation))\n        for i in puncts:\n            clean_string_series = clean_string_series.str.replace(pat=i, repl=\" \", regex=False)\n        return clean_string_series.map(self.__remove_double_whitespaces)\n\n    def __remove_digits(self, string_series: pd.Series):\n        \"\"\"\n       Removes digits from the input string.\n       :param string_series: pd.Series, input string series\n       :return: pd.Series, cleaned string series\n       \"\"\"\n        clean_string_series = string_series.str.replace(pat=r'\\d', repl=\" \", regex=True)\n        return clean_string_series.map(self.__remove_double_whitespaces)\n\n    @staticmethod\n    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):\n        \"\"\"\n        Reomves words/tokens where minlen <= len <= maxlen.\n        :param string_series: pd.Series, input string series\n        :param minlen: int, minimum length of token to be removed.\n        :param maxlen:  int, maximum length of token to be removed.\n        :return: pd.Series, cleaned string series\n        \"\"\"\n        clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split() if\n                                                                         (len(word) > maxlen) or (len(word) < minlen)]))\n        return clean_string_series\n\n    def __remove_stop_words(self, string_series: pd.Series):\n        \"\"\"\n       Removes stop words from the input string.\n       :param string_series: pd.Series, input string series\n       :return: pd.Series, cleaned string series\n       \"\"\"\n        def str_remove_stop_words(string: str):\n            stops = self.stop_words\n            return \" \".join([token for token in string.split() if token not in stops])\n\n        return string_series.map(str_remove_stop_words)\n\n    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,\n                                  bottom_p: int = None, dataset: str = 'train'):\n        \"\"\"\n        Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.\n        :param string_series: pd.Series, input string series\n        :param top_p: float, percent of frequent words to remove.\n        :param bottom_p: float, percent of rare words to remove.\n        :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n        :return: pd.Series, cleaned string series\n        \"\"\"\n        if dataset == 'train':\n            if top_p is None:\n                top_p = 0\n            if bottom_p is None:\n                bottom_p = 0\n\n            if top_p > 0 or bottom_p > 0:\n                word_freq = pd.Series(\" \".join(string_series).split()).value_counts()\n                n_words = len(word_freq)\n\n            if top_p > 0:\n                self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])\n\n            if bottom_p > 0:\n                self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])\n\n        if len(self.words_to_remove) == 0:\n            return string_series\n        else:\n            clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split()\n                                                                             if word not in self.words_to_remove]))\n            return clean_string_series\n\n    def preprocess(self, string_series: pd.Series, dataset: str = \"train\"):\n        \"\"\"\n        Entry point.\n        :param string_series: pd.Series, input string series\n        :param dataset: str, \"train\" for training set, \"tesrt\" for val/dev/test set.\n        :return: pd.Series, cleaned string series\n        \"\"\"\n        string_series = string_series.str.lower()\n        string_series = string_series.map(unidecode)\n        string_series = self.__remove_url(string_series=string_series)\n        string_series = self.__expand(string_series=string_series)\n\n        if self.remove_punct:\n            string_series = self.__remove_punct(string_series=string_series)\n        if self.remove_digits:\n            string_series = self.__remove_digits(string_series=string_series)\n        if self.remove_stop_words:\n            string_series = self.__remove_stop_words(string_series=string_series)\n        if self.remove_short_words:\n            string_series = self.__remove_short_words(string_series=string_series,\n                                                      minlen=self.minlen,\n                                                      maxlen=self.maxlen)\n        string_series = self.__remove_top_bottom_words(string_series=string_series,\n                                                       top_p=self.top_p,\n                                                       bottom_p=self.bottom_p, dataset=dataset)\n\n        string_series = string_series.str.strip()\n        string_series.replace(to_replace=\"\", value=\"this is an empty message\", inplace=True)\n\n        return string_series","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:08.561739Z","iopub.execute_input":"2024-01-01T15:47:08.562416Z","iopub.status.idle":"2024-01-01T15:47:08.604271Z","shell.execute_reply.started":"2024-01-01T15:47:08.562388Z","shell.execute_reply":"2024-01-01T15:47:08.603034Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"data = pd.read_csv('train.csv')","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:08.606763Z","iopub.execute_input":"2024-01-01T15:47:08.607049Z","iopub.status.idle":"2024-01-01T15:47:10.359700Z","shell.execute_reply.started":"2024-01-01T15:47:08.607024Z","shell.execute_reply":"2024-01-01T15:47:10.358885Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.360758Z","iopub.execute_input":"2024-01-01T15:47:10.361041Z","iopub.status.idle":"2024-01-01T15:47:10.387262Z","shell.execute_reply.started":"2024-01-01T15:47:10.361017Z","shell.execute_reply":"2024-01-01T15:47:10.386476Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"                      id                                       comment_text  \\\n0       0000997932d777bf  Explanation\\nWhy the edits made under my usern...   \n1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   \n2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   \n3       0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...   \n4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   \n...                  ...                                                ...   \n159566  ffe987279560d7ff  \":::::And for the second time of asking, when ...   \n159567  ffea4adeee384e90  You should be ashamed of yourself \\n\\nThat is ...   \n159568  ffee36eab5c267c9  Spitzer \\n\\nUmm, theres no actual article for ...   \n159569  fff125370e4aaaf3  And it looks like it was actually you who put ...   \n159570  fff46fc426af1f9a  \"\\nAnd ... I really don't think you understand...   \n\n        toxic  severe_toxic  obscene  threat  insult  identity_hate  \n0           0             0        0       0       0              0  \n1           0             0        0       0       0              0  \n2           0             0        0       0       0              0  \n3           0             0        0       0       0              0  \n4           0             0        0       0       0              0  \n...       ...           ...      ...     ...     ...            ...  \n159566      0             0        0       0       0              0  \n159567      0             0        0       0       0              0  \n159568      0             0        0       0       0              0  \n159569      0             0        0       0       0              0  \n159570      0             0        0       0       0              0  \n\n[159571 rows x 8 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0000997932d777bf</td>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>000103f0d9cfb60f</td>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>000113f07ec002fd</td>\n      <td>Hey man, I'm really not trying to edit war. It...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0001b41b1c6bb37e</td>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0001d958c54c6e35</td>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>159566</th>\n      <td>ffe987279560d7ff</td>\n      <td>\":::::And for the second time of asking, when ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159567</th>\n      <td>ffea4adeee384e90</td>\n      <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159568</th>\n      <td>ffee36eab5c267c9</td>\n      <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159569</th>\n      <td>fff125370e4aaaf3</td>\n      <td>And it looks like it was actually you who put ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159570</th>\n      <td>fff46fc426af1f9a</td>\n      <td>\"\\nAnd ... I really don't think you understand...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>159571 rows × 8 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data.iloc[:, 2:].apply(np.mean)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.388359Z","iopub.execute_input":"2024-01-01T15:47:10.388620Z","iopub.status.idle":"2024-01-01T15:47:10.409686Z","shell.execute_reply.started":"2024-01-01T15:47:10.388597Z","shell.execute_reply":"2024-01-01T15:47:10.408904Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"toxic            0.095844\nsevere_toxic     0.009996\nobscene          0.052948\nthreat           0.002996\ninsult           0.049364\nidentity_hate    0.008805\ndtype: float64"},"metadata":{}}]},{"cell_type":"code","source":"data.loc[data['toxic']==1, 'comment_text']","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.410789Z","iopub.execute_input":"2024-01-01T15:47:10.411391Z","iopub.status.idle":"2024-01-01T15:47:10.420370Z","shell.execute_reply.started":"2024-01-01T15:47:10.411356Z","shell.execute_reply":"2024-01-01T15:47:10.419483Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"6              COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK\n12        Hey... what is it..\\n@ | talk .\\nWhat is it......\n16        Bye! \\n\\nDon't look, come or think of comming ...\n42        You are gay or antisemmitian? \\n\\nArchangel WH...\n43                 FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\n                                ...                        \n159494    \"\\n\\n our previous conversation \\n\\nyou fuckin...\n159514                    YOU ARE A MISCHIEVIOUS PUBIC HAIR\n159541    Your absurd edits \\n\\nYour absurd edits on gre...\n159546    \"\\n\\nHey listen don't you ever!!!! Delete my e...\n159554    and i'm going to keep posting the stuff u dele...\nName: comment_text, Length: 15294, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"data.drop(columns='id', inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.421480Z","iopub.execute_input":"2024-01-01T15:47:10.421751Z","iopub.status.idle":"2024-01-01T15:47:10.446263Z","shell.execute_reply.started":"2024-01-01T15:47:10.421727Z","shell.execute_reply":"2024-01-01T15:47:10.445444Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.447467Z","iopub.execute_input":"2024-01-01T15:47:10.448193Z","iopub.status.idle":"2024-01-01T15:47:10.461011Z","shell.execute_reply.started":"2024-01-01T15:47:10.448136Z","shell.execute_reply":"2024-01-01T15:47:10.460160Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"                                             comment_text  toxic  \\\n0       Explanation\\nWhy the edits made under my usern...      0   \n1       D'aww! He matches this background colour I'm s...      0   \n2       Hey man, I'm really not trying to edit war. It...      0   \n3       \"\\nMore\\nI can't make any real suggestions on ...      0   \n4       You, sir, are my hero. Any chance you remember...      0   \n...                                                   ...    ...   \n159566  \":::::And for the second time of asking, when ...      0   \n159567  You should be ashamed of yourself \\n\\nThat is ...      0   \n159568  Spitzer \\n\\nUmm, theres no actual article for ...      0   \n159569  And it looks like it was actually you who put ...      0   \n159570  \"\\nAnd ... I really don't think you understand...      0   \n\n        severe_toxic  obscene  threat  insult  identity_hate  \n0                  0        0       0       0              0  \n1                  0        0       0       0              0  \n2                  0        0       0       0              0  \n3                  0        0       0       0              0  \n4                  0        0       0       0              0  \n...              ...      ...     ...     ...            ...  \n159566             0        0       0       0              0  \n159567             0        0       0       0              0  \n159568             0        0       0       0              0  \n159569             0        0       0       0              0  \n159570             0        0       0       0              0  \n\n[159571 rows x 7 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>comment_text</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Hey man, I'm really not trying to edit war. It...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>159566</th>\n      <td>\":::::And for the second time of asking, when ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159567</th>\n      <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159568</th>\n      <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159569</th>\n      <td>And it looks like it was actually you who put ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159570</th>\n      <td>\"\\nAnd ... I really don't think you understand...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>159571 rows × 7 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data.rename(columns={'comment_text': 'text'}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.464669Z","iopub.execute_input":"2024-01-01T15:47:10.464933Z","iopub.status.idle":"2024-01-01T15:47:10.472614Z","shell.execute_reply.started":"2024-01-01T15:47:10.464902Z","shell.execute_reply":"2024-01-01T15:47:10.471928Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"data.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.473840Z","iopub.execute_input":"2024-01-01T15:47:10.474531Z","iopub.status.idle":"2024-01-01T15:47:10.486073Z","shell.execute_reply.started":"2024-01-01T15:47:10.474501Z","shell.execute_reply":"2024-01-01T15:47:10.485243Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"(159571, 7)"},"metadata":{}}]},{"cell_type":"code","source":"data.dtypes","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.486990Z","iopub.execute_input":"2024-01-01T15:47:10.487272Z","iopub.status.idle":"2024-01-01T15:47:10.499292Z","shell.execute_reply.started":"2024-01-01T15:47:10.487249Z","shell.execute_reply":"2024-01-01T15:47:10.498422Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"text             object\ntoxic             int64\nsevere_toxic      int64\nobscene           int64\nthreat            int64\ninsult            int64\nidentity_hate     int64\ndtype: object"},"metadata":{}}]},{"cell_type":"code","source":"# data.drop(columns='categories', inplace=True)\ndata.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.500596Z","iopub.execute_input":"2024-01-01T15:47:10.500858Z","iopub.status.idle":"2024-01-01T15:47:10.534055Z","shell.execute_reply.started":"2024-01-01T15:47:10.500835Z","shell.execute_reply":"2024-01-01T15:47:10.533213Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.535327Z","iopub.execute_input":"2024-01-01T15:47:10.535998Z","iopub.status.idle":"2024-01-01T15:47:10.550373Z","shell.execute_reply.started":"2024-01-01T15:47:10.535965Z","shell.execute_reply":"2024-01-01T15:47:10.549243Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"                                                     text  toxic  \\\n0       Explanation\\nWhy the edits made under my usern...      0   \n1       D'aww! He matches this background colour I'm s...      0   \n2       Hey man, I'm really not trying to edit war. It...      0   \n3       \"\\nMore\\nI can't make any real suggestions on ...      0   \n4       You, sir, are my hero. Any chance you remember...      0   \n...                                                   ...    ...   \n159566  \":::::And for the second time of asking, when ...      0   \n159567  You should be ashamed of yourself \\n\\nThat is ...      0   \n159568  Spitzer \\n\\nUmm, theres no actual article for ...      0   \n159569  And it looks like it was actually you who put ...      0   \n159570  \"\\nAnd ... I really don't think you understand...      0   \n\n        severe_toxic  obscene  threat  insult  identity_hate  \n0                  0        0       0       0              0  \n1                  0        0       0       0              0  \n2                  0        0       0       0              0  \n3                  0        0       0       0              0  \n4                  0        0       0       0              0  \n...              ...      ...     ...     ...            ...  \n159566             0        0       0       0              0  \n159567             0        0       0       0              0  \n159568             0        0       0       0              0  \n159569             0        0       0       0              0  \n159570             0        0       0       0              0  \n\n[159571 rows x 7 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>text</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Hey man, I'm really not trying to edit war. It...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>159566</th>\n      <td>\":::::And for the second time of asking, when ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159567</th>\n      <td>You should be ashamed of yourself \\n\\nThat is ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159568</th>\n      <td>Spitzer \\n\\nUmm, theres no actual article for ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159569</th>\n      <td>And it looks like it was actually you who put ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>159570</th>\n      <td>\"\\nAnd ... I really don't think you understand...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>159571 rows × 7 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.551372Z","iopub.execute_input":"2024-01-01T15:47:10.551662Z","iopub.status.idle":"2024-01-01T15:47:10.560442Z","shell.execute_reply.started":"2024-01-01T15:47:10.551631Z","shell.execute_reply":"2024-01-01T15:47:10.559694Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"\"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\""},"metadata":{}}]},{"cell_type":"code","source":"CLASS_NAMES = [*data.columns][1:]\nprint(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.561389Z","iopub.execute_input":"2024-01-01T15:47:10.561696Z","iopub.status.idle":"2024-01-01T15:47:10.570269Z","shell.execute_reply.started":"2024-01-01T15:47:10.561673Z","shell.execute_reply":"2024-01-01T15:47:10.569424Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stdout","text":"['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n","output_type":"stream"}]},{"cell_type":"code","source":"tp = TextPreprocessor()\ndata['text'] = tp.preprocess(data['text'])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:10.571497Z","iopub.execute_input":"2024-01-01T15:47:10.571829Z","iopub.status.idle":"2024-01-01T15:47:46.305794Z","shell.execute_reply.started":"2024-01-01T15:47:10.571799Z","shell.execute_reply":"2024-01-01T15:47:46.305002Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"data['text'][2]","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.306763Z","iopub.execute_input":"2024-01-01T15:47:46.307015Z","iopub.status.idle":"2024-01-01T15:47:46.312867Z","shell.execute_reply.started":"2024-01-01T15:47:46.306993Z","shell.execute_reply":"2024-01-01T15:47:46.311963Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"'hey man i am really not trying to edit war it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info'"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"wb\") as model_file_obj:\n    cloudpickle.dump((tp, CLASS_NAMES), model_file_obj)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.314062Z","iopub.execute_input":"2024-01-01T15:47:46.314481Z","iopub.status.idle":"2024-01-01T15:47:46.332489Z","shell.execute_reply.started":"2024-01-01T15:47:46.314449Z","shell.execute_reply":"2024-01-01T15:47:46.331749Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"x = data['text']\ny = data.drop(columns='text').values.copy()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.333549Z","iopub.execute_input":"2024-01-01T15:47:46.333811Z","iopub.status.idle":"2024-01-01T15:47:46.440198Z","shell.execute_reply.started":"2024-01-01T15:47:46.333789Z","shell.execute_reply":"2024-01-01T15:47:46.439230Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"x","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.441552Z","iopub.execute_input":"2024-01-01T15:47:46.442022Z","iopub.status.idle":"2024-01-01T15:47:46.450997Z","shell.execute_reply.started":"2024-01-01T15:47:46.441987Z","shell.execute_reply":"2024-01-01T15:47:46.450118Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"0         explanation why the edits made under my userna...\n1         d aww he matches this background colour i am s...\n2         hey man i am really not trying to edit war it ...\n3         more i cannot make any real suggestions on imp...\n4         you sir are my hero any chance you remember wh...\n                                ...                        \n159566    and for the second time of asking when your vi...\n159567    you should be ashamed of yourself that is a ho...\n159568    spitzer umm theres no actual article for prost...\n159569    and it looks like it was actually you who put ...\n159570    and i really do not think you understand i cam...\nName: text, Length: 159571, dtype: object"},"metadata":{}}]},{"cell_type":"code","source":"y","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.452005Z","iopub.execute_input":"2024-01-01T15:47:46.452267Z","iopub.status.idle":"2024-01-01T15:47:46.464143Z","shell.execute_reply.started":"2024-01-01T15:47:46.452245Z","shell.execute_reply":"2024-01-01T15:47:46.463182Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0],\n       ...,\n       [0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.465279Z","iopub.execute_input":"2024-01-01T15:47:46.465582Z","iopub.status.idle":"2024-01-01T15:47:46.501516Z","shell.execute_reply.started":"2024-01-01T15:47:46.465556Z","shell.execute_reply":"2024-01-01T15:47:46.500757Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"x_train.shape, x_test.shape, y_train.shape, y_test.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.502491Z","iopub.execute_input":"2024-01-01T15:47:46.502756Z","iopub.status.idle":"2024-01-01T15:47:46.508855Z","shell.execute_reply.started":"2024-01-01T15:47:46.502733Z","shell.execute_reply":"2024-01-01T15:47:46.507998Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"((111699,), (47872,), (111699, 6), (47872, 6))"},"metadata":{}}]},{"cell_type":"code","source":"def compute_pos_weight(y_train):\n    num_positives = np.sum(y_train, axis=0)\n    total_examples = y_train.shape[0]\n    class_weights = num_positives / total_examples\n    pos_weight = 1.0 / class_weights    \n    return pos_weight","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.509931Z","iopub.execute_input":"2024-01-01T15:47:46.510222Z","iopub.status.idle":"2024-01-01T15:47:46.520489Z","shell.execute_reply.started":"2024-01-01T15:47:46.510193Z","shell.execute_reply":"2024-01-01T15:47:46.519690Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"POS_WEIGHT = compute_pos_weight(y_train)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.521479Z","iopub.execute_input":"2024-01-01T15:47:46.521709Z","iopub.status.idle":"2024-01-01T15:47:46.535983Z","shell.execute_reply.started":"2024-01-01T15:47:46.521689Z","shell.execute_reply":"2024-01-01T15:47:46.535146Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"POS_WEIGHT","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.536943Z","iopub.execute_input":"2024-01-01T15:47:46.537232Z","iopub.status.idle":"2024-01-01T15:47:46.547418Z","shell.execute_reply.started":"2024-01-01T15:47:46.537200Z","shell.execute_reply":"2024-01-01T15:47:46.546609Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"array([ 10.42746453, 100.72046889,  18.95452231, 326.60526316,\n        20.35331633, 114.79856115])"},"metadata":{}}]},{"cell_type":"code","source":"x_train, x_test = x_train.to_list(), x_test.to_list()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.553940Z","iopub.execute_input":"2024-01-01T15:47:46.554221Z","iopub.status.idle":"2024-01-01T15:47:46.566178Z","shell.execute_reply.started":"2024-01-01T15:47:46.554192Z","shell.execute_reply":"2024-01-01T15:47:46.565562Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"from transformers import DistilBertTokenizerFast","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.567076Z","iopub.execute_input":"2024-01-01T15:47:46.567346Z","iopub.status.idle":"2024-01-01T15:47:46.578372Z","shell.execute_reply.started":"2024-01-01T15:47:46.567324Z","shell.execute_reply":"2024-01-01T15:47:46.577596Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"model_checkpoint = \"distilbert-base-uncased\"\ntokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:46.579486Z","iopub.execute_input":"2024-01-01T15:47:46.579731Z","iopub.status.idle":"2024-01-01T15:47:47.602960Z","shell.execute_reply.started":"2024-01-01T15:47:46.579709Z","shell.execute_reply":"2024-01-01T15:47:47.602058Z"},"trusted":true},"execution_count":30,"outputs":[{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"776eb2a6f16849ddaf5684e67061d1fc"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b0da584f955d44229db4894996450cb6"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a50d28eb67954079a9bdb0edacc58d59"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1765da0409b949338d2c8bfcc818f458"}},"metadata":{}}]},{"cell_type":"code","source":"print(x_train[0])\nprint(tokenizer.tokenize(x_train[0]))\nprint(tokenizer(x_train[0]))","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:47.604295Z","iopub.execute_input":"2024-01-01T15:47:47.605015Z","iopub.status.idle":"2024-01-01T15:47:47.617275Z","shell.execute_reply.started":"2024-01-01T15:47:47.604973Z","shell.execute_reply":"2024-01-01T15:47:47.616261Z"},"trusted":true},"execution_count":31,"outputs":[{"name":"stdout","text":"conflict of interest note by your user name it appears that you represent a company or organization please read our conflict of interest guidelines as well as our faq for businesses we welcome your contributions here but please refrain from writing about your own company s services and personnel thanks and happy editing speaketh\n['conflict', 'of', 'interest', 'note', 'by', 'your', 'user', 'name', 'it', 'appears', 'that', 'you', 'represent', 'a', 'company', 'or', 'organization', 'please', 'read', 'our', 'conflict', 'of', 'interest', 'guidelines', 'as', 'well', 'as', 'our', 'fa', '##q', 'for', 'businesses', 'we', 'welcome', 'your', 'contributions', 'here', 'but', 'please', 'refrain', 'from', 'writing', 'about', 'your', 'own', 'company', 's', 'services', 'and', 'personnel', 'thanks', 'and', 'happy', 'editing', 'speak', '##eth']\n{'input_ids': [101, 4736, 1997, 3037, 3602, 2011, 2115, 5310, 2171, 2009, 3544, 2008, 2017, 5050, 1037, 2194, 2030, 3029, 3531, 3191, 2256, 4736, 1997, 3037, 11594, 2004, 2092, 2004, 2256, 6904, 4160, 2005, 5661, 2057, 6160, 2115, 5857, 2182, 2021, 3531, 20703, 2013, 3015, 2055, 2115, 2219, 2194, 1055, 2578, 1998, 5073, 4283, 1998, 3407, 9260, 3713, 11031, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n","output_type":"stream"}]},{"cell_type":"code","source":"strategy = tf.distribute.MirroredStrategy()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:47.618802Z","iopub.execute_input":"2024-01-01T15:47:47.619765Z","iopub.status.idle":"2024-01-01T15:47:55.587212Z","shell.execute_reply.started":"2024-01-01T15:47:47.619728Z","shell.execute_reply":"2024-01-01T15:47:55.586212Z"},"trusted":true},"execution_count":32,"outputs":[]},{"cell_type":"code","source":"BATCH_SIZE = 25 * strategy.num_replicas_in_sync\nN_TOKENS = 512\nN_CLASSES = len(CLASS_NAMES)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:55.588435Z","iopub.execute_input":"2024-01-01T15:47:55.588798Z","iopub.status.idle":"2024-01-01T15:47:55.593649Z","shell.execute_reply.started":"2024-01-01T15:47:55.588765Z","shell.execute_reply":"2024-01-01T15:47:55.592717Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"train_tokens = tokenizer(x_train, max_length=N_TOKENS, padding=\"max_length\", truncation=True, return_tensors=\"tf\", return_attention_mask=True, return_token_type_ids=False)\ntest_tokens = tokenizer(x_test, max_length=N_TOKENS, padding=\"max_length\", truncation=True, return_tensors=\"tf\", return_attention_mask=True, return_token_type_ids=False)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:47:55.594800Z","iopub.execute_input":"2024-01-01T15:47:55.595144Z","iopub.status.idle":"2024-01-01T15:48:29.985372Z","shell.execute_reply.started":"2024-01-01T15:47:55.595103Z","shell.execute_reply":"2024-01-01T15:48:29.981647Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"train_tokens[:5]","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:29.986582Z","iopub.execute_input":"2024-01-01T15:48:29.986856Z","iopub.status.idle":"2024-01-01T15:48:29.993130Z","shell.execute_reply.started":"2024-01-01T15:48:29.986834Z","shell.execute_reply":"2024-01-01T15:48:29.991952Z"},"trusted":true},"execution_count":35,"outputs":[{"execution_count":35,"output_type":"execute_result","data":{"text/plain":"[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),\n Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]"},"metadata":{}}]},{"cell_type":"code","source":"# train_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), y_train, sample_weight_param))\ntrain_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), y_train))\ntest_tf_data = tf.data.Dataset.from_tensor_slices((dict(test_tokens), y_test))","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:29.994271Z","iopub.execute_input":"2024-01-01T15:48:29.994556Z","iopub.status.idle":"2024-01-01T15:48:30.021302Z","shell.execute_reply.started":"2024-01-01T15:48:29.994523Z","shell.execute_reply":"2024-01-01T15:48:30.020402Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"del(data)\ndel(train_tokens)\ndel(test_tokens)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:30.022390Z","iopub.execute_input":"2024-01-01T15:48:30.022648Z","iopub.status.idle":"2024-01-01T15:48:33.697100Z","shell.execute_reply.started":"2024-01-01T15:48:30.022625Z","shell.execute_reply":"2024-01-01T15:48:33.696199Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"train_tf_data=train_tf_data.prefetch(tf.data.AUTOTUNE)\ntest_tf_data=test_tf_data.prefetch(tf.data.AUTOTUNE)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:33.698305Z","iopub.execute_input":"2024-01-01T15:48:33.698646Z","iopub.status.idle":"2024-01-01T15:48:33.717063Z","shell.execute_reply.started":"2024-01-01T15:48:33.698619Z","shell.execute_reply":"2024-01-01T15:48:33.716317Z"},"trusted":true},"execution_count":38,"outputs":[]},{"cell_type":"code","source":"for i in train_tf_data.take(1):\n    print(i)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:33.718122Z","iopub.execute_input":"2024-01-01T15:48:33.718456Z","iopub.status.idle":"2024-01-01T15:48:33.807405Z","shell.execute_reply.started":"2024-01-01T15:48:33.718432Z","shell.execute_reply":"2024-01-01T15:48:33.806366Z"},"trusted":true},"execution_count":39,"outputs":[{"name":"stdout","text":"({'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=\narray([  101,  4736,  1997,  3037,  3602,  2011,  2115,  5310,  2171,\n        2009,  3544,  2008,  2017,  5050,  1037,  2194,  2030,  3029,\n        3531,  3191,  2256,  4736,  1997,  3037, 11594,  2004,  2092,\n        2004,  2256,  6904,  4160,  2005,  5661,  2057,  6160,  2115,\n        5857,  2182,  2021,  3531, 20703,  2013,  3015,  2055,  2115,\n        2219,  2194,  1055,  2578,  1998,  5073,  4283,  1998,  3407,\n        9260,  3713, 11031,   102,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0,     0,\n           0,     0,     0,     0,     0,     0,     0,     0],\n      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(512,), dtype=int32, numpy=\narray([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0], dtype=int32)>}, <tf.Tensor: shape=(6,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0])>)\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:42.885276Z","iopub.execute_input":"2024-01-01T15:48:42.886195Z","iopub.status.idle":"2024-01-01T15:48:42.890691Z","shell.execute_reply.started":"2024-01-01T15:48:42.886141Z","shell.execute_reply":"2024-01-01T15:48:42.889813Z"},"trusted":true},"execution_count":40,"outputs":[]},{"cell_type":"code","source":"config = DistilBertConfig.from_pretrained(model_checkpoint, output_hidden_states=False)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:43.619284Z","iopub.execute_input":"2024-01-01T15:48:43.620217Z","iopub.status.idle":"2024-01-01T15:48:43.698896Z","shell.execute_reply.started":"2024-01-01T15:48:43.620159Z","shell.execute_reply":"2024-01-01T15:48:43.697941Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"def weighted_binary_crossentropy(y_true, y_pred):\n    # handle class imbalance\n    y_true = tf.cast(y_true, tf.float32)\n    loss = tf.nn.weighted_cross_entropy_with_logits(labels=y_true, logits=y_pred, pos_weight=POS_WEIGHT)\n    return tf.reduce_mean(loss)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:44.492801Z","iopub.execute_input":"2024-01-01T15:48:44.493181Z","iopub.status.idle":"2024-01-01T15:48:44.500229Z","shell.execute_reply.started":"2024-01-01T15:48:44.493138Z","shell.execute_reply":"2024-01-01T15:48:44.499225Z"},"trusted":true},"execution_count":42,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.optimizers.schedules import PolynomialDecay\nwith strategy.scope():\n    model = TFDistilBertModel.from_pretrained(model_checkpoint, config=config)\n    learning_schedule = PolynomialDecay(initial_learning_rate=1e-4, decay_steps=len(train_tf_data) * 10, end_learning_rate=0)\n    input_ids = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"input_ids\")\n    attention_mask = Input(shape=(N_TOKENS,), dtype=tf.int32, name=\"attention_mask\")\n    x = model([input_ids, attention_mask])[0][:,0,:] # [CLS] token of last hidden state\n    x = BatchNormalization()(x)\n    x = Dropout(0.3)(x)\n    x = Dense(512, activation=\"relu\")(x)\n    x = Dropout(0.3)(x)\n    x = BatchNormalization()(x)\n    x = Dense(512, activation=\"relu\")(x)\n    x = Dropout(0.3)(x)\n    x = BatchNormalization()(x)\n#     x = Dense(256, activation=\"relu\")(x)\n#     x = Dropout(0.3)(x)\n#     x = BatchNormalization()(x)\n    output = Dense(N_CLASSES, name=\"output\")(x) # no sigmoid activation since loss is computed using logits\n    model = tf.keras.Model(inputs=[input_ids, attention_mask],outputs=output)\n    metric = [tf.keras.metrics.AUC(multi_label=True, num_labels=N_CLASSES)]\n    model.compile(optimizer=tf.keras.optimizers.Adam(learning_schedule), metrics=metric, loss=weighted_binary_crossentropy)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.245859Z","iopub.execute_input":"2024-01-01T15:48:57.246233Z","iopub.status.idle":"2024-01-01T15:48:57.304709Z","shell.execute_reply.started":"2024-01-01T15:48:57.246198Z","shell.execute_reply":"2024-01-01T15:48:57.303858Z"},"trusted":true},"execution_count":44,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type)                   Output Shape         Param #     Connected to                     \n==================================================================================================\n input_ids (InputLayer)         [(None, 512)]        0           []                               \n                                                                                                  \n attention_mask (InputLayer)    [(None, 512)]        0           []                               \n                                                                                                  \n tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              \n BertModel)                     ast_hidden_state=(N               'attention_mask[0][0]']         \n                                one, 512, 768),                                                   \n                                 hidden_states=None                                               \n                                , attentions=None)                                                \n                                                                                                  \n tf.__operators__.getitem (Slic  (None, 768)         0           ['tf_distil_bert_model[0][0]']   \n ingOpLambda)                                                                                     \n                                                                                                  \n batch_normalization (BatchNorm  (None, 768)         3072        ['tf.__operators__.getitem[0][0]'\n alization)                                                      ]                                \n                                                                                                  \n dropout_19 (Dropout)           (None, 768)          0           ['batch_normalization[0][0]']    \n                                                                                                  \n dense (Dense)                  (None, 512)          393728      ['dropout_19[0][0]']             \n                                                                                                  \n dropout_20 (Dropout)           (None, 512)          0           ['dense[0][0]']                  \n                                                                                                  \n batch_normalization_1 (BatchNo  (None, 512)         2048        ['dropout_20[0][0]']             \n rmalization)                                                                                     \n                                                                                                  \n dense_1 (Dense)                (None, 512)          262656      ['batch_normalization_1[0][0]']  \n                                                                                                  \n dropout_21 (Dropout)           (None, 512)          0           ['dense_1[0][0]']                \n                                                                                                  \n batch_normalization_2 (BatchNo  (None, 512)         2048        ['dropout_21[0][0]']             \n rmalization)                                                                                     \n                                                                                                  \n output (Dense)                 (None, 6)            3078        ['batch_normalization_2[0][0]']  \n                                                                                                  \n==================================================================================================\nTotal params: 67,029,510\nTrainable params: 67,025,926\nNon-trainable params: 3,584\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"from tensorflow.keras.callbacks import EarlyStopping\nearly_stop = EarlyStopping(monitor=\"val_loss\",patience=1,mode=\"min\")","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.305988Z","iopub.execute_input":"2024-01-01T15:48:57.306663Z","iopub.status.idle":"2024-01-01T15:48:57.311370Z","shell.execute_reply.started":"2024-01-01T15:48:57.306617Z","shell.execute_reply":"2024-01-01T15:48:57.310499Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"model.fit(train_tf_data.shuffle(len(train_tf_data)).batch(BATCH_SIZE), validation_data=test_tf_data.shuffle(len(test_tf_data)).batch(BATCH_SIZE), \n          epochs=10, callbacks=[early_stop])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:48:57.313364Z","iopub.execute_input":"2024-01-01T15:48:57.313701Z","iopub.status.idle":"2024-01-01T17:50:28.473405Z","shell.execute_reply.started":"2024-01-01T15:48:57.313667Z","shell.execute_reply":"2024-01-01T17:50:28.472543Z"},"trusted":true},"execution_count":46,"outputs":[{"name":"stdout","text":"Epoch 1/10\n2234/2234 [==============================] - 3660s 2s/step - loss: 0.7597 - auc: 0.8941 - val_loss: 0.4980 - val_auc: 0.9317\nEpoch 2/10\n2234/2234 [==============================] - 3629s 2s/step - loss: 0.5204 - auc: 0.9265 - val_loss: 0.5717 - val_auc: 0.9456\n","output_type":"stream"},{"execution_count":46,"output_type":"execute_result","data":{"text/plain":"<keras.callbacks.History at 0x7a915c64e380>"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"toxic_comment_classifier_hf_distilbert.h5\")","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:34.269685Z","iopub.execute_input":"2024-01-01T17:50:34.270054Z","iopub.status.idle":"2024-01-01T17:50:35.987233Z","shell.execute_reply.started":"2024-01-01T17:50:34.270024Z","shell.execute_reply":"2024-01-01T17:50:35.986422Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"tf_model = tf.keras.models.load_model('toxic_comment_classifier_hf_distilbert.h5', custom_objects={\"TFDistilBertModel\": TFDistilBertModel, 'weighted_binary_crossentropy': weighted_binary_crossentropy})","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:38.175471Z","iopub.execute_input":"2024-01-01T17:50:38.175809Z","iopub.status.idle":"2024-01-01T17:50:42.024993Z","shell.execute_reply.started":"2024-01-01T17:50:38.175784Z","shell.execute_reply":"2024-01-01T17:50:42.024221Z"},"trusted":true},"execution_count":48,"outputs":[]},{"cell_type":"code","source":"def sigmoid(x):\n    return 1 / (1 + np.exp(-x))","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:51:59.959469Z","iopub.execute_input":"2024-01-01T17:51:59.959766Z","iopub.status.idle":"2024-01-01T17:51:59.964272Z","shell.execute_reply.started":"2024-01-01T17:51:59.959742Z","shell.execute_reply":"2024-01-01T17:51:59.963375Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"markdown","source":"COMPUTING THRESHOLD ","metadata":{}},{"cell_type":"code","source":"def inference(text):\n    model_checkpoint = \"distilbert-base-uncased\"\n    tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)\n    input_=tf.data.Dataset.from_tensor_slices((dict(tokenizer(text, \n                                                     max_length=512, padding=\"max_length\", \n                                                     truncation=True, return_tensors=\"tf\"))))\n    pred = tf_model.predict(input_, verbose=0)\n    \n    return pred","metadata":{"execution":{"iopub.status.busy":"2024-01-01T14:58:41.888372Z","iopub.execute_input":"2024-01-01T14:58:41.888762Z","iopub.status.idle":"2024-01-01T14:58:41.894960Z","shell.execute_reply.started":"2024-01-01T14:58:41.888731Z","shell.execute_reply":"2024-01-01T14:58:41.894020Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"preds = inference([*x_test])","metadata":{"execution":{"iopub.status.busy":"2024-01-01T14:59:08.532073Z","iopub.execute_input":"2024-01-01T14:59:08.532927Z","iopub.status.idle":"2024-01-01T15:06:56.850495Z","shell.execute_reply.started":"2024-01-01T14:59:08.532890Z","shell.execute_reply":"2024-01-01T15:06:56.849347Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"preds.shape","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:08:43.065216Z","iopub.execute_input":"2024-01-01T15:08:43.065711Z","iopub.status.idle":"2024-01-01T15:08:43.071901Z","shell.execute_reply.started":"2024-01-01T15:08:43.065676Z","shell.execute_reply":"2024-01-01T15:08:43.071037Z"},"trusted":true},"execution_count":37,"outputs":[{"execution_count":37,"output_type":"execute_result","data":{"text/plain":"(31915, 6)"},"metadata":{}}]},{"cell_type":"code","source":"y_test","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:08:55.045644Z","iopub.execute_input":"2024-01-01T15:08:55.046555Z","iopub.status.idle":"2024-01-01T15:08:55.052833Z","shell.execute_reply.started":"2024-01-01T15:08:55.046521Z","shell.execute_reply":"2024-01-01T15:08:55.051817Z"},"trusted":true},"execution_count":38,"outputs":[{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"array([[0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0],\n       [0, 0, 0, 0, 0, 0],\n       ...,\n       [0, 0, 0, 0, 0, 0],\n       [1, 0, 1, 0, 1, 0],\n       [0, 0, 0, 0, 0, 0]])"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.metrics import roc_curve","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:09:16.517466Z","iopub.execute_input":"2024-01-01T15:09:16.518374Z","iopub.status.idle":"2024-01-01T15:09:16.522589Z","shell.execute_reply.started":"2024-01-01T15:09:16.518339Z","shell.execute_reply":"2024-01-01T15:09:16.521397Z"},"trusted":true},"execution_count":39,"outputs":[]},{"cell_type":"code","source":"label_threshold = []\nsigmoid_preds = sigmoid(preds)\nfor i in range(y_test.shape[1]):\n    fpr, tpr, thresholds = roc_curve(y_test[:, i], sigmoid_preds[:, i])\n    j = tpr - fpr\n    idx = np.argmax(j)\n    best_threshold = thresholds[idx]\n    label_threshold.append(best_threshold)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:15:15.956147Z","iopub.execute_input":"2024-01-01T15:15:15.956576Z","iopub.status.idle":"2024-01-01T15:15:15.999900Z","shell.execute_reply.started":"2024-01-01T15:15:15.956550Z","shell.execute_reply":"2024-01-01T15:15:15.999028Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"label_threshold","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:15:17.282955Z","iopub.execute_input":"2024-01-01T15:15:17.283495Z","iopub.status.idle":"2024-01-01T15:15:17.289870Z","shell.execute_reply.started":"2024-01-01T15:15:17.283462Z","shell.execute_reply":"2024-01-01T15:15:17.288876Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"[0.5054522, 0.1555657, 0.5025445, 0.17308293, 0.48295122, 0.07352413]"},"metadata":{}}]},{"cell_type":"markdown","source":"QUANTIZATION","metadata":{"execution":{"iopub.status.busy":"2024-01-01T15:16:12.511941Z","iopub.execute_input":"2024-01-01T15:16:12.512377Z","iopub.status.idle":"2024-01-01T15:16:12.558722Z","shell.execute_reply.started":"2024-01-01T15:16:12.512345Z","shell.execute_reply":"2024-01-01T15:16:12.557405Z"}}},{"cell_type":"code","source":"import pathlib\nconverter = tf.lite.TFLiteConverter.from_keras_model(tf_model)\nconverter.optimizations = [tf.lite.Optimize.DEFAULT]\ntflite_model = converter.convert()\n\ntflite_models_dir = pathlib.Path(os.path.join(\"tflite_models\"))\ntflite_models_dir.mkdir(exist_ok=True, parents=True)\ntflite_model_file = tflite_models_dir/\"toxic_comment_classifier_hf_distilbert.tflite\"\ntflite_model_file.write_bytes(tflite_model)","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:50:55.945400Z","iopub.execute_input":"2024-01-01T17:50:55.946058Z","iopub.status.idle":"2024-01-01T17:51:59.957942Z","shell.execute_reply.started":"2024-01-01T17:50:55.946024Z","shell.execute_reply":"2024-01-01T17:51:59.956990Z"},"trusted":true},"execution_count":49,"outputs":[{"execution_count":49,"output_type":"execute_result","data":{"text/plain":"68543400"},"metadata":{}}]},{"cell_type":"code","source":"with open(\"toxic_comment_preprocessor_classnames.bin\", \"rb\") as model_file_obj:\n        text_preprocessor, class_names = cloudpickle.load(model_file_obj)\n        \ninterpreter = tf.lite.Interpreter(model_path=os.path.join(\"tflite_models\", \"toxic_comment_classifier_hf_distilbert.tflite\"))\n","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:52:04.252505Z","iopub.execute_input":"2024-01-01T17:52:04.253263Z","iopub.status.idle":"2024-01-01T17:52:04.261732Z","shell.execute_reply.started":"2024-01-01T17:52:04.253214Z","shell.execute_reply":"2024-01-01T17:52:04.260705Z"},"trusted":true},"execution_count":51,"outputs":[]},{"cell_type":"code","source":"def inference(text):\n    text = text_preprocessor.preprocess(pd.Series(text))[0]\n    \n    model_checkpoint = \"distilbert-base-uncased\"\n    tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)\n    tokens = tokenizer(text, max_length=512, padding=\"max_length\", truncation=True, return_tensors=\"tf\")\n    \n    # tflite model inference  \n    interpreter.allocate_tensors()\n    input_details = interpreter.get_input_details()\n    output_details = interpreter.get_output_details()[0]\n    attention_mask, input_ids = tokens['attention_mask'], tokens['input_ids']\n    interpreter.set_tensor(input_details[0][\"index\"], attention_mask)\n    interpreter.set_tensor(input_details[1][\"index\"], input_ids)\n    interpreter.invoke()\n    tflite_logits = interpreter.get_tensor(output_details[\"index\"])[0]\n    tflite_pred = sigmoid(tflite_logits)\n    result_df = pd.DataFrame({'class': class_names, 'prob': tflite_pred})\n    result_df.sort_values(by='prob', ascending=True, inplace=True)\n    return result_df","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:52:06.735831Z","iopub.execute_input":"2024-01-01T17:52:06.736217Z","iopub.status.idle":"2024-01-01T17:52:06.744507Z","shell.execute_reply.started":"2024-01-01T17:52:06.736184Z","shell.execute_reply":"2024-01-01T17:52:06.743562Z"},"trusted":true},"execution_count":52,"outputs":[]},{"cell_type":"code","source":"","metadata":{"execution":{"iopub.status.busy":"2024-01-01T17:53:28.258034Z","iopub.execute_input":"2024-01-01T17:53:28.258950Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}