{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":7296872,"sourceType":"datasetVersion","datasetId":4232592},{"sourceId":7296911,"sourceType":"datasetVersion","datasetId":4232619}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# !pip install tensorflow==2.10","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport string\nfrom unidecode import unidecode\nimport tensorflow as tf \nfrom sklearn.utils import class_weight\nfrom tensorflow.keras.utils import to_categorical\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nimport cloudpickle\nimport os\nfrom transformers import DistilBertTokenizerFast\nfrom transformers import TFDistilBertModel, DistilBertConfig\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization\nfrom tensorflow.keras.optimizers.schedules import PolynomialDecay\nfrom tensorflow.keras.callbacks import EarlyStopping","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TextPreprocessor:\n def __init__(self, remove_punct: bool = True, remove_digits: bool = True,\n remove_stop_words: bool = True,\n remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,\n bottom_p: float = None):\n self.remove_punct = remove_punct\n self.remove_digits = remove_digits\n self.remove_stop_words = remove_stop_words\n self.remove_short_words = remove_short_words\n self.minlen = minlen\n self.maxlen = maxlen\n self.top_p = top_p\n self.bottom_p = bottom_p\n self.words_to_remove = []\n self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',\n 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',\n 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',\n 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',\n 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',\n 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',\n 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',\n 'into', 'through', 'during', 'before', 'after', 'to', 'from',\n 'in', 'out', 'on', 'off', 'further', 'then', 'once',\n 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',\n 'other', 'such', 'own', 'same', 'so', 'than', 'can', 'will', 'should','now']\n\n self.contraction_to_expansion = {\"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"cannot\",\n \"can't've\": \"cannot have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n 
\"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it had\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so is\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there had\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we had\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'alls\": \"you alls\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you had\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you you will\",\n \"you'll've\": \"you you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n @staticmethod\n def __remove_double_whitespaces(string: str):\n return \" \".join(string.split())\n\n def __remove_url(self, string_series: pd.Series):\n \"\"\"\n Removes URLs m text\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = 
string_series.str.replace(\n pat=r\"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})\",\n repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __expand(self, string_series: pd.Series):\n \"\"\"\n Replaces contractions with expansions, e.g. don't with do not.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n for c, e in self.contraction_to_expansion.items():\n clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_punct(self, string_series: pd.Series):\n \"\"\"\n Removes punctuation from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.copy()\n # Actual whitespace escapes, not raw strings, so real newlines/tabs are replaced.\n puncts = ['\\n', '\\r', '\\t']\n puncts.extend(list(string.punctuation))\n for i in puncts:\n clean_string_series = clean_string_series.str.replace(pat=i, repl=\" \", regex=False)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n def __remove_digits(self, string_series: pd.Series):\n \"\"\"\n Removes digits from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.str.replace(pat=r'\\d', repl=\" \", regex=True)\n return clean_string_series.map(self.__remove_double_whitespaces)\n\n @staticmethod\n def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):\n \"\"\"\n Removes words/tokens where minlen <= len <= maxlen.\n :param string_series: pd.Series, input string series\n :param minlen: int, minimum length of token to be removed.\n :param maxlen: int, maximum length of token to be removed.\n :return: pd.Series, cleaned string series\n \"\"\"\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split() if\n (len(word) > maxlen) or (len(word) < minlen)]))\n return clean_string_series\n\n def __remove_stop_words(self, string_series: pd.Series):\n \"\"\"\n Removes stop words from the input string.\n :param string_series: pd.Series, input string series\n :return: pd.Series, cleaned string series\n \"\"\"\n def str_remove_stop_words(string: str):\n stops = self.stop_words\n return \" \".join([token for token in string.split() if token not in stops])\n\n return string_series.map(str_remove_stop_words)\n\n def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,\n bottom_p: float = None, dataset: str = 'train'):\n \"\"\"\n Removes top_p percent (frequent) words and bottom_p percent (rare) words.\n :param string_series: pd.Series, input string series\n :param top_p: float, percent of frequent words to remove.\n :param bottom_p: float, percent of rare words to remove.\n :param dataset: str, \"train\" for training set, \"test\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n if dataset == 'train':\n if top_p is None:\n top_p = 0\n if bottom_p is None:\n bottom_p = 0\n\n if top_p > 0 or bottom_p > 0:\n word_freq = pd.Series(\" \".join(string_series).split()).value_counts()\n n_words = len(word_freq)\n\n if top_p > 0:\n self.words_to_remove.extend([*word_freq.index[: 
int(np.ceil(top_p * n_words))]])\n\n if bottom_p > 0:\n self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])\n\n if len(self.words_to_remove) == 0:\n return string_series\n else:\n clean_string_series = string_series.map(lambda string: \" \".join([word for word in string.split()\n if word not in self.words_to_remove]))\n return clean_string_series\n\n def preprocess(self, string_series: pd.Series, dataset: str = \"train\"):\n \"\"\"\n Entry point.\n :param string_series: pd.Series, input string series\n :param dataset: str, \"train\" for training set, \"test\" for val/dev/test set.\n :return: pd.Series, cleaned string series\n \"\"\"\n string_series = string_series.str.lower()\n string_series = string_series.map(unidecode)\n string_series = self.__remove_url(string_series=string_series)\n string_series = self.__expand(string_series=string_series)\n\n if self.remove_punct:\n string_series = self.__remove_punct(string_series=string_series)\n if self.remove_digits:\n string_series = self.__remove_digits(string_series=string_series)\n if self.remove_stop_words:\n string_series = self.__remove_stop_words(string_series=string_series)\n if self.remove_short_words:\n string_series = self.__remove_short_words(string_series=string_series,\n minlen=self.minlen,\n maxlen=self.maxlen)\n string_series = self.__remove_top_bottom_words(string_series=string_series,\n top_p=self.top_p,\n bottom_p=self.bottom_p, dataset=dataset)\n\n string_series = string_series.str.strip()\n string_series.replace(to_replace=\"\", value=\"this is an empty message\", inplace=True)\n\n return string_series","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.635659Z","iopub.execute_input":"2023-12-31T07:30:59.636337Z","iopub.status.idle":"2023-12-31T07:30:59.678677Z","shell.execute_reply.started":"2023-12-31T07:30:59.636307Z","shell.execute_reply":"2023-12-31T07:30:59.677974Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"data = pd.read_csv('train.csv')","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:30:59.680601Z","iopub.execute_input":"2023-12-31T07:30:59.680867Z","iopub.status.idle":"2023-12-31T07:31:01.412865Z","shell.execute_reply.started":"2023-12-31T07:30:59.680843Z","shell.execute_reply":"2023-12-31T07:31:01.412060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"data","metadata":{"execution":{"iopub.status.busy":"2023-12-31T07:31:01.413911Z","iopub.execute_input":"2023-12-31T07:31:01.414204Z","iopub.status.idle":"2023-12-31T07:31:01.440183Z","shell.execute_reply.started":"2023-12-31T07:31:01.414177Z","shell.execute_reply":"2023-12-31T07:31:01.439351Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" id comment_text \\\n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n... ... ... \n159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n159569 fff125370e4aaaf3 And it looks like it was actually you who put ... 
\n159570 fff46fc426af1f9a \"\nAnd ... I really don't think you understand... \n\n toxic severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 \n2 0 0 0 0 0 0 \n3 0 0 0 0 0 0 \n4 0 0 0 0 0 0 \n... ... ... ... ... ... ... \n159566 0 0 0 0 0 0 \n159567 0 0 0 0 0 0 \n159568 0 0 0 0 0 0 \n159569 0 0 0 0 0 0 \n159570 0 0 0 0 0 0 \n\n[159571 rows x 8 columns]"},"metadata":{}}]},
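A minimal usage sketch of the TextPreprocessor defined above, applied to the DataFrame loaded in the previous cell. The constructor values shown are illustrative assumptions, not settings recovered from this notebook:

```python
# Illustrative usage of TextPreprocessor (parameter values are assumptions,
# not values taken from later cells of this notebook).
tp = TextPreprocessor(remove_punct=True, remove_digits=True,
                      remove_stop_words=True, remove_short_words=True,
                      minlen=1, maxlen=1, top_p=None, bottom_p=None)

# dataset="train" records any top/bottom-frequency words to remove, so the
# same vocabulary filter can later be reapplied to val/dev/test splits.
clean_train = tp.preprocess(data['comment_text'], dataset='train')
```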
[Flattened HTML output of a later cell whose source was lost in extraction: the same DataFrame without the id column — columns comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate; 159571 rows × 7 columns.]
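The output above implies an intermediate cell whose source was lost; a plausible reconstruction, inferred only from the disappearance of the id column:

```python
# Hypothetical reconstruction of the lost cell implied by the output above:
# drop the id column, which carries no signal for classification.
data = data.drop(columns=['id'])
```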
[Flattened HTML output of a later cell whose source was lost in extraction: the DataFrame with comment_text renamed to text — columns text, toxic, severe_toxic, obscene, threat, insult, identity_hate; 159571 rows × 7 columns.]
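Likewise, the rename implied by this output; again an assumed reconstruction, not recovered source:

```python
# Hypothetical reconstruction of the lost cell implied by the output above:
# rename comment_text to text.
data = data.rename(columns={'comment_text': 'text'})
```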
[Flattened HTML output of a later cell whose source was lost in extraction: a per-class probability table, sorted ascending.]

       class           prob
3      threat          0.000621
1      severe_toxic    0.000848
5      identity_hate   0.000876
2      obscene         0.001126
4      insult          0.001540
0      toxic           0.002890
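How the prob column was computed is not visible in this excerpt. As a hedged sketch only: with the class_weight utility imported at the top, two quantities commonly derived at this point are per-label prevalence and balanced positive-class weights. All names below are assumptions, and the numbers need not reproduce the table:

```python
import numpy as np
from sklearn.utils import class_weight

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
              'insult', 'identity_hate']

# Fraction of comments carrying each label, sorted ascending like the table.
prevalence = data[label_cols].mean().sort_values()
print(prevalence)

# One binary problem per label (multi-label setting): the "balanced" weight
# of the positive class for each label.
pos_weight = {
    col: class_weight.compute_class_weight(
        class_weight='balanced', classes=np.array([0, 1]), y=data[col]
    )[1]
    for col in label_cols
}
print(pos_weight)
```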