Spaces:

nctu16028
/

spam_detector

Runtime error

App Files Files Community

nctu16028 committed on May 12, 2023

Commit

f0f4ad8

•

1 Parent(s): ab6a0aa

Apply solution

Browse files

Files changed (14) hide show

.gitattributes +1 -0
.gitignore +5 -0
app.ipynb +219 -0
app.py +35 -0
spam.csv +0 -0
spam_model/added_tokens.json +3 -0
spam_model/config.json +3 -0
spam_model/pytorch_model.bin +3 -0
spam_model/special_tokens_map.json +3 -0
spam_model/spm.model +3 -0
spam_model/tokenizer.json +3 -0
spam_model/tokenizer_config.json +3 -0
spam_model/training_args.bin +3 -0
train.ipynb +1 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.ipynb_checkpoints/
+tmp_trainer/
+flagged/
+*.bak
+*.swp

app.ipynb ADDED Viewed

	@@ -0,0 +1,219 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9e3afc69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|default_exp app"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ca02cd22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import gradio as gr\n",
+    "from datasets import Dataset\n",
+    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "674fa5e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "import warnings, logging\n",
+    "warnings.simplefilter('ignore')\n",
+    "logging.disable(logging.WARNING)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "28150bb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\"./spam_model/\")\n",
+    "tokz = AutoTokenizer.from_pretrained(\"./spam_model/\")\n",
+    "trainer = Trainer(model, tokenizer=tokz)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4f1da521",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<transformers.trainer.Trainer at 0x7fe9230c40a0>"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "cb001f05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "def tok_func(x):\n",
+    "    return tokz(x[\"input\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c6cc7802",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.8317995071411133"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "document = 'Send this message to 5 more people ASAP'\n",
+    "input_ds = Dataset.from_pandas(pd.DataFrame([document], columns=['input'])).map(tok_func, batched=True)\n",
+    "trainer.predict(input_ds).predictions.astype(float)[0, 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d9e18de1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "def classify_message(text):\n",
+    "    input_ds = Dataset.from_pandas(pd.DataFrame([text], columns=['input'])).map(tok_func, batched=True)\n",
+    "    spam_prob = np.clip(trainer.predict(input_ds).predictions.astype(float), 0, 1)[0, 0]\n",
+    "    return f'{100*spam_prob:.1f}% probability being Spam'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "c70fc002",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7860\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#|export\n",
+    "intf = gr.Interface(fn=classify_message, inputs='text', outputs='text')\n",
+    "intf.launch(inline=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "fdf43e45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from nbdev.export import nb_export\n",
+    "nb_export('app.ipynb', '.')"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
+# %% auto 0
+__all__ = ['model', 'tokz', 'trainer', 'intf', 'tok_func', 'classify_message']
+# %% app.ipynb 1
+import numpy as np
+import pandas as pd
+import gradio as gr
+from datasets import Dataset
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
+# %% app.ipynb 2
+import warnings, logging
+warnings.simplefilter('ignore')
+logging.disable(logging.WARNING)
+# %% app.ipynb 3
+# Load the sequence-classification model and its tokenizer from the local
+# ./spam_model/ directory (path is relative to the process working directory).
+model = AutoModelForSequenceClassification.from_pretrained("./spam_model/")
+tokz = AutoTokenizer.from_pretrained("./spam_model/")
+# No TrainingArguments are passed; in this file the Trainer is only used for
+# its predict() method inside classify_message.
+trainer = Trainer(model, tokenizer=tokz)
+# %% app.ipynb 5
+def tok_func(x):
+    return tokz(x["input"])
+# %% app.ipynb 7
+def classify_message(text):
+    input_ds = Dataset.from_pandas(pd.DataFrame([text], columns=['input'])).map(tok_func, batched=True)
+    spam_prob = np.clip(trainer.predict(input_ds).predictions.astype(float), 0, 1)[0, 0]
+    return f'{100*spam_prob:.1f}% probability being Spam'
+# %% app.ipynb 8
+# Expose classify_message through a Gradio UI with one text input and one text output.
+intf = gr.Interface(fn=classify_message, inputs='text', outputs='text')
+# Start serving the interface.
+# NOTE(review): inline=False presumably stops Gradio from embedding the UI in
+# notebook output — confirm against the Gradio launch() documentation.
+intf.launch(inline=False)

spam.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

spam_model/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc046d04c9b0ada7ae6f1dc89c465801799acdf0c9a6aab8c15a1b2d5ca4e91f
+size 23

spam_model/config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f8920f3374f9490a58131d2b88a658cf64b1b58802e3aca7ea17aab5cea6170
+size 958

spam_model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8a93b4ef15c99cbd7d9480586f87fda89efcce815517996220e6c7de3eaf65e
+size 567623353

spam_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:311de3f4eed9d76a43bf0d71f10e62e086ca65ccce9f15d5da0d2098bf519ecc
+size 173

spam_model/spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

spam_model/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a86f883318afa11c8c10466f1bf4efaeb6ded28a52cbe57217a8fa0d0a2a87df
+size 8656551

spam_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45705cb69567763661139b56f0f1f367dec7a130dfd6dcf86a14fbf174a48d3f
+size 412

spam_model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b382c01c07616fcd78e546a7e7028ab0c2bcfaf44135ec60d7952c2262f897c4
+size 3579

train.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"! pip install -q datasets","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-05-12T04:05:12.743738Z","iopub.execute_input":"2023-05-12T04:05:12.746149Z","iopub.status.idle":"2023-05-12T04:05:26.016049Z","shell.execute_reply.started":"2023-05-12T04:05:12.746114Z","shell.execute_reply":"2023-05-12T04:05:26.014993Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"markdown","source":"## Import Modules","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom pathlib import Path\nfrom datasets import Dataset, DatasetDict\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\nfrom transformers import TrainingArguments, Trainer\n\nnp.set_printoptions(precision=2, suppress=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:26.019972Z","iopub.execute_input":"2023-05-12T04:05:26.020311Z","iopub.status.idle":"2023-05-12T04:05:38.111349Z","shell.execute_reply.started":"2023-05-12T04:05:26.020282Z","shell.execute_reply":"2023-05-12T04:05:38.110437Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Load Data","metadata":{}},{"cell_type":"code","source":"path = Path('../input/sms-spam-collection-dataset')\n!ls {path}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:38.112562Z","iopub.execute_input":"2023-05-12T04:05:38.113583Z","iopub.status.idle":"2023-05-12T04:05:39.095751Z","shell.execute_reply.started":"2023-05-12T04:05:38.113557Z","shell.execute_reply":"2023-05-12T04:05:39.094550Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"spam.csv\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df = pd.read_csv(path/'spam.csv', encoding='iso-8859-1')[['v1', 
'v2']]\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.099307Z","iopub.execute_input":"2023-05-12T04:05:39.099677Z","iopub.status.idle":"2023-05-12T04:05:39.150580Z","shell.execute_reply.started":"2023-05-12T04:05:39.099646Z","shell.execute_reply":"2023-05-12T04:05:39.149605Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" v1 v2\n0 ham Go until jurong point, crazy.. Available only ...\n1 ham Ok lar... Joking wif u oni...\n2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n3 ham U dun say so early hor... U c already then say...\n4 ham Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 spam This is the 2nd time we have tried 2 contact u...\n5568 ham Will Ì_ b going to esplanade fr home?\n5569 ham Pity, * was in mood for that. So...any other s...\n5570 ham The guy did some bitching but I acted like i'd...\n5571 ham Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>v1</th>\n <th>v2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>ham</td>\n <td>Go until jurong point, crazy.. Available only ...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>ham</td>\n <td>Ok lar... Joking wif u oni...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>spam</td>\n <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>ham</td>\n <td>U dun say so early hor... 
U c already then say...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>ham</td>\n <td>Nah I don't think he goes to usf, he lives aro...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5567</th>\n <td>spam</td>\n <td>This is the 2nd time we have tried 2 contact u...</td>\n </tr>\n <tr>\n <th>5568</th>\n <td>ham</td>\n <td>Will Ì_ b going to esplanade fr home?</td>\n </tr>\n <tr>\n <th>5569</th>\n <td>ham</td>\n <td>Pity, * was in mood for that. So...any other s...</td>\n </tr>\n <tr>\n <th>5570</th>\n <td>ham</td>\n <td>The guy did some bitching but I acted like i'd...</td>\n </tr>\n <tr>\n <th>5571</th>\n <td>ham</td>\n <td>Rofl. Its true to its name</td>\n </tr>\n </tbody>\n</table>\n<p>5572 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.describe(include='object')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.151914Z","iopub.execute_input":"2023-05-12T04:05:39.152242Z","iopub.status.idle":"2023-05-12T04:05:39.174189Z","shell.execute_reply.started":"2023-05-12T04:05:39.152194Z","shell.execute_reply":"2023-05-12T04:05:39.173238Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" v1 v2\ncount 5572 5572\nunique 2 5169\ntop ham Sorry, I'll call later\nfreq 4825 30","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>v1</th>\n <th>v2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>5572</td>\n <td>5572</td>\n </tr>\n <tr>\n <th>unique</th>\n <td>2</td>\n <td>5169</td>\n </tr>\n <tr>\n <th>top</th>\n <td>ham</td>\n <td>Sorry, I'll call later</td>\n </tr>\n <tr>\n <th>freq</th>\n <td>4825</td>\n <td>30</td>\n </tr>\n 
</tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## Data Preprocessing","metadata":{}},{"cell_type":"code","source":"train_df.rename(columns={'v1': 'labels', 'v2': 'input'}, inplace=True)\ntrain_df['labels'] = (train_df['labels'] == 'spam').astype(float)\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.175463Z","iopub.execute_input":"2023-05-12T04:05:39.175854Z","iopub.status.idle":"2023-05-12T04:05:39.190828Z","shell.execute_reply.started":"2023-05-12T04:05:39.175823Z","shell.execute_reply":"2023-05-12T04:05:39.189848Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" labels input\n0 0.0 Go until jurong point, crazy.. Available only ...\n1 0.0 Ok lar... Joking wif u oni...\n2 1.0 Free entry in 2 a wkly comp to win FA Cup fina...\n3 0.0 U dun say so early hor... U c already then say...\n4 0.0 Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 1.0 This is the 2nd time we have tried 2 contact u...\n5568 0.0 Will Ì_ b going to esplanade fr home?\n5569 0.0 Pity, * was in mood for that. So...any other s...\n5570 0.0 The guy did some bitching but I acted like i'd...\n5571 0.0 Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>labels</th>\n <th>input</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n <td>Go until jurong point, crazy.. Available only ...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.0</td>\n <td>Ok lar... 
Joking wif u oni...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.0</td>\n <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.0</td>\n <td>U dun say so early hor... U c already then say...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.0</td>\n <td>Nah I don't think he goes to usf, he lives aro...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5567</th>\n <td>1.0</td>\n <td>This is the 2nd time we have tried 2 contact u...</td>\n </tr>\n <tr>\n <th>5568</th>\n <td>0.0</td>\n <td>Will Ì_ b going to esplanade fr home?</td>\n </tr>\n <tr>\n <th>5569</th>\n <td>0.0</td>\n <td>Pity, * was in mood for that. So...any other s...</td>\n </tr>\n <tr>\n <th>5570</th>\n <td>0.0</td>\n <td>The guy did some bitching but I acted like i'd...</td>\n </tr>\n <tr>\n <th>5571</th>\n <td>0.0</td>\n <td>Rofl. Its true to its name</td>\n </tr>\n </tbody>\n</table>\n<p>5572 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## Tokenization","metadata":{}},{"cell_type":"code","source":"model_nm = 'microsoft/deberta-v3-small'\ntokz = AutoTokenizer.from_pretrained(model_nm)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.192156Z","iopub.execute_input":"2023-05-12T04:05:39.192544Z","iopub.status.idle":"2023-05-12T04:05:42.209496Z","shell.execute_reply.started":"2023-05-12T04:05:39.192514Z","shell.execute_reply":"2023-05-12T04:05:42.208442Z"},"trusted":true},"execution_count":7,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (…)okenizer_config.json: 0%| | 0.00/52.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"981c0c2f622143358e0a065a0c9f69a2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading (…)lve/main/config.json: 0%| | 0.00/578 [00:00<?, 
?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9504127fd82d416d9efcc3ed7be43f8d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading spm.model: 0%| | 0.00/2.46M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e8e8a88bf04e4ec2830950d13684b53f"}},"metadata":{}},{"name":"stderr","text":"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n/opt/conda/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:454: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n warnings.warn(\nSpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n","output_type":"stream"}]},{"cell_type":"code","source":"def tok_func(x):\n return tokz(x[\"input\"])","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:42.211099Z","iopub.execute_input":"2023-05-12T04:05:42.211783Z","iopub.status.idle":"2023-05-12T04:05:42.218420Z","shell.execute_reply.started":"2023-05-12T04:05:42.211750Z","shell.execute_reply":"2023-05-12T04:05:42.217254Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"train_ds = Dataset.from_pandas(train_df)\ntrain_tok_ds = train_ds.map(tok_func, batched=True)\ntrain_tok_ds[0]['input'], 
train_tok_ds[0]['input_ids']","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:42.222675Z","iopub.execute_input":"2023-05-12T04:05:42.223346Z","iopub.status.idle":"2023-05-12T04:05:43.054865Z","shell.execute_reply.started":"2023-05-12T04:05:42.223286Z","shell.execute_reply":"2023-05-12T04:05:43.054024Z"},"trusted":true},"execution_count":9,"outputs":[{"output_type":"display_data","data":{"text/plain":" 0%| | 0/6 [00:00<?, ?ba/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"5d86501cfb93430f86fce3d001ea3f9f"}},"metadata":{}},{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',\n [1,\n 1968,\n 583,\n 18350,\n 49947,\n 582,\n 261,\n 3286,\n 260,\n 260,\n 4955,\n 364,\n 267,\n 5554,\n 1890,\n 2030,\n 426,\n 447,\n 2181,\n 865,\n 11709,\n 260,\n 260,\n 260,\n 33053,\n 343,\n 519,\n 266,\n 4755,\n 37964,\n 260,\n 260,\n 260,\n 2])"},"metadata":{}}]},{"cell_type":"markdown","source":"## Setup Arguments and Train the Model","metadata":{}},{"cell_type":"code","source":"def mse(x, y):\n return ((x-y)**2).mean()\n\ndef mse_d(eval_pred):\n return {'mse': mse(*eval_pred)}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.060895Z","iopub.execute_input":"2023-05-12T04:05:43.063407Z","iopub.status.idle":"2023-05-12T04:05:43.070191Z","shell.execute_reply.started":"2023-05-12T04:05:43.063373Z","shell.execute_reply":"2023-05-12T04:05:43.069298Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"bs = 64\nepochs = 5\nlr = 
5e-6","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.074799Z","iopub.execute_input":"2023-05-12T04:05:43.077166Z","iopub.status.idle":"2023-05-12T04:05:43.082883Z","shell.execute_reply.started":"2023-05-12T04:05:43.077134Z","shell.execute_reply":"2023-05-12T04:05:43.081927Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)\nargs = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,\n evaluation_strategy=\"epoch\", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,\n num_train_epochs=epochs, weight_decay=0.01, report_to='none')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.087507Z","iopub.execute_input":"2023-05-12T04:05:43.089876Z","iopub.status.idle":"2023-05-12T04:05:46.087250Z","shell.execute_reply.started":"2023-05-12T04:05:43.089843Z","shell.execute_reply":"2023-05-12T04:05:46.086254Z"},"trusted":true},"execution_count":12,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading pytorch_model.bin: 0%| | 0.00/286M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8b90e4e589af4bbfb271bea95bb9fb69"}},"metadata":{}},{"name":"stderr","text":"Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight']\n- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a 
model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nSome weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","output_type":"stream"}]},{"cell_type":"code","source":"dds = train_tok_ds.train_test_split(0.25, seed=42)\ndds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:46.089078Z","iopub.execute_input":"2023-05-12T04:05:46.089656Z","iopub.status.idle":"2023-05-12T04:05:46.108675Z","shell.execute_reply.started":"2023-05-12T04:05:46.089624Z","shell.execute_reply":"2023-05-12T04:05:46.107266Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"DatasetDict({\n train: Dataset({\n features: ['labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],\n num_rows: 4179\n })\n test: Dataset({\n features: ['labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],\n num_rows: 1393\n })\n})"},"metadata":{}}]},{"cell_type":"code","source":"trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],\n tokenizer=tokz, 
compute_metrics=mse_d)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:46.109932Z","iopub.execute_input":"2023-05-12T04:05:46.110369Z","iopub.status.idle":"2023-05-12T04:05:50.803465Z","shell.execute_reply.started":"2023-05-12T04:05:46.110337Z","shell.execute_reply":"2023-05-12T04:05:50.802512Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"trainer.train()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:50.804744Z","iopub.execute_input":"2023-05-12T04:05:50.805084Z","iopub.status.idle":"2023-05-12T04:08:02.938810Z","shell.execute_reply.started":"2023-05-12T04:05:50.805053Z","shell.execute_reply":"2023-05-12T04:08:02.937759Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n warnings.warn(\nYou're using a DebertaV2TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n <div>\n \n <progress value='330' max='330' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [330/330 02:10, Epoch 5/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n <th>Mse</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>1</td>\n <td>No log</td>\n <td>0.073268</td>\n <td>0.073268</td>\n </tr>\n <tr>\n <td>2</td>\n <td>No log</td>\n <td>0.009850</td>\n <td>0.009850</td>\n </tr>\n <tr>\n <td>3</td>\n <td>No log</td>\n <td>0.008275</td>\n <td>0.008275</td>\n </tr>\n <tr>\n <td>4</td>\n <td>No log</td>\n <td>0.007945</td>\n <td>0.007945</td>\n </tr>\n <tr>\n <td>5</td>\n <td>No log</td>\n <td>0.008093</td>\n <td>0.008093</td>\n </tr>\n </tbody>\n</table><p>"},"metadata":{}},{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=330, training_loss=0.03608651305689956, metrics={'train_runtime': 132.1096, 'train_samples_per_second': 158.164, 'train_steps_per_second': 2.498, 'total_flos': 461121007217520.0, 'train_loss': 0.03608651305689956, 'epoch': 5.0})"},"metadata":{}}]},{"cell_type":"markdown","source":"## Test the model","metadata":{}},{"cell_type":"code","source":"preds = 
trainer.predict(dds['test']).predictions.astype(float)\npreds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:02.940333Z","iopub.execute_input":"2023-05-12T04:08:02.940794Z","iopub.status.idle":"2023-05-12T04:08:05.616149Z","shell.execute_reply.started":"2023-05-12T04:08:02.940759Z","shell.execute_reply":"2023-05-12T04:08:05.615130Z"},"trusted":true},"execution_count":16,"outputs":[{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":""},"metadata":{}},{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"array([-0.03, -0.01, -0.04, ..., -0.03, -0.02, 1.12])"},"metadata":{}}]},{"cell_type":"code","source":"output = np.zeros(len(preds))\noutput[preds >= 0.5] = 1.0\noutput","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.617768Z","iopub.execute_input":"2023-05-12T04:08:05.618157Z","iopub.status.idle":"2023-05-12T04:08:05.626184Z","shell.execute_reply.started":"2023-05-12T04:08:05.618121Z","shell.execute_reply":"2023-05-12T04:08:05.625196Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"real = np.array(dds['test']['labels'])\nreal","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.627338Z","iopub.execute_input":"2023-05-12T04:08:05.628053Z","iopub.status.idle":"2023-05-12T04:08:05.646652Z","shell.execute_reply.started":"2023-05-12T04:08:05.628013Z","shell.execute_reply":"2023-05-12T04:08:05.645699Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"(output == real).sum() / 
len(real)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.647820Z","iopub.execute_input":"2023-05-12T04:08:05.648218Z","iopub.status.idle":"2023-05-12T04:08:05.654599Z","shell.execute_reply.started":"2023-05-12T04:08:05.648172Z","shell.execute_reply":"2023-05-12T04:08:05.653704Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"0.9921033740129217"},"metadata":{}}]},{"cell_type":"markdown","source":"## Save the Model","metadata":{}},{"cell_type":"code","source":"trainer.save_model(\"./spam_model\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:44.088969Z","iopub.execute_input":"2023-05-12T04:08:44.089372Z","iopub.status.idle":"2023-05-12T04:08:45.285254Z","shell.execute_reply.started":"2023-05-12T04:08:44.089342Z","shell.execute_reply":"2023-05-12T04:08:45.284198Z"},"trusted":true},"execution_count":20,"outputs":[]}]}