Spaces:
Runtime error
Runtime error
Apply solution
Browse files- .gitattributes +1 -0
- .gitignore +5 -0
- app.ipynb +219 -0
- app.py +35 -0
- spam.csv +0 -0
- spam_model/added_tokens.json +3 -0
- spam_model/config.json +3 -0
- spam_model/pytorch_model.bin +3 -0
- spam_model/special_tokens_map.json +3 -0
- spam_model/spm.model +3 -0
- spam_model/tokenizer.json +3 -0
- spam_model/tokenizer_config.json +3 -0
- spam_model/training_args.bin +3 -0
- train.ipynb +1 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.ipynb_checkpoints/
|
2 |
+
tmp_trainer/
|
3 |
+
flagged/
|
4 |
+
*.bak
|
5 |
+
*.swp
|
app.ipynb
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "9e3afc69",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"#|default_exp app"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 2,
|
16 |
+
"id": "ca02cd22",
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"#|export\n",
|
21 |
+
"import numpy as np\n",
|
22 |
+
"import pandas as pd\n",
|
23 |
+
"import gradio as gr\n",
|
24 |
+
"from datasets import Dataset\n",
|
25 |
+
"from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer"
|
26 |
+
]
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"cell_type": "code",
|
30 |
+
"execution_count": 3,
|
31 |
+
"id": "674fa5e5",
|
32 |
+
"metadata": {},
|
33 |
+
"outputs": [],
|
34 |
+
"source": [
|
35 |
+
"#|export\n",
|
36 |
+
"import warnings, logging\n",
|
37 |
+
"warnings.simplefilter('ignore')\n",
|
38 |
+
"logging.disable(logging.WARNING)"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"cell_type": "code",
|
43 |
+
"execution_count": 4,
|
44 |
+
"id": "28150bb5",
|
45 |
+
"metadata": {},
|
46 |
+
"outputs": [],
|
47 |
+
"source": [
|
48 |
+
"#|export\n",
|
49 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"./spam_model/\")\n",
|
50 |
+
"tokz = AutoTokenizer.from_pretrained(\"./spam_model/\")\n",
|
51 |
+
"trainer = Trainer(model, tokenizer=tokz)"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": 5,
|
57 |
+
"id": "4f1da521",
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [
|
60 |
+
{
|
61 |
+
"data": {
|
62 |
+
"text/plain": [
|
63 |
+
"<transformers.trainer.Trainer at 0x7fe9230c40a0>"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
"execution_count": 5,
|
67 |
+
"metadata": {},
|
68 |
+
"output_type": "execute_result"
|
69 |
+
}
|
70 |
+
],
|
71 |
+
"source": [
|
72 |
+
"trainer"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"cell_type": "code",
|
77 |
+
"execution_count": 6,
|
78 |
+
"id": "cb001f05",
|
79 |
+
"metadata": {},
|
80 |
+
"outputs": [],
|
81 |
+
"source": [
|
82 |
+
"#|export\n",
|
83 |
+
"def tok_func(x):\n",
|
84 |
+
" return tokz(x[\"input\"])"
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 7,
|
90 |
+
"id": "c6cc7802",
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [
|
93 |
+
{
|
94 |
+
"data": {
|
95 |
+
"application/vnd.jupyter.widget-view+json": {
|
96 |
+
"model_id": "",
|
97 |
+
"version_major": 2,
|
98 |
+
"version_minor": 0
|
99 |
+
},
|
100 |
+
"text/plain": [
|
101 |
+
"Map: 0%| | 0/1 [00:00<?, ? examples/s]"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
"metadata": {},
|
105 |
+
"output_type": "display_data"
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"data": {
|
109 |
+
"text/html": [],
|
110 |
+
"text/plain": [
|
111 |
+
"<IPython.core.display.HTML object>"
|
112 |
+
]
|
113 |
+
},
|
114 |
+
"metadata": {},
|
115 |
+
"output_type": "display_data"
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"data": {
|
119 |
+
"text/plain": [
|
120 |
+
"0.8317995071411133"
|
121 |
+
]
|
122 |
+
},
|
123 |
+
"execution_count": 7,
|
124 |
+
"metadata": {},
|
125 |
+
"output_type": "execute_result"
|
126 |
+
}
|
127 |
+
],
|
128 |
+
"source": [
|
129 |
+
"document = 'Send this message to 5 more people ASAP'\n",
|
130 |
+
"input_ds = Dataset.from_pandas(pd.DataFrame([document], columns=['input'])).map(tok_func, batched=True)\n",
|
131 |
+
"trainer.predict(input_ds).predictions.astype(float)[0, 0]"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": 8,
|
137 |
+
"id": "d9e18de1",
|
138 |
+
"metadata": {},
|
139 |
+
"outputs": [],
|
140 |
+
"source": [
|
141 |
+
"#|export\n",
|
142 |
+
"def classify_message(text):\n",
|
143 |
+
" input_ds = Dataset.from_pandas(pd.DataFrame([text], columns=['input'])).map(tok_func, batched=True)\n",
|
144 |
+
" spam_prob = np.clip(trainer.predict(input_ds).predictions.astype(float), 0, 1)[0, 0]\n",
|
145 |
+
" return f'{100*spam_prob:.1f}% probability being Spam'"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"cell_type": "code",
|
150 |
+
"execution_count": 9,
|
151 |
+
"id": "c70fc002",
|
152 |
+
"metadata": {
|
153 |
+
"scrolled": true
|
154 |
+
},
|
155 |
+
"outputs": [
|
156 |
+
{
|
157 |
+
"name": "stdout",
|
158 |
+
"output_type": "stream",
|
159 |
+
"text": [
|
160 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
161 |
+
"\n",
|
162 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"data": {
|
167 |
+
"text/plain": []
|
168 |
+
},
|
169 |
+
"execution_count": 9,
|
170 |
+
"metadata": {},
|
171 |
+
"output_type": "execute_result"
|
172 |
+
}
|
173 |
+
],
|
174 |
+
"source": [
|
175 |
+
"#|export\n",
|
176 |
+
"intf = gr.Interface(fn=classify_message, inputs='text', outputs='text')\n",
|
177 |
+
"intf.launch(inline=False)"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": 10,
|
183 |
+
"id": "fdf43e45",
|
184 |
+
"metadata": {},
|
185 |
+
"outputs": [
|
186 |
+
{
|
187 |
+
"name": "stdout",
|
188 |
+
"output_type": "stream",
|
189 |
+
"text": [
|
190 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
191 |
+
"To disable this warning, you can either:\n",
|
192 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
193 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
194 |
+
]
|
195 |
+
}
|
196 |
+
],
|
197 |
+
"source": [
|
198 |
+
"from nbdev.export import nb_export\n",
|
199 |
+
"nb_export('app.ipynb', '.')"
|
200 |
+
]
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"metadata": {
|
204 |
+
"language_info": {
|
205 |
+
"codemirror_mode": {
|
206 |
+
"name": "ipython",
|
207 |
+
"version": 3
|
208 |
+
},
|
209 |
+
"file_extension": ".py",
|
210 |
+
"mimetype": "text/x-python",
|
211 |
+
"name": "python",
|
212 |
+
"nbconvert_exporter": "python",
|
213 |
+
"pygments_lexer": "ipython3",
|
214 |
+
"version": "3.9.13"
|
215 |
+
}
|
216 |
+
},
|
217 |
+
"nbformat": 4,
|
218 |
+
"nbformat_minor": 5
|
219 |
+
}
|
app.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
|
2 |
+
|
3 |
+
# %% auto 0
|
4 |
+
__all__ = ['model', 'tokz', 'trainer', 'intf', 'tok_func', 'classify_message']
|
5 |
+
|
6 |
+
# %% app.ipynb 1
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
import gradio as gr
|
10 |
+
from datasets import Dataset
|
11 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
|
12 |
+
|
13 |
+
# %% app.ipynb 2
|
14 |
+
import warnings, logging
|
15 |
+
# Silence all Python warnings and disable every logging call at WARNING
# severity or below for the whole process, so library chatter stays out of
# the app output. NOTE(review): this also hides deprecation and load-time
# warnings that could explain a startup failure - confirm this is intended.
warnings.simplefilter('ignore')
|
16 |
+
logging.disable(logging.WARNING)
|
17 |
+
|
18 |
+
# %% app.ipynb 3
|
19 |
+
# Load the saved sequence-classification model and its tokenizer from the
# local ./spam_model/ directory (path is relative to the working directory),
# then wrap both in a Trainer. No training arguments or datasets are passed;
# in this file the Trainer is only used for trainer.predict() inside
# classify_message.
model = AutoModelForSequenceClassification.from_pretrained("./spam_model/")
|
20 |
+
tokz = AutoTokenizer.from_pretrained("./spam_model/")
|
21 |
+
trainer = Trainer(model, tokenizer=tokz)
|
22 |
+
|
23 |
+
# %% app.ipynb 5
|
24 |
+
def tok_func(x):
    """Tokenize the ``"input"`` field of *x* with the module-level tokenizer.

    Used as the mapping function for ``Dataset.map(..., batched=True)`` in
    ``classify_message``. Returns whatever ``tokz`` produces for that field.
    """
    texts = x["input"]
    return tokz(texts)
|
26 |
+
|
27 |
+
# %% app.ipynb 7
|
28 |
+
def classify_message(text):
    """Score a single message and describe how likely it is to be spam.

    The message is wrapped in a one-row dataset under the ``input`` column,
    tokenized with ``tok_func`` and passed to ``trainer.predict``. The first
    prediction is clipped to the range [0, 1] and reported as a percentage
    with one decimal place.

    Args:
        text: The message to classify.

    Returns:
        A string of the form ``'<pct>% probability being Spam'``.
    """
    # Build a single-row frame so the dataset column matches what tok_func reads.
    frame = pd.DataFrame([text], columns=['input'])
    tokenized = Dataset.from_pandas(frame).map(tok_func, batched=True)

    # The raw model output is clipped to [0, 1] before it is shown as a
    # probability.
    raw_scores = trainer.predict(tokenized).predictions.astype(float)
    spam_prob = np.clip(raw_scores, 0, 1)[0, 0]

    return f'{100*spam_prob:.1f}% probability being Spam'
|
32 |
+
|
33 |
+
# %% app.ipynb 8
|
34 |
+
# Build a text-in / text-out Gradio interface around classify_message and
# start serving it as soon as the module is executed (module-level side effect).
# inline=False asks Gradio not to embed the UI inline (e.g. in a notebook
# output cell).
intf = gr.Interface(fn=classify_message, inputs='text', outputs='text')
|
35 |
+
intf.launch(inline=False)
|
spam.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
spam_model/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dc046d04c9b0ada7ae6f1dc89c465801799acdf0c9a6aab8c15a1b2d5ca4e91f
|
3 |
+
size 23
|
spam_model/config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4f8920f3374f9490a58131d2b88a658cf64b1b58802e3aca7ea17aab5cea6170
|
3 |
+
size 958
|
spam_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8a93b4ef15c99cbd7d9480586f87fda89efcce815517996220e6c7de3eaf65e
|
3 |
+
size 567623353
|
spam_model/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:311de3f4eed9d76a43bf0d71f10e62e086ca65ccce9f15d5da0d2098bf519ecc
|
3 |
+
size 173
|
spam_model/spm.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
|
3 |
+
size 2464616
|
spam_model/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a86f883318afa11c8c10466f1bf4efaeb6ded28a52cbe57217a8fa0d0a2a87df
|
3 |
+
size 8656551
|
spam_model/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45705cb69567763661139b56f0f1f367dec7a130dfd6dcf86a14fbf174a48d3f
|
3 |
+
size 412
|
spam_model/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b382c01c07616fcd78e546a7e7028ab0c2bcfaf44135ec60d7952c2262f897c4
|
3 |
+
size 3579
|
train.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"! pip install -q datasets","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-05-12T04:05:12.743738Z","iopub.execute_input":"2023-05-12T04:05:12.746149Z","iopub.status.idle":"2023-05-12T04:05:26.016049Z","shell.execute_reply.started":"2023-05-12T04:05:12.746114Z","shell.execute_reply":"2023-05-12T04:05:26.014993Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"markdown","source":"## Import Modules","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom pathlib import Path\nfrom datasets import Dataset, DatasetDict\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\nfrom transformers import TrainingArguments, Trainer\n\nnp.set_printoptions(precision=2, suppress=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:26.019972Z","iopub.execute_input":"2023-05-12T04:05:26.020311Z","iopub.status.idle":"2023-05-12T04:05:38.111349Z","shell.execute_reply.started":"2023-05-12T04:05:26.020282Z","shell.execute_reply":"2023-05-12T04:05:38.110437Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Load Data","metadata":{}},{"cell_type":"code","source":"path = Path('../input/sms-spam-collection-dataset')\n!ls {path}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:38.112562Z","iopub.execute_input":"2023-05-12T04:05:38.113583Z","iopub.status.idle":"2023-05-12T04:05:39.095751Z","shell.execute_reply.started":"2023-05-12T04:05:38.113557Z","shell.execute_reply":"2023-05-12T04:05:39.094550Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"spam.csv\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df = pd.read_csv(path/'spam.csv', encoding='iso-8859-1')[['v1', 
'v2']]\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.099307Z","iopub.execute_input":"2023-05-12T04:05:39.099677Z","iopub.status.idle":"2023-05-12T04:05:39.150580Z","shell.execute_reply.started":"2023-05-12T04:05:39.099646Z","shell.execute_reply":"2023-05-12T04:05:39.149605Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" v1 v2\n0 ham Go until jurong point, crazy.. Available only ...\n1 ham Ok lar... Joking wif u oni...\n2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n3 ham U dun say so early hor... U c already then say...\n4 ham Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 spam This is the 2nd time we have tried 2 contact u...\n5568 ham Will Ì_ b going to esplanade fr home?\n5569 ham Pity, * was in mood for that. So...any other s...\n5570 ham The guy did some bitching but I acted like i'd...\n5571 ham Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>v1</th>\n <th>v2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>ham</td>\n <td>Go until jurong point, crazy.. Available only ...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>ham</td>\n <td>Ok lar... Joking wif u oni...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>spam</td>\n <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>ham</td>\n <td>U dun say so early hor... 
U c already then say...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>ham</td>\n <td>Nah I don't think he goes to usf, he lives aro...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5567</th>\n <td>spam</td>\n <td>This is the 2nd time we have tried 2 contact u...</td>\n </tr>\n <tr>\n <th>5568</th>\n <td>ham</td>\n <td>Will Ì_ b going to esplanade fr home?</td>\n </tr>\n <tr>\n <th>5569</th>\n <td>ham</td>\n <td>Pity, * was in mood for that. So...any other s...</td>\n </tr>\n <tr>\n <th>5570</th>\n <td>ham</td>\n <td>The guy did some bitching but I acted like i'd...</td>\n </tr>\n <tr>\n <th>5571</th>\n <td>ham</td>\n <td>Rofl. Its true to its name</td>\n </tr>\n </tbody>\n</table>\n<p>5572 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.describe(include='object')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.151914Z","iopub.execute_input":"2023-05-12T04:05:39.152242Z","iopub.status.idle":"2023-05-12T04:05:39.174189Z","shell.execute_reply.started":"2023-05-12T04:05:39.152194Z","shell.execute_reply":"2023-05-12T04:05:39.173238Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" v1 v2\ncount 5572 5572\nunique 2 5169\ntop ham Sorry, I'll call later\nfreq 4825 30","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>v1</th>\n <th>v2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>5572</td>\n <td>5572</td>\n </tr>\n <tr>\n <th>unique</th>\n <td>2</td>\n <td>5169</td>\n </tr>\n <tr>\n <th>top</th>\n <td>ham</td>\n <td>Sorry, I'll call later</td>\n </tr>\n <tr>\n <th>freq</th>\n <td>4825</td>\n <td>30</td>\n </tr>\n 
</tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## Data Preprocessing","metadata":{}},{"cell_type":"code","source":"train_df.rename(columns={'v1': 'labels', 'v2': 'input'}, inplace=True)\ntrain_df['labels'] = (train_df['labels'] == 'spam').astype(float)\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.175463Z","iopub.execute_input":"2023-05-12T04:05:39.175854Z","iopub.status.idle":"2023-05-12T04:05:39.190828Z","shell.execute_reply.started":"2023-05-12T04:05:39.175823Z","shell.execute_reply":"2023-05-12T04:05:39.189848Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" labels input\n0 0.0 Go until jurong point, crazy.. Available only ...\n1 0.0 Ok lar... Joking wif u oni...\n2 1.0 Free entry in 2 a wkly comp to win FA Cup fina...\n3 0.0 U dun say so early hor... U c already then say...\n4 0.0 Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 1.0 This is the 2nd time we have tried 2 contact u...\n5568 0.0 Will Ì_ b going to esplanade fr home?\n5569 0.0 Pity, * was in mood for that. So...any other s...\n5570 0.0 The guy did some bitching but I acted like i'd...\n5571 0.0 Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>labels</th>\n <th>input</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n <td>Go until jurong point, crazy.. Available only ...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.0</td>\n <td>Ok lar... 
Joking wif u oni...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.0</td>\n <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.0</td>\n <td>U dun say so early hor... U c already then say...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.0</td>\n <td>Nah I don't think he goes to usf, he lives aro...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5567</th>\n <td>1.0</td>\n <td>This is the 2nd time we have tried 2 contact u...</td>\n </tr>\n <tr>\n <th>5568</th>\n <td>0.0</td>\n <td>Will Ì_ b going to esplanade fr home?</td>\n </tr>\n <tr>\n <th>5569</th>\n <td>0.0</td>\n <td>Pity, * was in mood for that. So...any other s...</td>\n </tr>\n <tr>\n <th>5570</th>\n <td>0.0</td>\n <td>The guy did some bitching but I acted like i'd...</td>\n </tr>\n <tr>\n <th>5571</th>\n <td>0.0</td>\n <td>Rofl. Its true to its name</td>\n </tr>\n </tbody>\n</table>\n<p>5572 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## Tokenization","metadata":{}},{"cell_type":"code","source":"model_nm = 'microsoft/deberta-v3-small'\ntokz = AutoTokenizer.from_pretrained(model_nm)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.192156Z","iopub.execute_input":"2023-05-12T04:05:39.192544Z","iopub.status.idle":"2023-05-12T04:05:42.209496Z","shell.execute_reply.started":"2023-05-12T04:05:39.192514Z","shell.execute_reply":"2023-05-12T04:05:42.208442Z"},"trusted":true},"execution_count":7,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (…)okenizer_config.json: 0%| | 0.00/52.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"981c0c2f622143358e0a065a0c9f69a2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading (…)lve/main/config.json: 0%| | 0.00/578 [00:00<?, 
?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9504127fd82d416d9efcc3ed7be43f8d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading spm.model: 0%| | 0.00/2.46M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e8e8a88bf04e4ec2830950d13684b53f"}},"metadata":{}},{"name":"stderr","text":"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n/opt/conda/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:454: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n warnings.warn(\nSpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n","output_type":"stream"}]},{"cell_type":"code","source":"def tok_func(x):\n return tokz(x[\"input\"])","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:42.211099Z","iopub.execute_input":"2023-05-12T04:05:42.211783Z","iopub.status.idle":"2023-05-12T04:05:42.218420Z","shell.execute_reply.started":"2023-05-12T04:05:42.211750Z","shell.execute_reply":"2023-05-12T04:05:42.217254Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"train_ds = Dataset.from_pandas(train_df)\ntrain_tok_ds = train_ds.map(tok_func, batched=True)\ntrain_tok_ds[0]['input'], 
train_tok_ds[0]['input_ids']","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:42.222675Z","iopub.execute_input":"2023-05-12T04:05:42.223346Z","iopub.status.idle":"2023-05-12T04:05:43.054865Z","shell.execute_reply.started":"2023-05-12T04:05:42.223286Z","shell.execute_reply":"2023-05-12T04:05:43.054024Z"},"trusted":true},"execution_count":9,"outputs":[{"output_type":"display_data","data":{"text/plain":" 0%| | 0/6 [00:00<?, ?ba/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"5d86501cfb93430f86fce3d001ea3f9f"}},"metadata":{}},{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',\n [1,\n 1968,\n 583,\n 18350,\n 49947,\n 582,\n 261,\n 3286,\n 260,\n 260,\n 4955,\n 364,\n 267,\n 5554,\n 1890,\n 2030,\n 426,\n 447,\n 2181,\n 865,\n 11709,\n 260,\n 260,\n 260,\n 33053,\n 343,\n 519,\n 266,\n 4755,\n 37964,\n 260,\n 260,\n 260,\n 2])"},"metadata":{}}]},{"cell_type":"markdown","source":"## Setup Arguments and Train the Model","metadata":{}},{"cell_type":"code","source":"def mse(x, y):\n return ((x-y)**2).mean()\n\ndef mse_d(eval_pred):\n return {'mse': mse(*eval_pred)}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.060895Z","iopub.execute_input":"2023-05-12T04:05:43.063407Z","iopub.status.idle":"2023-05-12T04:05:43.070191Z","shell.execute_reply.started":"2023-05-12T04:05:43.063373Z","shell.execute_reply":"2023-05-12T04:05:43.069298Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"bs = 64\nepochs = 5\nlr = 
5e-6","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.074799Z","iopub.execute_input":"2023-05-12T04:05:43.077166Z","iopub.status.idle":"2023-05-12T04:05:43.082883Z","shell.execute_reply.started":"2023-05-12T04:05:43.077134Z","shell.execute_reply":"2023-05-12T04:05:43.081927Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)\nargs = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,\n evaluation_strategy=\"epoch\", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,\n num_train_epochs=epochs, weight_decay=0.01, report_to='none')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:43.087507Z","iopub.execute_input":"2023-05-12T04:05:43.089876Z","iopub.status.idle":"2023-05-12T04:05:46.087250Z","shell.execute_reply.started":"2023-05-12T04:05:43.089843Z","shell.execute_reply":"2023-05-12T04:05:46.086254Z"},"trusted":true},"execution_count":12,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading pytorch_model.bin: 0%| | 0.00/286M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8b90e4e589af4bbfb271bea95bb9fb69"}},"metadata":{}},{"name":"stderr","text":"Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight']\n- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a 
model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nSome weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","output_type":"stream"}]},{"cell_type":"code","source":"dds = train_tok_ds.train_test_split(0.25, seed=42)\ndds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:46.089078Z","iopub.execute_input":"2023-05-12T04:05:46.089656Z","iopub.status.idle":"2023-05-12T04:05:46.108675Z","shell.execute_reply.started":"2023-05-12T04:05:46.089624Z","shell.execute_reply":"2023-05-12T04:05:46.107266Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"DatasetDict({\n train: Dataset({\n features: ['labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],\n num_rows: 4179\n })\n test: Dataset({\n features: ['labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],\n num_rows: 1393\n })\n})"},"metadata":{}}]},{"cell_type":"code","source":"trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],\n tokenizer=tokz, 
compute_metrics=mse_d)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:46.109932Z","iopub.execute_input":"2023-05-12T04:05:46.110369Z","iopub.status.idle":"2023-05-12T04:05:50.803465Z","shell.execute_reply.started":"2023-05-12T04:05:46.110337Z","shell.execute_reply":"2023-05-12T04:05:50.802512Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"trainer.train()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:50.804744Z","iopub.execute_input":"2023-05-12T04:05:50.805084Z","iopub.status.idle":"2023-05-12T04:08:02.938810Z","shell.execute_reply.started":"2023-05-12T04:05:50.805053Z","shell.execute_reply":"2023-05-12T04:08:02.937759Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n warnings.warn(\nYou're using a DebertaV2TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n <div>\n \n <progress value='330' max='330' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [330/330 02:10, Epoch 5/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n <th>Mse</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>1</td>\n <td>No log</td>\n <td>0.073268</td>\n <td>0.073268</td>\n </tr>\n <tr>\n <td>2</td>\n <td>No log</td>\n <td>0.009850</td>\n <td>0.009850</td>\n </tr>\n <tr>\n <td>3</td>\n <td>No log</td>\n <td>0.008275</td>\n <td>0.008275</td>\n </tr>\n <tr>\n <td>4</td>\n <td>No log</td>\n <td>0.007945</td>\n <td>0.007945</td>\n </tr>\n <tr>\n <td>5</td>\n <td>No log</td>\n <td>0.008093</td>\n <td>0.008093</td>\n </tr>\n </tbody>\n</table><p>"},"metadata":{}},{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=330, training_loss=0.03608651305689956, metrics={'train_runtime': 132.1096, 'train_samples_per_second': 158.164, 'train_steps_per_second': 2.498, 'total_flos': 461121007217520.0, 'train_loss': 0.03608651305689956, 'epoch': 5.0})"},"metadata":{}}]},{"cell_type":"markdown","source":"## Test the model","metadata":{}},{"cell_type":"code","source":"preds = 
trainer.predict(dds['test']).predictions.astype(float)\npreds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:02.940333Z","iopub.execute_input":"2023-05-12T04:08:02.940794Z","iopub.status.idle":"2023-05-12T04:08:05.616149Z","shell.execute_reply.started":"2023-05-12T04:08:02.940759Z","shell.execute_reply":"2023-05-12T04:08:05.615130Z"},"trusted":true},"execution_count":16,"outputs":[{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":""},"metadata":{}},{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"array([-0.03, -0.01, -0.04, ..., -0.03, -0.02, 1.12])"},"metadata":{}}]},{"cell_type":"code","source":"output = np.zeros(len(preds))\noutput[preds >= 0.5] = 1.0\noutput","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.617768Z","iopub.execute_input":"2023-05-12T04:08:05.618157Z","iopub.status.idle":"2023-05-12T04:08:05.626184Z","shell.execute_reply.started":"2023-05-12T04:08:05.618121Z","shell.execute_reply":"2023-05-12T04:08:05.625196Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"real = np.array(dds['test']['labels'])\nreal","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.627338Z","iopub.execute_input":"2023-05-12T04:08:05.628053Z","iopub.status.idle":"2023-05-12T04:08:05.646652Z","shell.execute_reply.started":"2023-05-12T04:08:05.628013Z","shell.execute_reply":"2023-05-12T04:08:05.645699Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"(output == real).sum() / 
len(real)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.647820Z","iopub.execute_input":"2023-05-12T04:08:05.648218Z","iopub.status.idle":"2023-05-12T04:08:05.654599Z","shell.execute_reply.started":"2023-05-12T04:08:05.648172Z","shell.execute_reply":"2023-05-12T04:08:05.653704Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"0.9921033740129217"},"metadata":{}}]},{"cell_type":"markdown","source":"## Save the Model","metadata":{}},{"cell_type":"code","source":"trainer.save_model(\"./spam_model\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:44.088969Z","iopub.execute_input":"2023-05-12T04:08:44.089372Z","iopub.status.idle":"2023-05-12T04:08:45.285254Z","shell.execute_reply.started":"2023-05-12T04:08:44.089342Z","shell.execute_reply":"2023-05-12T04:08:45.284198Z"},"trusted":true},"execution_count":20,"outputs":[]}]}
|