{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# %pip (rather than !pip) installs into the environment of the running kernel.\n# TODO: pin an exact version (datasets==X.Y.Z) once confirmed, so re-runs are reproducible.\n%pip install -q datasets","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-05-12T04:05:12.743738Z","iopub.execute_input":"2023-05-12T04:05:12.746149Z","iopub.status.idle":"2023-05-12T04:05:26.016049Z","shell.execute_reply.started":"2023-05-12T04:05:12.746114Z","shell.execute_reply":"2023-05-12T04:05:26.014993Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"markdown","source":"## Import Modules","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom pathlib import Path\nfrom datasets import Dataset, DatasetDict\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\nfrom transformers import TrainingArguments, Trainer\n\nnp.set_printoptions(precision=2, suppress=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:26.019972Z","iopub.execute_input":"2023-05-12T04:05:26.020311Z","iopub.status.idle":"2023-05-12T04:05:38.111349Z","shell.execute_reply.started":"2023-05-12T04:05:26.020282Z","shell.execute_reply":"2023-05-12T04:05:38.110437Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Load Data","metadata":{}},{"cell_type":"code","source":"path = Path('../input/sms-spam-collection-dataset')\n!ls {path}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:38.112562Z","iopub.execute_input":"2023-05-12T04:05:38.113583Z","iopub.status.idle":"2023-05-12T04:05:39.095751Z","shell.execute_reply.started":"2023-05-12T04:05:38.113557Z","shell.execute_reply":"2023-05-12T04:05:39.094550Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"spam.csv\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df = pd.read_csv(path/'spam.csv', encoding='iso-8859-1')[['v1', 
'v2']]\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.099307Z","iopub.execute_input":"2023-05-12T04:05:39.099677Z","iopub.status.idle":"2023-05-12T04:05:39.150580Z","shell.execute_reply.started":"2023-05-12T04:05:39.099646Z","shell.execute_reply":"2023-05-12T04:05:39.149605Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" v1 v2\n0 ham Go until jurong point, crazy.. Available only ...\n1 ham Ok lar... Joking wif u oni...\n2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n3 ham U dun say so early hor... U c already then say...\n4 ham Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 spam This is the 2nd time we have tried 2 contact u...\n5568 ham Will Ì_ b going to esplanade fr home?\n5569 ham Pity, * was in mood for that. So...any other s...\n5570 ham The guy did some bitching but I acted like i'd...\n5571 ham Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
v1v2
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
.........
5567spamThis is the 2nd time we have tried 2 contact u...
5568hamWill Ì_ b going to esplanade fr home?
5569hamPity, * was in mood for that. So...any other s...
5570hamThe guy did some bitching but I acted like i'd...
5571hamRofl. Its true to its name
\n

5572 rows × 2 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"train_df.describe(include='object')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.151914Z","iopub.execute_input":"2023-05-12T04:05:39.152242Z","iopub.status.idle":"2023-05-12T04:05:39.174189Z","shell.execute_reply.started":"2023-05-12T04:05:39.152194Z","shell.execute_reply":"2023-05-12T04:05:39.173238Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" v1 v2\ncount 5572 5572\nunique 2 5169\ntop ham Sorry, I'll call later\nfreq 4825 30","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
v1v2
count55725572
unique25169
tophamSorry, I'll call later
freq482530
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":"## Data Preprocessing","metadata":{}},{"cell_type":"code","source":"# Guarded so the cell is idempotent: once 'v1'/'v2' have been renamed and encoded,\n# a re-run must not compare the float labels to 'spam' (that would zero every label).\nif 'v1' in train_df.columns:\n    train_df = train_df.rename(columns={'v1': 'labels', 'v2': 'input'})\n    train_df['labels'] = (train_df['labels'] == 'spam').astype(float)\n\n# Sanity check: labels are the floats 0.0 (ham) / 1.0 (spam) and nothing else.\nassert set(train_df['labels'].unique()) <= {0.0, 1.0}\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.175463Z","iopub.execute_input":"2023-05-12T04:05:39.175854Z","iopub.status.idle":"2023-05-12T04:05:39.190828Z","shell.execute_reply.started":"2023-05-12T04:05:39.175823Z","shell.execute_reply":"2023-05-12T04:05:39.189848Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" labels input\n0 0.0 Go until jurong point, crazy.. Available only ...\n1 0.0 Ok lar... Joking wif u oni...\n2 1.0 Free entry in 2 a wkly comp to win FA Cup fina...\n3 0.0 U dun say so early hor... U c already then say...\n4 0.0 Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 1.0 This is the 2nd time we have tried 2 contact u...\n5568 0.0 Will Ì_ b going to esplanade fr home?\n5569 0.0 Pity, * was in mood for that. So...any other s...\n5570 0.0 The guy did some bitching but I acted like i'd...\n5571 0.0 Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
labelsinput
00.0Go until jurong point, crazy.. Available only ...
10.0Ok lar... Joking wif u oni...
21.0Free entry in 2 a wkly comp to win FA Cup fina...
30.0U dun say so early hor... U c already then say...
40.0Nah I don't think he goes to usf, he lives aro...
.........
55671.0This is the 2nd time we have tried 2 contact u...
55680.0Will Ì_ b going to esplanade fr home?
55690.0Pity, * was in mood for that. So...any other s...
55700.0The guy did some bitching but I acted like i'd...
55710.0Rofl. Its true to its name
\n

5572 rows × 2 columns

\n
"},"metadata":{}}]},{"cell_type":"markdown","source":"## Tokenization","metadata":{}},{"cell_type":"code","source":"model_nm = 'microsoft/deberta-v3-small'\ntokz = AutoTokenizer.from_pretrained(model_nm)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.192156Z","iopub.execute_input":"2023-05-12T04:05:39.192544Z","iopub.status.idle":"2023-05-12T04:05:42.209496Z","shell.execute_reply.started":"2023-05-12T04:05:39.192514Z","shell.execute_reply":"2023-05-12T04:05:42.208442Z"},"trusted":true},"execution_count":7,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (…)okenizer_config.json: 0%| | 0.00/52.0 [00:00","text/html":"\n
\n \n \n [330/330 02:10, Epoch 5/5]\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
EpochTraining LossValidation LossMse
1No log0.0732680.073268
2No log0.0098500.009850
3No log0.0082750.008275
4No log0.0079450.007945
5No log0.0080930.008093

"},"metadata":{}},{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=330, training_loss=0.03608651305689956, metrics={'train_runtime': 132.1096, 'train_samples_per_second': 158.164, 'train_steps_per_second': 2.498, 'total_flos': 461121007217520.0, 'train_loss': 0.03608651305689956, 'epoch': 5.0})"},"metadata":{}}]},{"cell_type":"markdown","source":"## Test the model","metadata":{}},{"cell_type":"code","source":"# Raw model scores for the held-out split, cast to float64 for the NumPy steps below.\npreds = trainer.predict(dds['test']).predictions.astype(float)\npreds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:02.940333Z","iopub.execute_input":"2023-05-12T04:08:02.940794Z","iopub.status.idle":"2023-05-12T04:08:05.616149Z","shell.execute_reply.started":"2023-05-12T04:08:02.940759Z","shell.execute_reply":"2023-05-12T04:08:05.615130Z"},"trusted":true},"execution_count":16,"outputs":[{"output_type":"display_data","data":{"text/plain":"","text/html":""},"metadata":{}},{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"array([-0.03, -0.01, -0.04, ..., -0.03, -0.02, 1.12])"},"metadata":{}}]},{"cell_type":"code","source":"# Scores to hard labels: 1.0 (spam) at or above the 0.5 midpoint, otherwise 0.0 (ham).\noutput = np.where(preds >= 0.5, 1.0, 0.0)\noutput","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.617768Z","iopub.execute_input":"2023-05-12T04:08:05.618157Z","iopub.status.idle":"2023-05-12T04:08:05.626184Z","shell.execute_reply.started":"2023-05-12T04:08:05.618121Z","shell.execute_reply":"2023-05-12T04:08:05.625196Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"real = 
np.array(dds['test']['labels'])\nreal","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.627338Z","iopub.execute_input":"2023-05-12T04:08:05.628053Z","iopub.status.idle":"2023-05-12T04:08:05.646652Z","shell.execute_reply.started":"2023-05-12T04:08:05.628013Z","shell.execute_reply":"2023-05-12T04:08:05.645699Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"array([0., 0., 0., ..., 0., 0., 1.])"},"metadata":{}}]},{"cell_type":"code","source":"# Accuracy: fraction of test rows whose thresholded prediction equals the true label.\n(output == real).mean()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:05.647820Z","iopub.execute_input":"2023-05-12T04:08:05.648218Z","iopub.status.idle":"2023-05-12T04:08:05.654599Z","shell.execute_reply.started":"2023-05-12T04:08:05.648172Z","shell.execute_reply":"2023-05-12T04:08:05.653704Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"0.9921033740129217"},"metadata":{}}]},{"cell_type":"markdown","source":"## Save the Model","metadata":{}},{"cell_type":"code","source":"# Write the fine-tuned model to a local directory.\ntrainer.save_model(\"./spam_model\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:44.088969Z","iopub.execute_input":"2023-05-12T04:08:44.089372Z","iopub.status.idle":"2023-05-12T04:08:45.285254Z","shell.execute_reply.started":"2023-05-12T04:08:44.089342Z","shell.execute_reply":"2023-05-12T04:08:45.284198Z"},"trusted":true},"execution_count":20,"outputs":[]}]}