{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"! pip install -q datasets","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-05-12T04:05:12.743738Z","iopub.execute_input":"2023-05-12T04:05:12.746149Z","iopub.status.idle":"2023-05-12T04:05:26.016049Z","shell.execute_reply.started":"2023-05-12T04:05:12.746114Z","shell.execute_reply":"2023-05-12T04:05:26.014993Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"markdown","source":"## Import Modules","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom pathlib import Path\nfrom datasets import Dataset, DatasetDict\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\nfrom transformers import TrainingArguments, Trainer\n\nnp.set_printoptions(precision=2, suppress=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:26.019972Z","iopub.execute_input":"2023-05-12T04:05:26.020311Z","iopub.status.idle":"2023-05-12T04:05:38.111349Z","shell.execute_reply.started":"2023-05-12T04:05:26.020282Z","shell.execute_reply":"2023-05-12T04:05:38.110437Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Load Data","metadata":{}},{"cell_type":"code","source":"path = Path('../input/sms-spam-collection-dataset')\n!ls {path}","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:38.112562Z","iopub.execute_input":"2023-05-12T04:05:38.113583Z","iopub.status.idle":"2023-05-12T04:05:39.095751Z","shell.execute_reply.started":"2023-05-12T04:05:38.113557Z","shell.execute_reply":"2023-05-12T04:05:39.094550Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"spam.csv\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df = pd.read_csv(path/'spam.csv', encoding='iso-8859-1')[['v1', 'v2']]\ntrain_df","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:05:39.099307Z","iopub.execute_input":"2023-05-12T04:05:39.099677Z","iopub.status.idle":"2023-05-12T04:05:39.150580Z","shell.execute_reply.started":"2023-05-12T04:05:39.099646Z","shell.execute_reply":"2023-05-12T04:05:39.149605Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" v1 v2\n0 ham Go until jurong point, crazy.. Available only ...\n1 ham Ok lar... Joking wif u oni...\n2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n3 ham U dun say so early hor... U c already then say...\n4 ham Nah I don't think he goes to usf, he lives aro...\n... ... ...\n5567 spam This is the 2nd time we have tried 2 contact u...\n5568 ham Will Ì_ b going to esplanade fr home?\n5569 ham Pity, * was in mood for that. So...any other s...\n5570 ham The guy did some bitching but I acted like i'd...\n5571 ham Rofl. Its true to its name\n\n[5572 rows x 2 columns]","text/html":"
\n | v1 | \nv2 | \n
---|---|---|
0 | \nham | \nGo until jurong point, crazy.. Available only ... | \n
1 | \nham | \nOk lar... Joking wif u oni... | \n
2 | \nspam | \nFree entry in 2 a wkly comp to win FA Cup fina... | \n
3 | \nham | \nU dun say so early hor... U c already then say... | \n
4 | \nham | \nNah I don't think he goes to usf, he lives aro... | \n
... | \n... | \n... | \n
5567 | \nspam | \nThis is the 2nd time we have tried 2 contact u... | \n
5568 | \nham | \nWill Ì_ b going to esplanade fr home? | \n
5569 | \nham | \nPity, * was in mood for that. So...any other s... | \n
5570 | \nham | \nThe guy did some bitching but I acted like i'd... | \n
5571 | \nham | \nRofl. Its true to its name | \n
5572 rows × 2 columns
\n\n | v1 | \nv2 | \n
---|---|---|
count | \n5572 | \n5572 | \n
unique | \n2 | \n5169 | \n
top | \nham | \nSorry, I'll call later | \n
freq | \n4825 | \n30 | \n
\n | labels | \ninput | \n
---|---|---|
0 | \n0.0 | \nGo until jurong point, crazy.. Available only ... | \n
1 | \n0.0 | \nOk lar... Joking wif u oni... | \n
2 | \n1.0 | \nFree entry in 2 a wkly comp to win FA Cup fina... | \n
3 | \n0.0 | \nU dun say so early hor... U c already then say... | \n
4 | \n0.0 | \nNah I don't think he goes to usf, he lives aro... | \n
... | \n... | \n... | \n
5567 | \n1.0 | \nThis is the 2nd time we have tried 2 contact u... | \n
5568 | \n0.0 | \nWill Ì_ b going to esplanade fr home? | \n
5569 | \n0.0 | \nPity, * was in mood for that. So...any other s... | \n
5570 | \n0.0 | \nThe guy did some bitching but I acted like i'd... | \n
5571 | \n0.0 | \nRofl. Its true to its name | \n
5572 rows × 2 columns
\nEpoch | \nTraining Loss | \nValidation Loss | \nMse | \n
---|---|---|---|
1 | \nNo log | \n0.073268 | \n0.073268 | \n
2 | \nNo log | \n0.009850 | \n0.009850 | \n
3 | \nNo log | \n0.008275 | \n0.008275 | \n
4 | \nNo log | \n0.007945 | \n0.007945 | \n
5 | \nNo log | \n0.008093 | \n0.008093 | \n
"},"metadata":{}},{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=330, training_loss=0.03608651305689956, metrics={'train_runtime': 132.1096, 'train_samples_per_second': 158.164, 'train_steps_per_second': 2.498, 'total_flos': 461121007217520.0, 'train_loss': 0.03608651305689956, 'epoch': 5.0})"},"metadata":{}}]},{"cell_type":"markdown","source":"## Test the model","metadata":{}},{"cell_type":"code","source":"preds = trainer.predict(dds['test']).predictions.astype(float)\npreds","metadata":{"execution":{"iopub.status.busy":"2023-05-12T04:08:02.940333Z","iopub.execute_input":"2023-05-12T04:08:02.940794Z","iopub.status.idle":"2023-05-12T04:08:05.616149Z","shell.execute_reply.started":"2023-05-12T04:08:02.940759Z","shell.execute_reply":"2023-05-12T04:08:05.615130Z"},"trusted":true},"execution_count":16,"outputs":[{"output_type":"display_data","data":{"text/plain":"