{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"eMSthPt-5jKX"},"outputs":[],"source":[]},{"cell_type":"markdown","source":["**BERT TOKENIZATION IMPLEMENTATION**"],"metadata":{"id":"oLhNYAvZmv59"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":73},"id":"4gJPGZBYtT4E","outputId":"76a89f39-0c16-4b0d-d476-ee1c0318475e","executionInfo":{"status":"ok","timestamp":1698405953146,"user_tz":-330,"elapsed":86775,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}}},"outputs":[{"output_type":"display_data","data":{"text/plain":[""],"text/html":["\n"," \n"," \n"," Upload widget is only available when the cell has been executed in the\n"," current browser session. Please rerun this cell to enable.\n"," \n"," "]},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Saving Phishing.csv to Phishing.csv\n"]}],"source":["import matplotlib.pyplot as plt\n","%matplotlib inline\n","from google.colab import files\n","upload = files.upload()\n","import pandas as pd\n","import numpy as np"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"szMKJ8D-tpMv"},"outputs":[],"source":["rachdatahug = pd.read_csv(\"RACH_URL_5000.csv\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"chwt8TE9tv0M"},"outputs":[],"source":["X=list(rachdatahug['URL'])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-L4ordBktz_N"},"outputs":[],"source":["y=list(rachdatahug['Label'])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"qJIJ_zoDt5k9"},"outputs":[],"source":["y = list(pd.get_dummies(y,drop_first=True)['good'])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DxioXkNXt81V"},"outputs":[],"source":["from sklearn.model_selection import train_test_split\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state= 0)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5958,"status":"ok","timestamp":1690449517058,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"},"user_tz":-330},"id":"qLh9nOZFuAlN","outputId":"37362513-1503-4793-cd66-4c9b63275ca9"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.16.4)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.24.3)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n","Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n","Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.6.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n"]}],"source":["!pip install transformers"]},{"cell_type":"markdown","source":["**BERT TOKENIZATION IMPLEMENTATION**"],"metadata":{"id":"2KJ0TTw_m5ot"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":33055,"status":"ok","timestamp":1690449552464,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"},"user_tz":-330},"id":"cJW_-FKav4FJ","outputId":"67915058-da1d-4525-f641-572edf24b561"},"outputs":[{"output_type":"stream","name":"stderr","text":["Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']\n","- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n","All the weights of TFBertModel were initialized from the PyTorch model.\n","If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n"]}],"source":["from transformers import BertTokenizer, TFBertModel\n","tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n","model = TFBertModel.from_pretrained(\"bert-base-uncased\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LehPr2o0wEEJ"},"outputs":[],"source":["train_encodings = tokenizer(X_train, truncation=True, padding=True)\n","test_encodings = tokenizer(X_test, truncation=True, padding=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"__qAgJQ5wLUh"},"outputs":[],"source":["import tensorflow as tf\n","\n","train_dataset = tf.data.Dataset.from_tensor_slices((\n"," dict(train_encodings),\n"," y_train\n","))\n","\n","test_dataset = tf.data.Dataset.from_tensor_slices((\n"," dict(test_encodings),\n"," y_test\n","))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":358,"status":"ok","timestamp":1690449587891,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"},"user_tz":-330},"id":"6fYis2PNx2XG","outputId":"3807f06f-4faf-47d8-d058-3560e209c9bc"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(408,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(408,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(408,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>"]},"metadata":{},"execution_count":11}],"source":["train_dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DR_mDIgiwXSg"},"outputs":[],"source":["from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments\n","\n","training_args = TFTrainingArguments(\n"," output_dir='./results', # output directory\n"," num_train_epochs=2, # total number of training epochs\n"," per_device_train_batch_size=8, # batch size per device during training\n"," per_device_eval_batch_size=16, # batch size for evaluation\n"," warmup_steps=500, # number of warmup steps for learning rate scheduler\n"," weight_decay=0.01, # strength of weight decay\n"," logging_dir='./logs', # directory for storing logs\n"," eval_steps=10,\n",")"]},{"cell_type":"code","source":[],"metadata":{"id":"Ug5nfiE1diw0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!pip install --upgrade tensorflow"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5-CEnjUGoZ-i","executionInfo":{"status":"ok","timestamp":1690449599295,"user_tz":-330,"elapsed":6434,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"2d708e1b-d86f-43d3-a6d5-a5ce98327798"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: tensorflow in /usr/local/lib/python3.10/dist-packages (2.13.0)\n","Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.4.0)\n","Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.6.3)\n","Requirement already satisfied: flatbuffers>=23.1.21 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (23.5.26)\n","Requirement already satisfied: gast<=0.4.0,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.4.0)\n","Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\n","Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.56.2)\n","Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.8.0)\n","Requirement already satisfied: keras<2.14,>=2.13.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.13.1)\n","Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (16.0.6)\n","Requirement already satisfied: numpy<=1.24.3,>=1.22 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.24.3)\n","Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.3.0)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow) (23.1)\n","Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.20.3)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow) (67.7.2)\n","Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.16.0)\n","Requirement already satisfied: tensorboard<2.14,>=2.13 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.13.0)\n","Requirement already satisfied: tensorflow-estimator<2.14,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.13.0)\n","Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.3.0)\n","Requirement already satisfied: typing-extensions<4.6.0,>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (4.5.0)\n","Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.14.1)\n","Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.32.0)\n","Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow) (0.41.0)\n","Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (2.17.3)\n","Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (1.0.0)\n","Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (3.4.3)\n","Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (2.27.1)\n","Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (0.7.1)\n","Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.14,>=2.13->tensorflow) (2.3.6)\n","Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.14,>=2.13->tensorflow) (5.3.1)\n","Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.14,>=2.13->tensorflow) (0.3.0)\n","Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.14,>=2.13->tensorflow) (4.9)\n","Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard<2.14,>=2.13->tensorflow) (1.3.1)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.14,>=2.13->tensorflow) (1.26.16)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.14,>=2.13->tensorflow) (2023.7.22)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.14,>=2.13->tensorflow) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.14,>=2.13->tensorflow) (3.4)\n","Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.14,>=2.13->tensorflow) (2.1.3)\n","Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.14,>=2.13->tensorflow) (0.5.0)\n","Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard<2.14,>=2.13->tensorflow) (3.2.2)\n"]}]},{"cell_type":"code","source":["with training_args.strategy.scope():\n"," model = TFBertForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n","\n","trainer = TFTrainer(\n"," model=model, # the instantiated 🤗 Transformers model to be trained\n"," args=training_args, # training arguments, defined above\n"," train_dataset=train_dataset, # training dataset\n"," eval_dataset=test_dataset # evaluation dataset\n",")\n","\n","trainer.train()"],"metadata":{"id":"3rZlYMeurarz"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":[],"metadata":{"id":"LuSy__xZmt6Q"}},{"cell_type":"code","source":["trainer.evaluate(test_dataset)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XO9rZVJmTh9s","executionInfo":{"status":"ok","timestamp":1690199001753,"user_tz":-330,"elapsed":21101,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"f834f67e-d198-4a1d-d3af-159a0acd333f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'eval_loss': 0.07492480855999571}"]},"metadata":{},"execution_count":16}]},{"cell_type":"code","source":["trainer.predict(test_dataset)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"SQ3GH-V_TtkV","executionInfo":{"status":"ok","timestamp":1690199055253,"user_tz":-330,"elapsed":41424,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"51c88fe7-e47c-4bd3-e3d5-1f1d00c12da5"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["PredictionOutput(predictions=array([[-3.1541142, 2.8970544],\n"," [-3.8023884, 3.4152956],\n"," [-3.8407516, 3.470047 ],\n"," ...,\n"," [ 3.3833935, -2.9478166],\n"," [-3.6145408, 3.3758733],\n"," [ 3.3220055, -2.9492233]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 0], dtype=int32), metrics={'eval_loss': 0.07491695519649622})"]},"metadata":{},"execution_count":17}]},{"cell_type":"code","source":["trainer.predict(test_dataset)[1].shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gxS5XUGsT6nA","executionInfo":{"status":"ok","timestamp":1690199076288,"user_tz":-330,"elapsed":18114,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"d08e88ea-a65c-48f1-c3a6-ff9e0c34a78a"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(1047,)"]},"metadata":{},"execution_count":18}]},{"cell_type":"code","source":["output=trainer.predict(test_dataset)[1]"],"metadata":{"id":"59COgBpSUIJq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from sklearn.metrics import confusion_matrix\n","\n","cm=confusion_matrix(y_test,output)\n","cm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"z58IlkO9UYnb","executionInfo":{"status":"ok","timestamp":1690199101230,"user_tz":-330,"elapsed":393,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"ef976110-54c5-4920-e04a-4f341ebdf4be"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[507, 0],\n"," [ 0, 540]])"]},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":["from sklearn import metrics"],"metadata":{"id":"NjA5j3VIUeJ2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["metrics.accuracy_score(y_test, output)"],"metadata":{"id":"VoDafYtXUpeO","executionInfo":{"status":"ok","timestamp":1690199110033,"user_tz":-330,"elapsed":390,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"36d72e82-5b57-4847-9859-d6112bca3c92","colab":{"base_uri":"https://localhost:8080/"}},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1.0"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["from sklearn.metrics import roc_curve\n","from sklearn.metrics import roc_auc_score\n","from sklearn.metrics import roc_curve, auc\n","from matplotlib import pyplot"],"metadata":{"id":"j4ZqgIHse0wP"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["fpr, tpr, _ = roc_curve(y_test,output)\n","roc_auc = auc(fpr, tpr)\n","\n","# Plot the ROC curve\n","plt.figure()\n","plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)\n","plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n","plt.xlim([0.0, 1.0])\n","plt.ylim([0.0, 1.05])\n","plt.xlabel('False Positive Rate')\n","plt.ylabel('True Positive Rate')\n","plt.title('Receiver Operating Characteristic (ROC)')\n","plt.legend(loc='lower right')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"id":"RruwkabmvdKa","executionInfo":{"status":"ok","timestamp":1690199951824,"user_tz":-330,"elapsed":1099,"user":{"displayName":"RACHANA POTPELWAR","userId":"12339588403377734923"}},"outputId":"028c3633-ac5b-4159-f2b5-16459488f11e"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]}],"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOPNebWsFwOMkgys65v9a3B"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}