Akhil0-o
/

Phishing_detection

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Load the required libraries\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import classification_report\n",
+    "from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments\n",
+    "from transformers import TrainerCallback\n",
+    "import os\n",
+    "from transformers import TrainingArguments, Trainer\n",
+    "#Create directory to save model\n",
+    "os.makedirs(\"./best_model\", exist_ok=True)\n",
+    "\n",
+    "#Create a callback class to save the best model\n",
+    "class SaveBestModelCallback(TrainerCallback):\n",
+    "    \"\"\"Trainer callback that saves the model whenever the evaluation F1 improves.\n",
+    "\n",
+    "    Reads the module-level `model` and `tokenizer` and writes both to\n",
+    "    ./best_model each time `eval_f1` exceeds the best value seen so far.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    #Initialize the class variables and values\n",
+    "    def __init__(self):\n",
+    "        self.best_f1_score = 0\n",
+    "\n",
+    "    #Get the evaluation metrics\n",
+    "    def on_evaluate(self, args, state, control, metrics=None, **kwargs):\n",
+    "        \"\"\"Save model and tokenizer when this evaluation's F1 is a new best.\n",
+    "\n",
+    "        Args:\n",
+    "            args, state, control: Standard callback arguments; not used here.\n",
+    "            metrics: Metrics dict of the evaluation that just finished.\n",
+    "            **kwargs: Remaining callback arguments; not used here.\n",
+    "        \"\"\"\n",
+    "        # Use the metrics handed to the callback. Calling trainer.evaluate() here\n",
+    "        # would fire on_evaluate again and recurse without end.\n",
+    "        f1_score = (metrics or {}).get(\"eval_f1\")\n",
+    "        if f1_score is None:\n",
+    "            # No F1 reported for this evaluation, so there is nothing to compare\n",
+    "            return\n",
+    "        #Save the model if the current f1 score is higher than the best f1 score so far\n",
+    "        if f1_score > self.best_f1_score:\n",
+    "            self.best_f1_score = f1_score\n",
+    "            model.save_pretrained(\"./best_model\")\n",
+    "            tokenizer.save_pretrained(\"./best_model\")\n",
+    "            #Print the f1 score\n",
+    "            print(f\"New best model saved with F1 score: {f1_score}\")\n",
+    "\n",
+    "# Load and preprocess the data\n",
+    "TEST_ROW_LIMIT = 16171  # only the first rows of the test file are used\n",
+    "\n",
+    "def _load_links(csv_path, max_rows=None):\n",
+    "    \"\"\"Read a links CSV and return a clean ['email', 'label'] frame.\n",
+    "\n",
+    "    Args:\n",
+    "        csv_path: Path to a CSV file containing 'email' and 'label' columns.\n",
+    "        max_rows: If given, keep only the first max_rows rows.\n",
+    "\n",
+    "    Returns:\n",
+    "        DataFrame whose 'email' values are str and whose 'label' values are int.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If a label is missing or cannot be cast to int.\n",
+    "    \"\"\"\n",
+    "    frame = pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')\n",
+    "    if max_rows is not None:\n",
+    "        frame = frame.iloc[:max_rows]\n",
+    "    # .copy() so the assignments below modify an independent frame, not a slice\n",
+    "    frame = frame[['email', 'label']].copy()\n",
+    "    # The tokenizer only accepts strings, so coerce the column itself:\n",
+    "    # non-string cells (e.g. NaN) become their str() form\n",
+    "    frame['email'] = frame['email'].astype(str)\n",
+    "    # Fails loudly on missing labels instead of silently relabelling them\n",
+    "    frame['label'] = frame['label'].astype(int)\n",
+    "    return frame\n",
+    "\n",
+    "train_data = _load_links(\"train_links.csv\")\n",
+    "test_data = _load_links(\"test_links.csv\", max_rows=TEST_ROW_LIMIT)\n",
+    "\n",
+    "# Plain-list views of the cleaned columns, kept under their original names\n",
+    "train_email_list = train_data[\"email\"].tolist()\n",
+    "train_label_list = train_data[\"label\"].tolist()\n",
+    "test_email_list = test_data[\"email\"].tolist()\n",
+    "test_label_list = test_data[\"label\"].tolist()\n",
+    "\n",
+    "#Load the RoBERTa tokenizer\n",
+    "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
+    "\n",
+    "#Preprocess the data\n",
+    "def preprocess(df):\n",
+    "    \"\"\"Tokenize the 'email' column of df and turn its 'label' column into a tensor.\n",
+    "\n",
+    "    Returns:\n",
+    "        (encodings, targets): the tokenizer output as PyTorch tensors, padded\n",
+    "        and truncated to at most 512 tokens, and a tensor of the labels.\n",
+    "    \"\"\"\n",
+    "    texts = df[\"email\"].tolist()\n",
+    "    encodings = tokenizer(\n",
+    "        texts,\n",
+    "        max_length=512,\n",
+    "        truncation=True,\n",
+    "        padding=True,\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "    targets = torch.tensor(df[\"label\"].tolist())\n",
+    "    return encodings, targets\n",
+    "\n",
+    "# Encode both splits once, up front\n",
+    "train_inputs, train_labels = preprocess(train_data)\n",
+    "test_inputs, test_labels = preprocess(test_data)\n",
+    "\n",
+    "# Custom dataset class\n",
+    "class CustomDataset(Dataset):\n",
+    "    \"\"\"Dataset that pairs tokenizer outputs with their labels, one sample per label.\"\"\"\n",
+    "\n",
+    "    def __init__(self, inputs, labels):\n",
+    "        # inputs: mapping of field name -> indexable values; labels: indexable targets\n",
+    "        self.labels = labels\n",
+    "        self.inputs = inputs\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        # The number of labels defines the dataset size\n",
+    "        return len(self.labels)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        # Pick row idx out of every input field, then attach the label\n",
+    "        sample = {}\n",
+    "        for field, values in self.inputs.items():\n",
+    "            sample[field] = values[idx]\n",
+    "        sample[\"labels\"] = self.labels[idx]\n",
+    "        return sample\n",
+    "\n",
+    "# Prepare the RoBERTa model for training\n",
+    "model = RobertaForSequenceClassification.from_pretrained(\"roberta-base\", num_labels=2)\n",
+    "\n",
+    "# Define the Trainer and TrainingArguments\n",
+    "training_args = TrainingArguments(\n",
+    "    # Output locations\n",
+    "    output_dir=\"./results\",\n",
+    "    logging_dir=\"./logs\",\n",
+    "    # Schedule: one epoch, evaluated at the end of each epoch\n",
+    "    num_train_epochs=1,\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    logging_steps=100,\n",
+    "    save_steps=1000,\n",
+    "    # Batch sizes\n",
+    "    per_device_train_batch_size=8,\n",
+    "    per_device_eval_batch_size=16,\n",
+    "    # Optimizer settings\n",
+    "    learning_rate=2e-5,\n",
+    "    weight_decay=0.01,\n",
+    ")\n",
+    "\n",
+    "#Define the compute metrics function\n",
+    "def compute_metrics(pred):\n",
+    "    \"\"\"Return the weighted-average F1 score for an evaluation result.\n",
+    "\n",
+    "    Args:\n",
+    "        pred: Object exposing `label_ids` and `predictions`; the predicted\n",
+    "            class is the argmax over the last axis of `predictions`.\n",
+    "\n",
+    "    Returns:\n",
+    "        Dict with a single key, \"f1\".\n",
+    "    \"\"\"\n",
+    "    truth = pred.label_ids\n",
+    "    predicted = pred.predictions.argmax(-1)\n",
+    "    report = classification_report(truth, predicted, output_dict=True)\n",
+    "    weighted = report[\"weighted avg\"]\n",
+    "    return {\"f1\": weighted[\"f1-score\"]}\n",
+    "\n",
+    "\n",
+    "#Initialize the trainer\n",
+    "train_dataset = CustomDataset(train_inputs, train_labels)\n",
+    "eval_dataset = CustomDataset(test_inputs, test_labels)\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    "    eval_dataset=eval_dataset,\n",
+    "    compute_metrics=compute_metrics,\n",
+    ")\n",
+    "\n",
+    "#trainer.add_callback(SaveBestModelCallback())\n",
+    "trainer.train()\n",
+    "\n",
+    "# Evaluate the model\n",
+    "eval_results = trainer.evaluate()\n",
+    "\n",
+    "#Printing the results\n",
+    "print(\"Evaluation results:\", eval_results)\n",
+    "\n",
+    "\n",
+    "#Save the trained model and tokenizer (the best-model callback above is disabled,\n",
+    "#so this is the model as it stands after training)\n",
+    "for save_target in ('./best_model', './best_model.h5'):\n",
+    "    model.save_pretrained(save_target)\n",
+    "tokenizer.save_pretrained(\"./best_model\")\n",
+    "\n",
+    "\"\"\"\n",
+    "best_model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n",
+    "best_tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")\n",
+    "To use the saved model in a Google Chrome extension, you need a server-side service or a cloud-based API that the extension can call to reach the trained model.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the model and tokenizer that the training cell saved\n",
+    "SAVED_MODEL_DIR = \"./best_model\"\n",
+    "model = RobertaForSequenceClassification.from_pretrained(SAVED_MODEL_DIR)\n",
+    "tokenizer = RobertaTokenizer.from_pretrained(SAVED_MODEL_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Classify a single URL with the reloaded model\n",
+    "sample_url = \"www.tiem.utk.edu/~gross/bioed/bealsmodules/spider.html\"\n",
+    "# Truncate exactly as in training so an over-long input cannot exceed 512 tokens\n",
+    "inputs = tokenizer(sample_url, return_tensors=\"pt\", truncation=True, max_length=512)\n",
+    "\n",
+    "# Inference only: select eval mode and skip building the autograd graph\n",
+    "model.eval()\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(**inputs)\n",
+    "predictions = torch.argmax(outputs.logits, dim=-1)\n",
+    "\n",
+    "print(predictions)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}