{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Load the required libraries\n", "import torch\n", "from torch.utils.data import Dataset\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report\n", "from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments\n", "from transformers import TrainerCallback\n", "import os\n", "from transformers import TrainingArguments, Trainer\n", "#Create directory to save model\n", "os.makedirs(\"./best_model\", exist_ok=True)\n", "\n", "#Create a callback class to save the best model\n", "class SaveBestModelCallback(TrainerCallback):\n", " #Initialize the class variables and values\n", " def __init__(self):\n", " self.best_f1_score = 0\n", " #Get the evaluation metrics\n", " def on_evaluate(self, args, state, control, metrics, **kwargs):\n", " metrics = trainer.evaluate()\n", " f1_score = metrics[\"eval_f1\"]\n", " #Save the model if the current f1 score is higher that the best f1 score so far\n", " if f1_score > self.best_f1_score:\n", " self.best_f1_score = f1_score\n", " model.save_pretrained(\"./best_model\")\n", " tokenizer.save_pretrained(\"./best_model\")\n", " #Print the f1 score\n", " print(f\"New best model saved with F1 score: {f1_score}\")\n", "\n", "# Load and preprocess the data\n", "train_data = pd.read_csv(\"train_links.csv\", encoding='utf-8', encoding_errors='ignore')\n", "test_data = pd.read_csv(\"test_links.csv\", encoding='utf-8', encoding_errors='ignore')\n", "\n", "test_data=test_data[:16171]\n", "\n", "train_data=train_data[['email', 'label']]\n", "test_data=test_data[['email', 'label']]\n", "\n", "\n", "#print(len(train_data))\n", "#print(train_data[train_data['label'].isnull()])\n", "\n", "\n", "train_data['label'] = train_data['label'].astype(int)\n", "test_data['label'] = test_data['label'].astype(int)\n", "\n", "#Convert all column data to strings\n", "train_email_list=train_data[\"email\"].tolist()\n", "for i in range(len(train_email_list)):\n", " if type(train_email_list[i]) != type('a'):\n", " temp=str(train_email_list[i])\n", " train_email_list[i]=temp\n", "\n", "#Get the label lists\n", "train_label_list=train_data[\"label\"].tolist()\n", "\n", "#print(len(train_email_list))\n", "#print(len(train_label_list))\n", "\n", "\n", "for i in range(len(train_label_list)):\n", " if type(train_label_list[i]) != type(1):\n", " temp=int(train_label_list[i])\n", " train_label_list[i]=temp\n", "\n", "#Convert null values in labels to 0\n", "count=0\n", "#print(count)\n", "for i in (train_data[\"label\"].tolist()):\n", " if type(i) != type(1):\n", " count+=1\n", "\n", "#print(count)\n", "\n", "#print(len(train_data))\n", "#print(train_data[train_data['label'].isnull()])\n", "\n", "\n", "#Get test email and label lists\n", "test_email_list=test_data[\"email\"].tolist()\n", "for i in range(len(test_email_list)):\n", " if type(test_email_list[i]) != type('a'):\n", " temp=str(test_email_list[i])\n", " test_email_list[i]=temp\n", "\n", "\n", "test_label_list=test_data[\"label\"].tolist()\n", "\n", "#print(len(train_email_list))\n", "#print(len(train_label_list))\n", "\n", "\n", "for i in range(len(test_label_list)):\n", " if type(test_label_list[i]) != type(1):\n", " temp=int(test_label_list[i])\n", " test_label_list[i]=temp\n", "\n", "count=0\n", "#print(count)\n", "for i in (test_data[\"label\"].tolist()):\n", " if type(i) != type(1):\n", " count+=1\n", "\n", 
"#print(count)\n", "\n", "train_data=train_data[['email', 'label']]\n", "test_data=test_data[['email', 'label']]\n", "\n", "train_data['label'] = train_data['label'].astype(int)\n", "test_data['label'] = test_data['label'].astype(int)\n", "\n", "#Load the RoBERTa tokenizer\n", "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n", "\n", "#Preprocess the data\n", "def preprocess(df):\n", " inputs = tokenizer(df[\"email\"].tolist(), return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n", " labels = torch.tensor(df[\"label\"].tolist())\n", " return inputs, labels\n", "\n", "train_inputs, train_labels = preprocess(train_data)\n", "test_inputs, test_labels = preprocess(test_data)\n", "\n", "# Custom dataset class\n", "class CustomDataset(Dataset):\n", " def __init__(self, inputs, labels):\n", " self.inputs = inputs\n", " self.labels = labels\n", "\n", " def __len__(self):\n", " return len(self.labels)\n", "\n", " def __getitem__(self, idx):\n", " item = {key: val[idx] for key, val in self.inputs.items()}\n", " item[\"labels\"] = self.labels[idx]\n", " return item\n", "\n", "# Prepare the RoBERTa model for training\n", "model = RobertaForSequenceClassification.from_pretrained(\"roberta-base\", num_labels=2)\n", "\n", "# Define the Trainer and TrainingArguments\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", " num_train_epochs=1,\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=16,\n", " logging_dir=\"./logs\",\n", " logging_steps=100,\n", " save_steps=1000,\n", " evaluation_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " weight_decay=0.01,\n", ")\n", "\n", "#Define the compute metrics function\n", "def compute_metrics(pred):\n", " labels = pred.label_ids\n", " preds = pred.predictions.argmax(-1)\n", " metrics = classification_report(labels, preds, output_dict=True)[\"weighted avg\"]\n", " return {\"f1\": metrics[\"f1-score\"]}\n", "\n", "\n", "#Initialize the trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=CustomDataset(train_inputs, train_labels),\n", " eval_dataset=CustomDataset(test_inputs, test_labels),\n", " compute_metrics=compute_metrics,\n", ")\n", "\n", "#trainer.add_callback(SaveBestModelCallback())\n", "trainer.train()\n", "\n", "# Evaluate the model\n", "eval_results = trainer.evaluate()\n", "\n", "#Printing the results\n", "print(\"Evaluation results:\", eval_results)\n", "\n", "\n", "#Save the best model\n", "model.save_pretrained('./best_model')\n", "model.save_pretrained('./best_model.h5')\n", "tokenizer.save_pretrained(\"./best_model\")\n", "\n", "\"\"\"\n", "best_model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n", "best_tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")\n", "For using the saved model in a Google Chrome extension, you would need to use a server-side solution or a cloud-based API to connect your extension to the trained model.\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n", "tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(\"www.tiem.utk.edu/~gross/bioed/bealsmodules/spider.html\", return_tensors=\"pt\")\n", "outputs = model(**inputs)\n", "predictions = torch.argmax(outputs.logits, dim=-1)\n", "\n", "print(predictions)" ] } ], 
"metadata": { "language_info": { "name": "python" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }