In [None]:
#Load the required libraries
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import os
from transformers import TrainingArguments, Trainer
# Make sure the folder that will receive the saved model exists;
# exist_ok=True turns this into a no-op when the folder is already there.
os.makedirs(name="./best_model", exist_ok=True)

# Callback that saves the model each time the evaluation F1 score improves
class SaveBestModelCallback(TrainerCallback):
    """Save the model and tokenizer whenever the evaluation F1 score improves.

    The callback reads the metrics the Trainer hands to ``on_evaluate``; it
    must not call ``trainer.evaluate()`` itself, because that call fires
    ``on_evaluate`` again and recurses without end.

    Args:
        output_dir: Directory the best model and tokenizer are written to.
        tokenizer: Tokenizer to save next to the model. When omitted, the
            tokenizer passed by the Trainer is used, falling back to the
            notebook-level ``tokenizer``.
    """

    def __init__(self, output_dir="./best_model", tokenizer=None):
        # Best F1 score seen so far; 0 means nothing has been saved yet
        self.best_f1_score = 0
        self.output_dir = output_dir
        self.tokenizer = tokenizer

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Compare the latest ``eval_f1`` with the best so far and save on improvement.

        Args:
            args, state, control: Standard callback arguments (unused here).
            metrics: Metrics dict of the evaluation that just finished.
            **kwargs: Extra objects supplied by the Trainer; ``model`` and
                ``tokenizer`` / ``processing_class`` are read when present.
        """
        f1_score = (metrics or {}).get("eval_f1")
        # Nothing to do when the metric is absent or has not improved
        if f1_score is None or f1_score <= self.best_f1_score:
            return
        self.best_f1_score = f1_score

        # Prefer the model the Trainer passes in; otherwise use the
        # notebook-level `model`, as the original implementation did.
        model_to_save = kwargs.get("model")
        if model_to_save is None:
            model_to_save = model

        # Explicit tokenizer first, then whatever the Trainer supplies,
        # then the notebook-level `tokenizer`.
        tokenizer_to_save = self.tokenizer
        if tokenizer_to_save is None:
            tokenizer_to_save = kwargs.get("tokenizer")
        if tokenizer_to_save is None:
            tokenizer_to_save = kwargs.get("processing_class")
        if tokenizer_to_save is None:
            tokenizer_to_save = tokenizer

        model_to_save.save_pretrained(self.output_dir)
        tokenizer_to_save.save_pretrained(self.output_dir)
        # Print the f1 score
        print(f"New best model saved with F1 score: {f1_score}")

# Load the train/test CSV files.
# encoding_errors='ignore' drops undecodable bytes instead of raising.
train_data = pd.read_csv("train_links.csv", encoding='utf-8', encoding_errors='ignore')
test_data = pd.read_csv("test_links.csv", encoding='utf-8', encoding_errors='ignore')

# Only the first TEST_ROW_LIMIT rows of the test file are used.
TEST_ROW_LIMIT = 16171
test_data = test_data[:TEST_ROW_LIMIT]

# Keep just the text and target columns. .copy() gives each frame its own
# data, so the label assignments below do not write into a slice of another
# DataFrame (the chained-assignment / SettingWithCopy situation).
train_data = train_data[['email', 'label']].copy()
test_data = test_data[['email', 'label']].copy()

# Labels must be integer class ids.
# NOTE(review): astype(int) is expected to raise if a label is missing or
# non-numeric -- clean such rows upstream if they can occur.
train_data['label'] = train_data['label'].astype(int)
test_data['label'] = test_data['label'].astype(int)

# Training e-mails as a plain list; any entry that is not already a str is stringified
train_email_list = [
    entry if type(entry) is str else str(entry)
    for entry in train_data["email"].tolist()
]

# Training labels as a plain list; any entry that is not a built-in int is cast to int
train_label_list = [
    value if type(value) is int else int(value)
    for value in train_data["label"].tolist()
]

# Diagnostic only: number of labels in the DataFrame column that are not built-in ints.
# Nothing is modified here.
count = sum(1 for value in train_data["label"].tolist() if type(value) is not int)


# Test e-mails as a plain list; any entry that is not already a str is stringified
test_email_list = [
    entry if type(entry) is str else str(entry)
    for entry in test_data["email"].tolist()
]

# Test labels as a plain list; any entry that is not a built-in int is cast to int
test_label_list = [
    value if type(value) is int else int(value)
    for value in test_data["label"].tolist()
]

# Diagnostic only: number of labels in the DataFrame column that are not built-in ints.
# Nothing is modified here.
count = sum(1 for value in test_data["label"].tolist() if type(value) is not int)

# Re-select the modelling columns. .copy() gives each frame its own data so
# the label assignments below do not write into a slice of another DataFrame.
train_data = train_data[['email', 'label']].copy()
test_data = test_data[['email', 'label']].copy()

# Make sure the labels are integer class ids
train_data['label'] = train_data['label'].astype(int)
test_data['label'] = test_data['label'].astype(int)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


# Preprocess the data
def preprocess(df):
    """Tokenize the ``email`` column and turn the ``label`` column into a tensor.

    Args:
        df: DataFrame with an ``email`` column (text) and a ``label`` column
            (integer class ids).

    Returns:
        A ``(inputs, labels)`` tuple: the tokenizer output for all e-mails
        (PyTorch tensors, padded to the longest sequence in the frame and
        truncated to at most 512 tokens) and a tensor of the labels.
    """
    # The tokenizer only accepts strings, but the column can hold other values
    # (e.g. NaN floats from empty CSV cells), so stringify anything that is
    # not already a str before tokenizing.
    texts = [text if isinstance(text, str) else str(text) for text in df["email"].tolist()]
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    labels = torch.tensor(df["label"].tolist())
    return inputs, labels

# Tokenize both splits (train first, then test) and unpack encodings and labels
(train_inputs, train_labels), (test_inputs, test_labels) = (
    preprocess(frame) for frame in (train_data, test_data)
)

# Dataset wrapper around tokenizer output and labels
class CustomDataset(Dataset):
    """Pair a mapping of per-example encodings with their labels.

    ``inputs`` is any mapping whose values can be indexed per example;
    ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of labels defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        # Take the idx-th entry of every encoding field ...
        sample = {}
        for field, values in self.inputs.items():
            sample[field] = values[idx]
        # ... and attach the target under the "labels" key
        sample["labels"] = self.labels[idx]
        return sample

# RoBERTa sequence classifier configured for two labels
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Settings handed to the Trainer
training_args = TrainingArguments(
    # Output locations, logging and checkpoint cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
    # Schedule: a single epoch, evaluated at the end of each epoch
    num_train_epochs=1,
    evaluation_strategy="epoch",
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimizer settings
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Metric function handed to the Trainer
def compute_metrics(pred):
    """Return the weighted-average F1 score of a prediction batch.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to get the predicted class) and ``label_ids``.

    Returns:
        ``{"f1": <weighted-average f1-score>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    report = classification_report(pred.label_ids, predicted_classes, output_dict=True)
    weighted_f1 = report["weighted avg"]["f1-score"]
    return {"f1": weighted_f1}


# Wrap the tokenized splits so the Trainer can index them
train_dataset = CustomDataset(train_inputs, train_labels)
eval_dataset = CustomDataset(test_inputs, test_labels)

# Assemble the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

#trainer.add_callback(SaveBestModelCallback())
trainer.train()

# Evaluate on the test split and show the metrics
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Save the model as it stands after training (no best-checkpoint selection
# happens here), together with its tokenizer.
# NOTE(review): save_pretrained takes a directory, so './best_model.h5'
# should become a folder of that name rather than an HDF5 file -- confirm
# this second copy is needed.
for save_dir in ('./best_model', './best_model.h5'):
    model.save_pretrained(save_dir)
tokenizer.save_pretrained("./best_model")

"""
best_model = RobertaForSequenceClassification.from_pretrained("./best_model")
best_tokenizer = RobertaTokenizer.from_pretrained("./best_model")
For using the saved model in a Google Chrome extension, you would need to use a server-side solution or a cloud-based API to connect your extension to the trained model.
"""

In [None]:
# Reload the model and tokenizer from the directory they were saved to
saved_model_dir = "./best_model"
model = RobertaForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = RobertaTokenizer.from_pretrained(saved_model_dir)

In [None]:
# Classify a single URL with the loaded model.
# Truncate the same way as during training so an over-long input cannot
# exceed the 512-token limit used there.
inputs = tokenizer(
    "www.tiem.utk.edu/~gross/bioed/bealsmodules/spider.html",
    return_tensors="pt",
    truncation=True,
    max_length=512,
)

# Inference only: switch to evaluation mode and skip gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest-scoring class for each input
predictions = torch.argmax(outputs.logits, dim=-1)

print(predictions)