In [None]:
# Google Colab only: uncomment the next line to install the transformers library
# !pip install transformers

In [2]:
# importing necessary libraries
import torch 
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [3]:
# Name of the pretrained checkpoint that is fine-tuned in this notebook.
model_name = "distilbert-base-uncased"

# Target columns, listed by name so that label selection does not silently
# depend on the column order of the CSV file.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read the data and fail early, with a clear message, if the schema is not the expected one.
df = pd.read_csv("train.csv")
missing_columns = [col for col in ["comment_text", *label_columns] if col not in df.columns]
if missing_columns:
  raise KeyError(f"train.csv is missing expected columns: {missing_columns}")

# Split into features (raw comment text) and the multi-label target matrix.
train_texts = df["comment_text"].values
train_labels = df[label_columns].values

# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Split the data into training (80%) and validation (20%) sets.
# The split is always taken from `df`, not from the previous `train_texts` /
# `train_labels`, so re-running this cell does not split an already-split
# training set again. The fixed seed makes the split reproducible.
split_seed = 42
train_texts, val_texts, train_labels, val_labels = train_test_split(
  df["comment_text"].values,
  df[df.columns[2:]].values,
  test_size=.2,
  random_state=split_seed,
)

In [None]:
# Load the tokenizer that matches the checkpoint.
# `model_max_length` is the tokenizer setting that truncation=True and
# padding='max_length' fall back to when no explicit max_length is given.
# DistilBERT has 512 position embeddings, so longer sequences cannot be fed to the model.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, model_max_length=512)

# creating a custom dataset for training
class ToxicDataset(Dataset):
  """Map-style dataset that tokenizes one comment at a time, on access.

  Args:
    texts: indexable sequence of raw comment strings.
    labels: indexable sequence of label vectors, aligned with `texts`.
    tokenizer: callable used to encode a single text. When None (the
      default), the module-level `tokenizer` is looked up on each access.
    max_length: optional sequence length forwarded to the tokenizer. When
      None, the tokenizer's own default length is used.

  Raises:
    ValueError: if `texts` and `labels` do not have the same length.
  """

  def __init__(self, texts, labels, tokenizer=None, max_length=None):
    if len(texts) != len(labels):
      raise ValueError(
        f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
      )
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __getitem__(self, index):
    """Return a dict of tensors: the tokenizer outputs plus a float32 'labels' tensor."""
    # Fall back to the notebook-level tokenizer only when none was injected.
    encode = self.tokenizer if self.tokenizer is not None else tokenizer
    encode_kwargs = {"truncation": True, "padding": "max_length"}
    if self.max_length is not None:
      encode_kwargs["max_length"] = self.max_length

    encodings = encode(self.texts[index], **encode_kwargs)
    item = {key: torch.tensor(val) for key, val in encodings.items()}
    # The labels are created as float32 tensors for the multi-label loss.
    item['labels'] = torch.tensor(self.labels[index], dtype=torch.float32)
    return item

  def __len__(self):
    """Number of examples in the dataset."""
    return len(self.labels)

In [None]:
# creating the model, the custom dataset and the dataloader for training
# Use the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Download the pretrained model and prepare it for training.
# The checkpoint name comes from `model_name` so model and tokenizer cannot drift apart,
# and the size of the classification head is derived from the label matrix.
model = DistilBertForSequenceClassification.from_pretrained(
  model_name,
  num_labels=train_labels.shape[1],
  problem_type="multi_label_classification",
)
model.to(device)
model.train()

# Define the dataset and dataloader; training batches are shuffled every epoch.
train_dataset = ToxicDataset(train_texts, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
# Create the optimizer and set the number of epochs.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the defaults of the deprecated class
# so the optimisation hyper-parameters stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_train_epochs = 1

In [8]:
# training the model
for epoch in range(num_train_epochs):
  # Re-enter training mode every epoch: if this cell is re-run after
  # model.eval() was called, dropout would otherwise stay disabled.
  model.train()
  running_loss = 0.0
  num_batches = 0

  for batch in train_dataloader:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Passing `labels` makes the model compute the loss itself.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    loss = outputs.loss
    loss.backward()
    optim.step()

    running_loss += loss.item()
    num_batches += 1

  # Report the mean training loss so progress is visible (guarded against an empty dataloader).
  print(f"epoch {epoch + 1}/{num_train_epochs} - mean training loss: {running_loss / max(num_batches, 1):.4f}")

In [None]:
# Switch the model to evaluation mode.
# train(False) is what eval() calls internally; it returns the model itself.
model.train(False)

In [17]:
# Score a single example comment with the trained model.
X_train = ["COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK"]
batch = tokenizer(
  X_train,
  truncation=True,
  padding='max_length',
  return_tensors="pt",
)
batch = batch.to(device)

# No gradients are needed for inference.
with torch.no_grad():
  outputs = model(**batch)

# Independent per-label sigmoid scores, scaled to percentages.
predictions = outputs.logits.sigmoid() * 100
print(predictions)

tensor([[99.9134, 47.9581, 99.0946,  0.6099, 91.4176,  1.0425]],
       device='cuda:0')


In [18]:
# Persist the fine-tuned model and the tokenizer, each in its own directory.
model.save_pretrained(save_directory="pretrained_model")
tokenizer.save_pretrained(save_directory="model_tokenizer")