In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import AdamW
import pandas as pd
from huggingface_hub import notebook_login

In [None]:
# Read the three CSV inputs in one pass: the labelled training comments, the
# test comments, and the separately stored labels for the test comments.
df_train, df_test, df_test_labels = (
  pd.read_csv(csv_name)
  for csv_name in ('train.csv', 'test.csv', 'test_labels.csv')
)

# Checkpoint identifier reused below for both the tokenizer and the models.
model_name = "distilbert-base-uncased"

def read_file(f):
  """Split a comments DataFrame into parallel lists of texts and label rows.

  Args:
    f: DataFrame with a 'comment_text' column and the six label columns
      'toxic', 'severe_toxic', 'obscene', 'threat', 'insult' and
      'identity_hate'.

  Returns:
    A tuple ``(texts, labels)``. ``texts`` is the list of comment strings and
    ``labels`` is a list with one 6-element list per comment, in the column
    order given above. ``labels[k]`` always belongs to ``texts[k]``.

  Raises:
    KeyError: if any of the required columns is missing.
  """
  label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  texts = f['comment_text'].tolist()
  # Convert by position rather than looking rows up by index label: the frame
  # may carry a non-default index (filtered, shuffled, concatenated), and a
  # label lookup such as f['toxic'][i] would then pair texts with the wrong
  # rows or raise KeyError.
  labels = f[label_columns].to_numpy().tolist()
  return texts, labels

# Training comments and their six-column label rows.
train_texts, train_labels = read_file(df_train)

# The test comments and their labels live in two separate frames, so the
# label rows are built directly from df_test_labels. Selecting the columns by
# name and converting by position keeps row k of the labels aligned with row k
# of the frame regardless of its index, and avoids a per-row Python loop.
test_texts = df_test['comment_text'].tolist()
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
test_labels = df_test_labels[LABEL_COLUMNS].to_numpy().tolist()

# Hold out 20% of the training data for validation. The split is seeded so a
# Restart & Run All reproduces the same train/validation partition.
train_texts, val_texts, train_labels, val_labels = train_test_split(
  train_texts, train_labels, test_size=.2, random_state=42
)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)


def encode_texts(texts, batch_size=16, max_length=None):
  """Tokenize ``texts`` chunk by chunk into flat, fixed-length encodings.

  Args:
    texts: list of strings to tokenize.
    batch_size: number of texts handed to the tokenizer per call.
    max_length: length every sequence is truncated/padded to. Defaults to
      ``tokenizer.model_max_length``, the same limit ``truncation=True``
      applies on its own. A smaller value reduces memory use and training
      time at the cost of truncating more text.

  Returns:
    A dict with keys 'input_ids' and 'attention_mask', each a list holding
    one entry per input text, in input order.
  """
  if max_length is None:
    max_length = tokenizer.model_max_length
  encodings = {'input_ids': [], 'attention_mask': []}
  # Step through the whole list, including the final partial chunk; iterating
  # len(texts) // batch_size times would silently drop the trailing texts.
  for start in range(0, len(texts), batch_size):
    # Pad to one fixed length instead of the longest text in the chunk:
    # examples from different chunks end up in the same training batch and
    # can only be stacked into a tensor if they all have the same length.
    chunk = tokenizer(texts[start:start + batch_size], truncation=True,
                      padding='max_length', max_length=max_length)
    encodings['input_ids'] += chunk['input_ids']
    encodings['attention_mask'] += chunk['attention_mask']
  return encodings


def align_labels(labels, encodings):
  """Trim ``labels`` in place so it is no longer than the encoded examples.

  The number of examples is the length of encodings['input_ids'].
  ``len(encodings)`` would be the number of dict keys (2), and trimming to
  that would leave only two labels, i.e. a two-example dataset.
  """
  del labels[len(encodings['input_ids']):]


train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

align_labels(train_labels, train_encodings)
align_labels(val_labels, val_encodings)
align_labels(test_labels, test_encodings)

class dataset(Dataset):
  """Map-style dataset pairing tokenizer encodings with one label per example.

  ``encodings`` is a mapping from field name to a per-example sequence and
  ``labels`` is a per-example sequence. The dataset length is taken from
  ``labels``.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    # Build the example field by field, converting each entry to a tensor,
    # then attach the label under the 'labels' key.
    example = {}
    for field, values in self.encodings.items():
      example[field] = torch.tensor(values[idx])
    example['labels'] = torch.tensor(self.labels[idx])
    return example

# Transpose the per-comment label rows into six per-label columns, so that
# entry k of each list holds label k for every comment (index order: toxic,
# severe_toxic, obscene, threat, insult, identity_hate).
train_dataset_list = [[row[col] for row in train_labels] for col in range(6)]
val_dataset_list = [[row[col] for row in val_labels] for col in range(6)]

# One training dataset per label; all six share the same encodings.
(train_dataset_0, train_dataset_1, train_dataset_2,
 train_dataset_3, train_dataset_4, train_dataset_5) = (
  dataset(train_encodings, label_column) for label_column in train_dataset_list
)

# Matching validation datasets, again one per label.
(val_dataset_0, val_dataset_1, val_dataset_2,
 val_dataset_3, val_dataset_4, val_dataset_5) = (
  dataset(val_encodings, label_column) for label_column in val_dataset_list
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# save tokenizer
# Note: despite the ".json" suffix, "tokenizer.json" is used as a directory
# name here; the tokenizer files are written inside it (see the returned
# paths shown as this cell's output).
tokenizer.save_pretrained("tokenizer.json")

('tokenizer.json/tokenizer_config.json',
 'tokenizer.json/special_tokens_map.json',
 'tokenizer.json/vocab.txt',
 'tokenizer.json/added_tokens.json',
 'tokenizer.json/tokenizer.json')

In [None]:
# Hyperparameters shared by every per-label training run below.
# An earlier variant of this call used
# output_dir='Rathgeberj/milestone3_fine_tuned' together with push_to_hub=True;
# the active configuration uses the local 'results' directory and does not
# set push_to_hub.
training_args = TrainingArguments(
  output_dir='results',
  num_train_epochs=2,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=64,
  warmup_steps=500,
  learning_rate=5e-5,
  weight_decay=.01,
  logging_dir='./logs',
  logging_steps=10,
)

# Load six separate copies of the pretrained checkpoint, one per label, so
# each can be fine-tuned independently of the others.
(model_0, model_1, model_2,
 model_3, model_4, model_5) = (
  DistilBertForSequenceClassification.from_pretrained(model_name)
  for _ in range(6)
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [None]:
# Fine-tune model_0 on the label at index 0 ('toxic') and write the result
# to the 'NEW' directory.
trainer_0 = Trainer(
  model=model_0,
  args=training_args,
  train_dataset=train_dataset_0,
  eval_dataset=val_dataset_0,
)
trainer_0.train()
trainer_0.save_model(output_dir='NEW')



Step,Training Loss


In [None]:
# Fine-tune model_1 on the label at index 1 ('severe_toxic') and write the
# result to the 'UPDATED_1' directory.
trainer_1 = Trainer(
  model=model_1,
  args=training_args,
  train_dataset=train_dataset_1,
  eval_dataset=val_dataset_1,
)
trainer_1.train()
trainer_1.save_model(output_dir='UPDATED_1')

Step,Training Loss


In [None]:
# Fine-tune model_2 on the label at index 2 ('obscene') and write the result
# to the 'UPDATED_2' directory.
trainer_2 = Trainer(
  model=model_2,
  args=training_args,
  train_dataset=train_dataset_2,
  eval_dataset=val_dataset_2,
)
trainer_2.train()
trainer_2.save_model(output_dir='UPDATED_2')


Step,Training Loss


In [None]:
# Fine-tune model_3 on the label at index 3 ('threat') and write the result
# to the 'UPDATED_3' directory.
trainer_3 = Trainer(
  model=model_3,
  args=training_args,
  train_dataset=train_dataset_3,
  eval_dataset=val_dataset_3,
)
trainer_3.train()
trainer_3.save_model(output_dir='UPDATED_3')

Step,Training Loss


In [None]:
# Fine-tune model_4 on the label at index 4 ('insult') and write the result
# to the 'UPDATED_4' directory.
trainer_4 = Trainer(
  model=model_4,
  args=training_args,
  train_dataset=train_dataset_4,
  eval_dataset=val_dataset_4,
)
trainer_4.train()
trainer_4.save_model(output_dir='UPDATED_4')

Step,Training Loss


In [None]:
# Fine-tune model_5 on the label at index 5 ('identity_hate') and write the
# result to the 'UPDATED_5' directory.
trainer_5 = Trainer(
  model=model_5,
  args=training_args,
  train_dataset=train_dataset_5,
  eval_dataset=val_dataset_5,
)
trainer_5.train()
trainer_5.save_model(output_dir='UPDATED_5')

Step,Training Loss
