# Milestone-3 / train.py
import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

# Release any cached GPU memory left over from a previous run.
torch.cuda.empty_cache()
class MultiLabelClassifierDataset(Dataset):
    """Wraps tokenizer encodings and multi-label targets for the Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        # Multi-label targets must be floats so BCEWithLogitsLoss can be used.
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)
work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'

# The six toxicity labels from the Jigsaw Toxic Comment Classification challenge.
classifiers = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']

df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the rows.
# Train on a 10% subset of the shuffled data to keep training time manageable.
train_df = df[:int(len(df) * 0.1)]
train_labels = train_df[classifiers].to_numpy()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device: ", device)
# Training arguments for the BERTweet run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True  # Mixed precision; requires a CUDA-capable GPU.
)
print("BERT")
bert_dir = work_dir + 'bert/'
print("Model base: ", "vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained(
"vinai/bertweet-base", model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
print("Training model to be stored in" + bert_dir)
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
'vinai/bertweet-base', num_labels=6)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(bert_dir + '_bert_model')
# Training arguments for the RoBERTa run (single epoch, smaller eval batch).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("RoBERTa")
roberta_dir = work_dir + 'roberta/'
tokenizer = RobertaTokenizer.from_pretrained(
'roberta-base', model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
'roberta-base', num_labels=6)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(roberta_dir + '_roberta_model')
# Training arguments for the DistilBERT run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("DISTILBERT")
distilbert_dir = work_dir + 'distilbert/'
tokenizer = AutoTokenizer.from_pretrained(
'distilbert-base-cased', model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
'distilbert-base-cased', num_labels=6)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(distilbert_dir + '_distilbert_model')
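
# --- Usage sketch (illustrative, not part of the original training run) ---
# A minimal example of how one of the checkpoints saved above could be loaded
# for inference, assuming the DistilBERT model saved in the previous step; the
# sample comment and the 0.5 decision threshold are assumptions for
# illustration, not values taken from this script.
infer_tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-cased', model_max_length=128)
infer_model = AutoModelForSequenceClassification.from_pretrained(
    distilbert_dir + '_distilbert_model')
infer_model.eval()

sample = ["You are a wonderful person!"]  # hypothetical input text
inputs = infer_tokenizer(sample, truncation=True, padding=True,
                         return_tensors='pt')
with torch.no_grad():
    logits = infer_model(**inputs).logits

# One independent sigmoid per label; report labels above the assumed 0.5 cutoff.
probs = torch.sigmoid(logits)[0]
predicted = [label for label, p in zip(classifiers, probs) if p.item() > 0.5]
print("Predicted labels:", predicted)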