# -*- coding: utf-8 -*-
"""AiProjectTest.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1E4AHYbuRi_FbOMhQntdAMMZMY14hWh2e
"""
from pathlib import Path

import pandas as pd
import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; use torch's
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import (DistilBertTokenizerFast,
                          DistilBertForSequenceClassification,
                          Trainer, TrainingArguments)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test_labels = pd.read_csv('test_labels.csv')

model_name = "distilbert-base-uncased"
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def read_file(f):
    """Return the comment texts and one [toxic, severe_toxic, obscene,
    threat, insult, identity_hate] label row per comment."""
    texts = f['comment_text'].tolist()
    labels = f[LABEL_COLS].values.tolist()
    return texts, labels
train_texts, train_labels = read_file(df_train)

# test_labels.csv ships separately for the Jigsaw test split; rows labelled
# -1 were not used for scoring, so they may be worth filtering out.
test_texts = df_test['comment_text'].tolist()
test_labels = df_test_labels[LABEL_COLS].values.tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
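# Optional sanity check (illustrative): the Jigsaw labels are heavily
# imbalanced, so it is worth printing the positive rate per label before
# training anything.
for name, col in zip(LABEL_COLS, zip(*train_labels)):
    print(f"{name}: {sum(col) / len(col):.3%} positive")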
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Tokenize each split in one call. The original per-16 loop padded every
# mini-batch to its own max length (producing ragged sequences) and silently
# dropped the last len(texts) % 16 examples; encoding the whole list at once
# with padding=True avoids both problems.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
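# Optional sanity check: every split should now be encoded end to end, and
# decoding one example confirms truncation/padding behaved as expected.
assert len(train_encodings['input_ids']) == len(train_texts)
print(tokenizer.decode(train_encodings['input_ids'][0])[:120])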
# Keep the label lists aligned with the number of encoded examples. (The
# original while-loops compared against len(encodings), which is the number
# of dict keys, i.e. 2, and would have discarded almost every label.)
train_labels = train_labels[:len(train_encodings['input_ids'])]
val_labels = val_labels[:len(val_encodings['input_ids'])]
test_labels = test_labels[:len(test_encodings['input_ids'])]
class ToxicDataset(Dataset):
    """Wraps tokenizer encodings and a per-example label for Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
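# Example usage (illustrative): wrap the validation encodings with the first
# label column and inspect one item; each item is a dict of tensors that
# Trainer can consume directly.
_demo = ToxicDataset(val_encodings, [row[0] for row in val_labels])
print(_demo[0]['input_ids'].shape, _demo[0]['attention_mask'].shape, _demo[0]['labels'])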
# Split the six-way label rows into one column per label, so each label can
# be trained as its own binary task.
train_dataset_list = [[row[j] for row in train_labels] for j in range(6)]
val_dataset_list = [[row[j] for row in val_labels] for j in range(6)]
train_datasets = [ToxicDataset(train_encodings, train_dataset_list[j]) for j in range(6)]
val_datasets = [ToxicDataset(val_encodings, val_dataset_list[j]) for j in range(6)]
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=2,
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=64,
                                  warmup_steps=500,
                                  learning_rate=5e-5,
                                  weight_decay=.01,
                                  logging_dir='./logs',
                                  logging_steps=10)
# Train one binary DistilBERT classifier per label. Each label gets a fresh
# model: the original code reused a single model instance across all six
# trainers, so every run would have kept fine-tuning the same weights.
trainers = []
for j in range(6):
    model = DistilBertForSequenceClassification.from_pretrained(model_name)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_datasets[j],
                      eval_dataset=val_datasets[j])
    trainer.train()
    trainers.append(trainer)
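# A minimal per-label evaluation sketch, assuming the `trainers` list built
# above: Trainer.predict returns logits, and argmax gives the binary verdict.
import numpy as np
for j, t in enumerate(trainers):
    out = t.predict(val_datasets[j])
    preds = np.argmax(out.predictions, axis=-1)
    print(f"{LABEL_COLS[j]}: val accuracy {(preds == out.label_ids).mean():.3f}")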
# train_dataset = ToxicDataset(train_encodings, train_labels)
# val_dataset = ToxicDataset(val_encodings, val_labels)
# test_dataset = ToxicDataset(test_encodings, test_labels)
# -----------------------------------------------------------------
# test_dataset_list = [[row[j] for row in test_labels] for j in range(6)]
# test_datasets = [ToxicDataset(test_encodings, test_dataset_list[j]) for j in range(6)]
# -----------------------------------------------------------------
# Manual PyTorch training loop (alternative to Trainer):
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model = DistilBertForSequenceClassification.from_pretrained(model_name)
# model.to(device)
# model.train()
# train_loader = DataLoader(train_datasets[0], batch_size=16, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
# num_train_epochs = 2
# for epoch in range(num_train_epochs):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()
# model.eval()
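# A minimal inference sketch, assuming the six fine-tuned models in
# `trainers`: tokenize a new comment and collect a 0/1 verdict per label.
def predict_comment(text):
    enc = tokenizer(text, truncation=True, return_tensors='pt')
    verdicts = {}
    for name, t in zip(LABEL_COLS, trainers):
        t.model.eval()
        batch = {k: v.to(t.model.device) for k, v in enc.items()}
        with torch.no_grad():
            logits = t.model(**batch).logits
        verdicts[name] = int(logits.argmax(dim=-1).item())
    return verdicts

print(predict_comment("You are a wonderful person."))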