from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
# from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split


# assignment 3
model_name = "bert-base-uncased"

class ToxicDataset(Dataset):
    """Wraps the tokenized comments and their multi-label targets for the Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # The encodings are already tensors (return_tensors='pt'), so index and
        # detach them instead of re-wrapping in torch.tensor().
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        # Multi-label targets must be floats for BCEWithLogitsLoss.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

print("Reading data...")
data = pd.read_csv("./data/train.csv")
toxic_data = pd.DataFrame()
toxic_data["text"] = data["comment_text"]
toxic_data["labels"] = data.iloc[:, 2:].values.tolist()

print("Data read. Splitting data...")
train_texts, val_texts, train_labels, val_labels = train_test_split(
    toxic_data.text.to_list(), toxic_data.labels.to_list(), test_size=0.2
)


print("Data split. Tokenizing data...")
tokenizer = BertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")


train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)

print("Data tokenized. Beginning training...")

training_args = TrainingArguments(
    output_dir="./results",          
    num_train_epochs=2,             
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=16, 
    warmup_steps=500,               
    weight_decay=0.01,             
    logging_dir="./logs",            
    logging_steps=10,
)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
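# Trainer handles device placement, batching, and the optimizer internally,
# so no manual .to(device) call or hand-written training loop is needed here.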

# BertModel has no classification head and returns no loss, which Trainer requires;
# BertForSequenceClassification with problem_type="multi_label_classification"
# adds the head and trains the six independent labels with BCEWithLogitsLoss.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)

# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# optim = AdamW(model.parameters(), lr=5e-5)

# num_train_epochs = 2

# for epoch in range(num_train_epochs):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

# model.eval()




print("Training complete. Saving model...")

save_directory = "./results/model"
model.save_pretrained(save_directory)

print("Model saved.")