# Source: MLR-Copilot / example / ex2_final.py
# Hugging Face file-viewer residue kept for provenance:
#   "Lim0011's picture" / "Upload 2 files" / "960d190 verified"
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
# Score columns the model regresses. The same names select the target columns
# from the training DataFrame, so this must stay a list (pandas treats a tuple
# as a single key).
DIMENSIONS = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]
class EssayDataset(Dataset):
    """Map-style dataset that pairs each essay with its row of target scores.

    Tokenization is done lazily, one essay per ``__getitem__`` call, and every
    essay is padded or truncated to exactly ``max_len`` tokens.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        """Keep references to the raw data; nothing is tokenized here.

        Args:
            texts: Indexable collection of essays.
            targets: Indexable collection of score rows aligned with ``texts``.
            tokenizer: Object exposing an ``encode_plus`` method.
            max_len: Token length every encoded essay is padded/truncated to.
        """
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode essay ``item`` and bundle it with its targets.

        Returns:
            A dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors from the tokenizer, and ``'targets'``
            as a float tensor.
        """
        essay = self.texts[item]
        scores = self.targets[item]

        encoded = self.tokenizer.encode_plus(
            essay,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer is asked for PyTorch tensors; flatten them so each
        # sample is 1-D before the DataLoader stacks samples into a batch.
        sample = {'text': essay}
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(scores, dtype=torch.float)
        return sample
class EssayScoreRegressor(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear regression head."""

    def __init__(self, n_outputs, model_name='bert-base-uncased', dropout=0.3):
        """Build the encoder and the regression head.

        Args:
            n_outputs: Number of values predicted per input sequence.
            model_name: Checkpoint name or path passed to
                ``BertModel.from_pretrained``. Defaults to the checkpoint
                that was previously hard-coded.
            dropout: Dropout probability applied to the pooled encoder
                output before the linear head. Defaults to the previously
                hard-coded 0.3.
        """
        super().__init__()
        # Attribute names are kept as-is so existing state_dict keys still load.
        self.bert = BertModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        """Return the head's predictions for a batch of token ids.

        Args:
            input_ids: Token-id tensor forwarded to the BERT encoder.
            attention_mask: Mask tensor forwarded to the BERT encoder.

        Returns:
            Output of the linear head applied to the (dropout-regularised)
            ``pooler_output`` of the encoder; its last dimension is
            ``n_outputs``.
        """
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        pooled_output = encoder_output['pooler_output']
        return self.out(self.drop(pooled_output))
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimization pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer that updates the model parameters.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler; stepped once per batch (not once
            per epoch).
        n_examples: Unused. Kept in the signature so existing callers keep
            working.

    Returns:
        The mean of the per-batch loss values, or NaN when ``data_loader``
        yields no batches.
    """
    model.train()

    # Drop any gradients left over from before this call; otherwise they
    # would be accumulated into the very first optimizer step.
    optimizer.zero_grad()

    batch_losses = []
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear right after the update so the next batch starts clean and no
        # gradients are held once the epoch ends.
        optimizer.zero_grad()

    if not batch_losses:
        # np.mean([]) also yields NaN, but emits a RuntimeWarning on the way.
        return float('nan')
    return np.mean(batch_losses)
def _evaluate_epoch(model, data_loader, loss_fn, device):
    """Return the mean batch loss over ``data_loader`` without updating weights.

    The model is switched to eval mode for the pass and restored to whatever
    mode it was in afterwards. Returns NaN when the loader yields no batches.
    """
    was_training = model.training
    model.eval()
    batch_losses = []
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = batch['targets'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                batch_losses.append(loss_fn(outputs, targets).item())
    finally:
        model.train(was_training)
    return np.mean(batch_losses) if batch_losses else float('nan')


def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    """Train ``model`` for ``epochs`` epochs, reporting train and validation loss.

    Args:
        train_data: DataFrame with a ``'full_text'`` column and one column per
            entry of ``DIMENSIONS``; used for optimization.
        val_data: DataFrame with the same columns; evaluated after every
            epoch and never used for gradient updates.
        tokenizer: Tokenizer handed to ``EssayDataset``.
        model: Model to train; expected to already live on ``device``.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler, stepped per batch by
            ``train_epoch``.
        device: Device batches are moved to.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both loaders.
        max_len: Token length every essay is padded/truncated to.

    Returns:
        A dict ``{'train_loss': [...], 'val_loss': [...]}`` with one entry per
        epoch. Earlier versions returned nothing, so callers that ignore the
        return value are unaffected.
    """
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # Shuffle only the training split; validation order does not matter.
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    loss_fn = nn.MSELoss().to(device)

    history = {'train_loss': [], 'val_loss': []}
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(train_dataset),
        )
        print(f'Train loss {train_loss}')

        # The validation loader used to be built and then never consumed;
        # evaluate it so over-fitting is visible while training runs.
        val_loss = _evaluate_epoch(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss}')

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

    return history
if __name__ == "__main__":
    # Hyper-parameters are named once so the scheduler arithmetic and the
    # train_model call below cannot drift apart.
    epochs = 5
    batch_size = 16
    max_len = 160
    learning_rate = 2e-5

    df = pd.read_csv('train.csv')

    # 80/20 split with a fixed seed; the validation part is whatever the
    # training sample did not pick.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # The scheduler is stepped once per batch, so count optimizer steps over
    # the *training* split only (ceil division: the last partial batch still
    # counts). max(1, ...) keeps StepLR valid for very small inputs.
    steps_per_epoch = -(-len(train_data) // batch_size)
    total_steps = max(1, steps_per_epoch * epochs)
    # NOTE(review): with step_size equal to the whole run, the learning rate
    # stays constant for every training step (as it did before this change).
    # Confirm whether a decaying schedule was intended.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    train_model(
        train_data,
        val_data,
        tokenizer,
        model,
        optimizer,
        scheduler,
        device,
        epochs=epochs,
        batch_size=batch_size,
        max_len=max_len,
    )