import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset


def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    return model, tokenizer


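# Example usage (an illustrative sketch, not part of the original code): the helper is
# class-agnostic, so any Hugging Face transformers model/tokenizer pair works. The Auto
# classes and the "cointegrated/rubert-tiny" checkpoint below are assumptions, chosen
# because that checkpoint's 312-dim hidden size matches the classifier head further down.
#
#     from transformers import AutoModel, AutoTokenizer
#     model, tokenizer = create_model_and_tokenizer(
#         AutoModel, AutoTokenizer, "cointegrated/rubert-tiny"
#     )

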
def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    if not os.path.exists("weights"):
        os.makedirs("weights")

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    best_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # The model returns raw logits; threshold the sigmoid at 0.5 for accuracy.
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

        accuracy = correct / total
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(accuracy)

        model.eval()
        val_loss = 0.0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            total = 0
            correct = 0
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels.float().unsqueeze(1))
                val_loss += loss.item()
                preds = torch.round(torch.sigmoid(outputs))
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                # Collect flat 1-D arrays so f1_score receives plain label vectors.
                total_preds.extend(preds.detach().cpu().numpy().flatten())
                total_labels.extend(labels.detach().cpu().numpy())

        accuracy = correct / total
        f1 = f1_score(total_labels, total_preds)
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)
        val_f1_scores.append(f1)

        # Keep a checkpoint of the weights with the best validation loss so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy: {train_accuracies[-1]:.4f}. Validation Accuracy: {val_accuracies[-1]:.4f}. Validation F1: {val_f1_scores[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores


def predict_sentiment(text, model, tokenizer, DEVICE):
    model.eval()

    encoding = tokenizer.encode_plus(
        text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    probability = torch.sigmoid(output).item()

    threshold = 0.5

    if probability >= threshold:
        return 1
    else:
        return 0


def load_model(model_class, pretrained_weights, weights_path):
    model = ruBERTClassifier(model_class, pretrained_weights)
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    return model


def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    epochs = range(1, len(train_losses) + 1)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
    axs[0].set_title("Training and Validation Loss")
    axs[0].set_xlabel("Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
    axs[1].set_title("Training and Validation Accuracy and F1 Score")
    axs[1].set_xlabel("Epochs")
    axs[1].set_ylabel("Metric Value")
    axs[1].legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")
    plt.show()


class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        return (
            encoding["input_ids"].squeeze(),
            encoding["attention_mask"].squeeze(),
            torch.tensor(label),
        )


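# The default DataLoader collate_fn stacks the (input_ids, attention_mask, label)
# tuples returned by __getitem__ into exactly the batches train_model expects.
# A minimal sketch (the batch size and variable names are illustrative, not from
# the original code):
#
#     from torch.utils.data import DataLoader
#     train_loader = DataLoader(
#         TextClassificationDataset(train_texts, train_labels, tokenizer),
#         batch_size=32,
#         shuffle=True,
#     )

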
class ruBERTClassifier(nn.Module):
    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)

        # Freeze the whole backbone...
        for param in self.bert.parameters():
            param.requires_grad = False

        # ...then unfreeze only the pooler.
        for param in self.bert.pooler.parameters():
            param.requires_grad = True

        # 312 matches the hidden size of the rubert-tiny checkpoints; adjust it for
        # other backbones.
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        # Classify from the [CLS] embedding of the last hidden state; note that the
        # unfrozen pooler does not feed into this path, so it receives no gradient.
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
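

# A minimal end-to-end sketch of how the pieces above fit together, guarded so that
# importing the module stays side-effect free. Everything in this block is an
# assumption for illustration: the transformers Auto classes, the
# "cointegrated/rubert-tiny" checkpoint (whose 312-dim hidden size matches the
# classifier head), the toy data, and the hyperparameters. The criterion is assumed
# to be BCEWithLogitsLoss, since the training loop applies the sigmoid itself.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import AutoModel, AutoTokenizer

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    CHECKPOINT = "cointegrated/rubert-tiny"  # assumed checkpoint, hidden size 312

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = ruBERTClassifier(AutoModel, CHECKPOINT).to(DEVICE)

    # Toy Russian examples (positive/negative), purely for illustration.
    texts = ["отличный фильм", "ужасный сервис", "мне понравилось", "не рекомендую"]
    labels = [1, 0, 1, 0]
    train_loader = DataLoader(
        TextClassificationDataset(texts, labels, tokenizer), batch_size=2, shuffle=True
    )
    valid_loader = DataLoader(
        TextClassificationDataset(texts, labels, tokenizer), batch_size=2
    )

    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad), lr=1e-3
    )
    criterion = nn.BCEWithLogitsLoss()

    history = train_model(
        DEVICE, 2, model, train_loader, valid_loader, optimizer, criterion
    )
    plot_metrics(*history)

    best_model = load_model(
        AutoModel, CHECKPOINT, "weights/best_bert_weights.pth"
    ).to(DEVICE)
    print(predict_sentiment("отличный фильм", best_model, tokenizer, DEVICE))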