# nlp_proj/model/funcs.py
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset


def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    # Create the tokenizer and model objects from pretrained weights
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    return model, tokenizer
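
# Example usage (a sketch: AutoModel/AutoTokenizer and the checkpoint name
# "cointegrated/rubert-tiny2" are assumptions, not fixed by this module --
# any BERT-family checkpoint with a 312-dim hidden size fits ruBERTClassifier below):
#
#     from transformers import AutoModel, AutoTokenizer
#     model, tokenizer = create_model_and_tokenizer(
#         AutoModel, AutoTokenizer, "cointegrated/rubert-tiny2"
#     )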


def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    # Create a folder for saving weights if it does not exist yet
    if not os.path.exists("weights"):
        os.makedirs("weights")

    # Initialize lists for tracking loss and accuracy
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    best_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

        accuracy = correct / total
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(accuracy)

        model.eval()
        val_loss = 0.0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            total = 0
            correct = 0
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels.float().unsqueeze(1))
                val_loss += loss.item()

                preds = torch.round(torch.sigmoid(outputs))
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                total_preds.extend(preds.detach().cpu().numpy())
                total_labels.extend(labels.detach().cpu().numpy())

        accuracy = correct / total
        f1 = f1_score(total_labels, total_preds)
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)
        val_f1_scores.append(f1)

        # If this is the best model so far, save its weights
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
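
# Sketch of a training run, continuing the example above (the learning rate
# and epoch count are illustrative assumptions; the loaders come from the
# TextClassificationDataset defined below):
#
#     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     model = ruBERTClassifier(AutoModel, "cointegrated/rubert-tiny2").to(DEVICE)
#     optimizer = torch.optim.Adam(
#         filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
#     )
#     criterion = nn.BCEWithLogitsLoss()  # matches the sigmoid+round logic above
#     history = train_model(
#         DEVICE, 5, model, train_loader, valid_loader, optimizer, criterion
#     )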


def predict_sentiment(text, model, tokenizer, DEVICE):
    # The model must be in evaluation mode
    model.eval()

    # Tokenize the text and convert it to tensors
    encoding = tokenizer.encode_plus(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Run the text through the model
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    # Convert the model output to a probability with a sigmoid
    probability = torch.sigmoid(output).item()

    # Classification threshold
    threshold = 0.5

    # Return the positive (1) or negative (0) class
    if probability >= threshold:
        return 1
        # return f"This is a positive review with probability {probability*100:.2f}%"
    else:
        return 0
        # return f"This is a negative review with probability {(1-probability)*100:.2f}%"


def load_model(model_class, pretrained_weights, weights_path):
    # Build a classifier instance
    model = ruBERTClassifier(model_class, pretrained_weights)
    # Load the trained weights
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    return model
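
# Example (the weights path matches what train_model saves; AutoModel and the
# checkpoint name are the same assumptions as above):
#
#     model = load_model(
#         AutoModel, "cointegrated/rubert-tiny2", "weights/best_bert_weights.pth"
#     )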


def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    epochs = range(1, len(train_losses) + 1)
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # First subplot: losses
    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
    axs[0].set_title("Training and Validation Loss")
    axs[0].set_xlabel("Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    # Second subplot: accuracy and F1 score
    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
    axs[1].set_title("Training and Validation Accuracy and F1 Score")
    axs[1].set_xlabel("Epochs")
    axs[1].set_ylabel("Metric Value")
    axs[1].legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Save the figure to a file
    plt.show()


class TextClassificationDataset(Dataset):
    """Wraps parallel lists of texts and labels into a PyTorch Dataset."""

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize to fixed-length (padded/truncated) tensors
        encoding = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Squeeze away the batch dimension added by return_tensors="pt"
        return (
            encoding["input_ids"].squeeze(),
            encoding["attention_mask"].squeeze(),
            torch.tensor(label),
        )
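
# Building a loader from the dataset (the batch size is an illustrative assumption):
#
#     from torch.utils.data import DataLoader
#     train_ds = TextClassificationDataset(train_texts, train_labels, tokenizer)
#     train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)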


class ruBERTClassifier(nn.Module):
    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        # Freeze all BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False
        # Unfreeze the BertPooler layer. Note: forward() below takes the raw
        # [CLS] hidden state rather than the pooler output, so the unfrozen
        # pooler weights receive no gradient during training.
        for param in self.bert.pooler.parameters():
            param.requires_grad = True
        # Classification head on top of the 312-dim [CLS] representation
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        # Take the last-layer hidden state of the [CLS] token
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
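

if __name__ == "__main__":
    # Minimal end-to-end sketch, under assumptions not fixed by this module:
    # AutoModel/AutoTokenizer from Hugging Face transformers, the
    # "cointegrated/rubert-tiny2" checkpoint (312-dim hidden size, matching
    # the classifier head above), and a tiny in-memory toy dataset.
    from torch.utils.data import DataLoader
    from transformers import AutoModel, AutoTokenizer

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = "cointegrated/rubert-tiny2"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = ruBERTClassifier(AutoModel, checkpoint).to(DEVICE)

    texts = ["Отличный товар!", "Ужасное качество."]
    labels = [1, 0]
    loader = DataLoader(
        TextClassificationDataset(texts, labels, tokenizer), batch_size=2
    )

    # Optimize only the unfrozen parameters; BCEWithLogitsLoss matches the
    # sigmoid-based prediction logic in train_model/predict_sentiment.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
    )
    criterion = nn.BCEWithLogitsLoss()

    history = train_model(DEVICE, 1, model, loader, loader, optimizer, criterion)
    plot_metrics(*history)
    print(predict_sentiment("Мне всё понравилось!", model, tokenizer, DEVICE))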