# Hugging Face Space: Maslov-Artem/nlp_proj (status: Sleeping)
# File size: 8,093 bytes
# Revision: cb2adb5

import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset


def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    """Load a pretrained tokenizer and model from the same checkpoint.

    Both classes must expose a ``from_pretrained(pretrained_weights)``
    constructor. The tokenizer is loaded first, then the model.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer, model = (
        cls.from_pretrained(pretrained_weights)
        for cls in (tokenizer_class, model_class)
    )
    return model, tokenizer


def _run_epoch(DEVICE, model, loader, criterion, optimizer, training):
    """Run one full pass over ``loader`` in training or evaluation mode.

    Each batch must unpack to ``(input_ids, attention_mask, labels)``. The
    model output is thresholded at 0.5 after a sigmoid and compared against
    ``labels.unsqueeze(1)``, so the model is expected to emit one logit per
    sample with shape ``(batch, 1)``.

    Args:
        DEVICE: Device the batch tensors are moved to.
        model: Module called as ``model(input_ids, attention_mask=...)``.
        loader: Iterable of batches; must support ``len()``.
        criterion: Loss called as ``criterion(outputs, float_targets)``.
        optimizer: Optimizer stepped after every batch when ``training``.
        training: If true, gradients are computed and weights updated;
            otherwise the pass runs with gradients disabled.

    Returns:
        ``(avg_loss, accuracy, preds, labels)``. ``preds`` and ``labels`` are
        flat lists of per-sample values and are only filled in evaluation
        mode (they stay empty while training to avoid needless copies).

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    if training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    total = 0
    correct = 0
    all_preds = []
    all_labels = []

    with torch.set_grad_enabled(training):
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)

            if training:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            if training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

            if not training:
                # Flatten the (batch, 1) predictions so the metric receives a
                # 1-D sequence rather than a column vector.
                all_preds.extend(preds.detach().cpu().reshape(-1).numpy())
                all_labels.extend(labels.detach().cpu().numpy())

    return running_loss / len(loader), correct / total, all_preds, all_labels


def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    """Train a binary classifier and track per-epoch metrics.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    average validation loss improves, the model's ``state_dict`` is written to
    ``weights/best_bert_weights.pth`` (the ``weights`` directory is created if
    needed). A short progress report is printed after each epoch.

    Args:
        DEVICE: Device the batch tensors are moved to.
        epochs: Number of training epochs.
        model: Module called as ``model(input_ids, attention_mask=...)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``.
        valid_loader: Same batch layout as ``train_loader``.
        optimizer: Optimizer for the model parameters.
        criterion: Loss applied to raw model outputs and float targets of
            shape ``(batch, 1)`` (presumably a logits-based binary loss such
            as ``BCEWithLogitsLoss`` -- confirm against the caller).

    Returns:
        Five lists with one entry per epoch: ``train_losses``,
        ``train_accuracies``, ``val_losses``, ``val_accuracies``,
        ``val_f1_scores``.
    """
    # Create the directory for saved weights; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs("weights", exist_ok=True)

    # Per-epoch history of losses and metrics.
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    best_val_loss = float("inf")

    for epoch in range(epochs):
        avg_train_loss, train_accuracy, _, _ = _run_epoch(
            DEVICE, model, train_loader, criterion, optimizer, training=True
        )
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        avg_val_loss, val_accuracy, val_preds, val_labels = _run_epoch(
            DEVICE, model, valid_loader, criterion, optimizer, training=False
        )
        f1 = f1_score(val_labels, val_preds)
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        val_f1_scores.append(f1)

        # Keep only the weights of the best model seen so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores


def predict_sentiment(text, model, tokenizer, DEVICE):
    """Classify a single text as positive (1) or negative (0).

    The text is tokenized (padded/truncated to 512 tokens), passed through
    ``model`` without gradient tracking, and the resulting single logit is
    converted to a probability with a sigmoid.

    Args:
        text: Input string to classify.
        model: Module called as ``model(input_ids, attention_mask=...)`` that
            returns a single-element tensor for one input.
        tokenizer: Object exposing ``encode_plus``.
        DEVICE: Device the encoded tensors are moved to.

    Returns:
        ``1`` if the predicted probability is at least 0.5, otherwise ``0``.
    """
    # Inference only: switch the model to evaluation mode.
    model.eval()

    tokens = tokenizer.encode_plus(
        text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    ids = tokens["input_ids"].to(DEVICE)
    mask = tokens["attention_mask"].to(DEVICE)

    with torch.no_grad():
        logit = model(ids, attention_mask=mask)

    # Sigmoid turns the logit into the probability of the positive class.
    probability = torch.sigmoid(logit).item()

    decision_threshold = 0.5
    return int(probability >= decision_threshold)

def load_model(model_class, pretrained_weights, weights_path):
    """Build a ``ruBERTClassifier`` and restore its weights from disk.

    Args:
        model_class: Encoder class forwarded to ``ruBERTClassifier``.
        pretrained_weights: Checkpoint identifier forwarded to
            ``ruBERTClassifier``.
        weights_path: Path to a saved ``state_dict`` file.

    Returns:
        The classifier with the loaded weights; tensors are mapped to CPU.
    """
    classifier = ruBERTClassifier(model_class, pretrained_weights)

    # NOTE: torch.load unpickles the file -- only load weights from a
    # trusted source.
    state_dict = torch.load(weights_path, map_location="cpu")
    classifier.load_state_dict(state_dict)

    return classifier


def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    """Plot training history and save it to ``metrics_plot.png``.

    The left panel shows training and validation loss; the right panel shows
    training accuracy, validation accuracy and validation F1. The x-axis is
    the 1-based epoch number, derived from ``len(train_losses)``. The figure
    is saved to disk and then displayed.
    """
    epochs = range(1, len(train_losses) + 1)

    _fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    loss_ax = axs[0]
    score_ax = axs[1]

    # Left panel: losses.
    loss_curves = (
        (train_losses, "r--", {"label": "Training Loss"}),
        (val_losses, "b--", {"linewidth": 2, "label": "Validation Loss"}),
    )
    for values, fmt, options in loss_curves:
        loss_ax.plot(epochs, values, fmt, **options)
    loss_ax.set_title("Training and Validation Loss")
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    # Right panel: accuracy and F1 score.
    score_curves = (
        (train_accuracies, "r-", "Training Accuracy"),
        (val_accuracies, "b-", "Validation Accuracy"),
        (val_f1_scores, "g-", "Validation F1 Score"),
    )
    for values, fmt, label in score_curves:
        score_ax.plot(epochs, values, fmt, linewidth=2, label=label)
    score_ax.set_title("Training and Validation Accuracy and F1 Score")
    score_ax.set_xlabel("Epochs")
    score_ax.set_ylabel("Metric Value")
    score_ax.legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Write the figure to a file
    plt.show()


class TextClassificationDataset(Dataset):
    """Dataset that tokenizes texts on access and pairs them with labels.

    Each item is a tuple ``(input_ids, attention_mask, label)`` where the
    first two come from ``tokenizer.encode_plus`` (padded/truncated to 512
    tokens, with size-1 dimensions squeezed out) and ``label`` is the
    corresponding entry of ``labels`` wrapped in a tensor.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            sample_text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension; squeeze drops it.
        input_ids = encoded["input_ids"].squeeze()
        attention_mask = encoded["attention_mask"].squeeze()
        return input_ids, attention_mask, torch.tensor(sample_label)


class ruBERTClassifier(nn.Module):
    """Pretrained BERT-style encoder with a small binary-classification head.

    The encoder is loaded via ``model_class.from_pretrained`` and frozen,
    except for its ``pooler`` sub-module. The head maps the first-token
    representation to a single logit.

    Args:
        model_class: Class exposing ``from_pretrained``; the loaded model must
            have a ``pooler`` attribute.
        pretrained_weights: Checkpoint identifier passed to
            ``from_pretrained``.
    """

    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        # Freeze every encoder parameter.
        for param in self.bert.parameters():
            param.requires_grad = False

        # Unfreeze the pooler layer.
        # NOTE(review): forward() reads element [0] of the encoder output; for
        # Hugging Face BertModel that is the last hidden state rather than the
        # pooler output, so these parameters may never receive gradients --
        # confirm this is intended.
        for param in self.bert.pooler.parameters():
            param.requires_grad = True

        # Size the head from the encoder's hidden width when it is exposed,
        # falling back to the previously hard-coded 312.
        encoder_config = getattr(self.bert, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 312)

        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        """Return one logit per input sequence.

        Args:
            x: Token-id tensor passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            The head's output for the first token of every sequence.
        """
        # [0] selects the first encoder output; [:, 0, :] keeps the vector at
        # sequence position 0 for every batch item.
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out