import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset


def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    # Create the tokenizer and model objects
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    return model, tokenizer


def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    # Create a folder for saving weights if it does not exist yet
    if not os.path.exists("weights"):
        os.makedirs("weights")

    # Initialize lists for tracking loss and accuracy values
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []
    best_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

        accuracy = correct / total
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(accuracy)

        model.eval()
        val_loss = 0.0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            total = 0
            correct = 0
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels.float().unsqueeze(1))
                val_loss += loss.item()
                preds = torch.round(torch.sigmoid(outputs))
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                # Flatten the (batch, 1) predictions so f1_score receives 1-D arrays
                total_preds.extend(preds.detach().cpu().numpy().flatten())
                total_labels.extend(labels.detach().cpu().numpy())

        accuracy = correct / total
        f1 = f1_score(total_labels, total_preds)
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)
        val_f1_scores.append(f1)

        # If this is the best model so far, save its weights
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy: {train_accuracies[-1]:.4f}. "
            f"Validation Accuracy: {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores


def predict_sentiment(text, model, tokenizer, DEVICE):
    # The model must be in evaluation mode
    model.eval()

    # Tokenize the text and convert it to tensors
    encoding = tokenizer.encode_plus(
        text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Run the text through the model
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    # Convert the model output to a probability with a sigmoid
    probability = torch.sigmoid(output).item()

    # Set the decision threshold
    threshold = 0.5

    # Return the positive or the negative class
    if probability >= threshold:
        return 1
        # return f"This review is positive with probability {probability*100:.2f}%"
    else:
        return 0
        # return f"This review is negative with probability {(1-probability)*100:.2f}%"


def load_model(model_class, pretrained_weights, weights_path):
    # Create a classifier instance
    model = ruBERTClassifier(model_class, pretrained_weights)
    # Load the saved weights
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    return model


def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    epochs = range(1, len(train_losses) + 1)
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # First subplot: losses
    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
    axs[0].set_title("Training and Validation Loss")
    axs[0].set_xlabel("Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    # Second subplot: accuracy and F1 score
    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
    axs[1].set_title("Training and Validation Accuracy and F1 Score")
    axs[1].set_xlabel("Epochs")
    axs[1].set_ylabel("Metric Value")
    axs[1].legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Save the figure to a file
    plt.show()


class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        return (
            encoding["input_ids"].squeeze(),
            encoding["attention_mask"].squeeze(),
            torch.tensor(label),
        )


class ruBERTClassifier(nn.Module):
    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        # Freeze all parameters
        for param in self.bert.parameters():
            param.requires_grad = False
        # Unfreeze the BertPooler layer
        for param in self.bert.pooler.parameters():
            param.requires_grad = True
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        # Take the last hidden state of the [CLS] token as the sequence representation
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
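

# Example usage: a minimal sketch, not part of the original module, showing how
# the pieces above fit together. It assumes the Hugging Face `transformers`
# package and the `cointegrated/rubert-tiny` checkpoint (its hidden size of 312
# matches the classifier head); the texts and labels are hypothetical
# placeholders for your own data, and the hyperparameters are illustrative only.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import BertModel, BertTokenizer

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pretrained_weights = "cointegrated/rubert-tiny"  # assumed checkpoint name

    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    model = ruBERTClassifier(BertModel, pretrained_weights).to(DEVICE)

    # Hypothetical in-memory data; replace with real texts and 0/1 labels
    train_texts, train_labels = ["отличный фильм, всем советую"], [1]
    valid_texts, valid_labels = ["ужасный фильм, не смотрите"], [0]

    train_loader = DataLoader(
        TextClassificationDataset(train_texts, train_labels, tokenizer),
        batch_size=8,
        shuffle=True,
    )
    valid_loader = DataLoader(
        TextClassificationDataset(valid_texts, valid_labels, tokenizer),
        batch_size=8,
    )

    # Optimize only the unfrozen parameters; BCEWithLogitsLoss matches the
    # raw-logit output of the classifier head
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
    )
    criterion = nn.BCEWithLogitsLoss()

    metrics = train_model(
        DEVICE, 3, model, train_loader, valid_loader, optimizer, criterion
    )
    plot_metrics(*metrics)

    # Reload the best checkpoint saved by train_model and run one prediction
    best = load_model(
        BertModel, pretrained_weights, "weights/best_bert_weights.pth"
    ).to(DEVICE)
    print(predict_sentiment("неплохо, мне понравилось", best, tokenizer, DEVICE))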