# nlp_proj/model/funcs.py
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset


def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    # Create the tokenizer and model objects from pretrained weights
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    return model, tokenizer
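
# Example usage (a sketch: AutoModel/AutoTokenizer and the checkpoint name
# "cointegrated/rubert-tiny2" are assumptions, not fixed by this module --
# any BERT-family checkpoint with a 312-dim hidden size fits ruBERTClassifier below):
#
#     from transformers import AutoModel, AutoTokenizer
#     model, tokenizer = create_model_and_tokenizer(
#         AutoModel, AutoTokenizer, "cointegrated/rubert-tiny2"
#     )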


def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    # Create a folder for saving weights if it does not exist yet
    if not os.path.exists("weights"):
        os.makedirs("weights")

    # Initialize lists for tracking loss and accuracy
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    best_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

        accuracy = correct / total
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(accuracy)

        model.eval()
        val_loss = 0.0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            total = 0
            correct = 0
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels.float().unsqueeze(1))
                val_loss += loss.item()

                preds = torch.round(torch.sigmoid(outputs))
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                total_preds.extend(preds.detach().cpu().numpy())
                total_labels.extend(labels.detach().cpu().numpy())

        accuracy = correct / total
        f1 = f1_score(total_labels, total_preds)
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)
        val_f1_scores.append(f1)

        # If this is the best model so far, save its weights
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
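
# Sketch of a training run, continuing the example above (the learning rate
# and epoch count are illustrative assumptions; the loaders come from the
# TextClassificationDataset defined below):
#
#     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     model = ruBERTClassifier(AutoModel, "cointegrated/rubert-tiny2").to(DEVICE)
#     optimizer = torch.optim.Adam(
#         filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
#     )
#     criterion = nn.BCEWithLogitsLoss()  # matches the sigmoid+round logic above
#     history = train_model(
#         DEVICE, 5, model, train_loader, valid_loader, optimizer, criterion
#     )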


def predict_sentiment(text, model, tokenizer, DEVICE):
    # The model must be in evaluation mode
    model.eval()

    # Tokenize the text and convert it to tensors
    encoding = tokenizer.encode_plus(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Run the text through the model
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    # Convert the model output to a probability with a sigmoid
    probability = torch.sigmoid(output).item()

    # Classification threshold
    threshold = 0.5

    # Return the positive (1) or negative (0) class
    if probability >= threshold:
        return 1
        # return f"This is a positive review with probability {probability*100:.2f}%"
    else:
        return 0
        # return f"This is a negative review with probability {(1-probability)*100:.2f}%"


def load_model(model_class, pretrained_weights, weights_path):
    # Build a classifier instance
    model = ruBERTClassifier(model_class, pretrained_weights)
    # Load the trained weights
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    return model
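
# Example (the weights path matches what train_model saves; AutoModel and the
# checkpoint name are the same assumptions as above):
#
#     model = load_model(
#         AutoModel, "cointegrated/rubert-tiny2", "weights/best_bert_weights.pth"
#     )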


def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    epochs = range(1, len(train_losses) + 1)
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # First subplot: losses
    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
    axs[0].set_title("Training and Validation Loss")
    axs[0].set_xlabel("Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    # Second subplot: accuracy and F1 score
    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
    axs[1].set_title("Training and Validation Accuracy and F1 Score")
    axs[1].set_xlabel("Epochs")
    axs[1].set_ylabel("Metric Value")
    axs[1].legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Save the figure to a file
    plt.show()


class TextClassificationDataset(Dataset):
    """Wraps parallel lists of texts and labels into a PyTorch Dataset."""

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize to fixed-length (padded/truncated) tensors
        encoding = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Squeeze away the batch dimension added by return_tensors="pt"
        return (
            encoding["input_ids"].squeeze(),
            encoding["attention_mask"].squeeze(),
            torch.tensor(label),
        )
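
# Building a loader from the dataset (the batch size is an illustrative assumption):
#
#     from torch.utils.data import DataLoader
#     train_ds = TextClassificationDataset(train_texts, train_labels, tokenizer)
#     train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)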


class ruBERTClassifier(nn.Module):
    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        # Freeze all BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False
        # Unfreeze the BertPooler layer. Note: forward() below takes the raw
        # [CLS] hidden state rather than the pooler output, so the unfrozen
        # pooler weights receive no gradient during training.
        for param in self.bert.pooler.parameters():
            param.requires_grad = True
        # Classification head on top of the 312-dim [CLS] representation
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        # Take the last-layer hidden state of the [CLS] token
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
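

if __name__ == "__main__":
    # Minimal end-to-end sketch, under assumptions not fixed by this module:
    # AutoModel/AutoTokenizer from Hugging Face transformers, the
    # "cointegrated/rubert-tiny2" checkpoint (312-dim hidden size, matching
    # the classifier head above), and a tiny in-memory toy dataset.
    from torch.utils.data import DataLoader
    from transformers import AutoModel, AutoTokenizer

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = "cointegrated/rubert-tiny2"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = ruBERTClassifier(AutoModel, checkpoint).to(DEVICE)

    texts = ["Отличный товар!", "Ужасное качество."]
    labels = [1, 0]
    loader = DataLoader(
        TextClassificationDataset(texts, labels, tokenizer), batch_size=2
    )

    # Optimize only the unfrozen parameters; BCEWithLogitsLoss matches the
    # sigmoid-based prediction logic in train_model/predict_sentiment.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
    )
    criterion = nn.BCEWithLogitsLoss()

    history = train_model(DEVICE, 1, model, loader, loader, optimizer, criterion)
    plot_metrics(*history)
    print(predict_sentiment("Мне всё понравилось!", model, tokenizer, DEVICE))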