import os
import time
from functools import wraps

import matplotlib.pyplot as plt
import streamlit as st
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import Dataset

def execution_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Define the styling for the execution time text
        styled_text = """
        <style>
        .execution-time {
            font-size: 20px;
            color: #FFFFFF;
            text-shadow: -2px -2px 4px #000000;
        }
        </style>
        """
        # Apply the styling directly before writing the execution time text
        st.markdown(styled_text, unsafe_allow_html=True)
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_seconds = end_time - start_time
        # Write the styled text for the execution time
        st.markdown(
            f'<div class="execution-time">Model execution time = {execution_seconds:.5f} seconds</div>',
            unsafe_allow_html=True,
        )
        return result

    return wrapper
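
# Example usage (a minimal sketch; classify_review is a hypothetical
# Streamlit-side helper, not part of this module):
# @execution_time
# def classify_review(text):
#     return predict_sentiment(text, model, tokenizer, DEVICE)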

def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    # Create the tokenizer and model objects
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    return model, tokenizer
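
# Example usage (a sketch; the checkpoint name is an assumption — the 312-dim
# classifier head below suggests a rubert-tiny-style backbone):
# from transformers import AutoModel, AutoTokenizer
# model, tokenizer = create_model_and_tokenizer(
#     AutoModel, AutoTokenizer, "cointegrated/rubert-tiny2"
# )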

def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    # Create a folder for saving the weights if it does not exist yet
    if not os.path.exists("weights"):
        os.makedirs("weights")

    # Initialize lists for tracking loss and accuracy values
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []
    best_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()
        accuracy = correct / total
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(accuracy)

        model.eval()
        val_loss = 0.0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            total = 0
            correct = 0
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels.float().unsqueeze(1))
                val_loss += loss.item()
                preds = torch.round(torch.sigmoid(outputs))
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                total_preds.extend(preds.detach().cpu().numpy())
                total_labels.extend(labels.detach().cpu().numpy())
        accuracy = correct / total
        f1 = f1_score(total_labels, total_preds)
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)
        val_f1_scores.append(f1)

        # If this is the best model so far, save its weights
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch + 1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
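
# Example usage (a minimal sketch; batch size, learning rate, epoch count, and
# the checkpoint name are assumptions). BCEWithLogitsLoss matches the raw-logit
# output of the model, since sigmoid is applied only when computing preds:
# from torch.utils.data import DataLoader
# from transformers import AutoModel
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# valid_loader = DataLoader(valid_dataset, batch_size=32)
# model = ruBERTClassifier(AutoModel, "cointegrated/rubert-tiny2").to(DEVICE)
# optimizer = torch.optim.Adam(
#     filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4
# )
# criterion = nn.BCEWithLogitsLoss()
# history = train_model(
#     DEVICE, 5, model, train_loader, valid_loader, optimizer, criterion
# )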

def predict_sentiment(text, model, tokenizer, DEVICE):
    # The model must be in evaluation mode
    model.eval()

    # Tokenize the text and convert it to tensors
    encoding = tokenizer.encode_plus(
        text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Run the text through the model
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    # Convert the model output into a probability with a sigmoid
    probability = torch.sigmoid(output).item()

    # Set the decision threshold
    threshold = 0.5

    # Return the positive (1) or negative (0) class
    if probability >= threshold:
        return 1
        # return f"This is a positive review with probability {probability * 100:.2f}%"
    else:
        return 0
        # return f"This is a negative review with probability {(1 - probability) * 100:.2f}%"

def load_model(model_class, pretrained_weights, weights_path):
    # Create an instance of the classifier
    model = ruBERTClassifier(model_class, pretrained_weights)
    # Load the saved weights
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
    return model
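
# Example usage (a sketch; the checkpoint name is an assumption, the weights
# path is the one written by train_model above):
# from transformers import AutoModel
# model = load_model(
#     AutoModel, "cointegrated/rubert-tiny2", "weights/best_bert_weights.pth"
# )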

def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    epochs = range(1, len(train_losses) + 1)
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # First subplot: the losses
    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
    axs[0].set_title("Training and Validation Loss")
    axs[0].set_xlabel("Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    # Second subplot: accuracy and F1 score
    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
    axs[1].set_title("Training and Validation Accuracy and F1 Score")
    axs[1].set_xlabel("Epochs")
    axs[1].set_ylabel("Metric Value")
    axs[1].legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Save the figure to a file
    plt.show()
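
# Example usage (a sketch; "history" is the hypothetical 5-tuple returned by
# train_model in the sketch above):
# plot_metrics(*history)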

class TextClassificationDataset(Dataset):
    # Wraps raw texts and labels, tokenizing each sample on access
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Pad/truncate to a fixed length so batches stack without custom collation
        encoding = self.tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # squeeze() drops the batch dimension added by return_tensors="pt"
        return (
            encoding["input_ids"].squeeze(),
            encoding["attention_mask"].squeeze(),
            torch.tensor(label),
        )
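
# Example usage (a sketch; train_texts and train_labels are hypothetical
# lists of strings and 0/1 labels):
# train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
# input_ids, attention_mask, label = train_dataset[0]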

class ruBERTClassifier(nn.Module):
    def __init__(self, model_class, pretrained_weights):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        # Freeze all BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False
        # Unfreeze the BertPooler layer
        for param in self.bert.pooler.parameters():
            param.requires_grad = True
        # Classification head: 312 is the hidden size of the backbone
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        # Take the [CLS] token embedding from the last hidden state
        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
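
# Example usage (a minimal sketch; the checkpoint name is an assumption — any
# BERT-style backbone with hidden size 312, e.g. a rubert-tiny variant, fits
# the 312-dim head above):
# from transformers import AutoModel
# clf = ruBERTClassifier(AutoModel, "cointegrated/rubert-tiny2")
# logits = clf(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
# probability = torch.sigmoid(logits).item()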