import datetime
import os

import torch
from torch.cuda.amp import GradScaler, autocast
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW  # torch's AdamW; the transformers copy is deprecated
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertConfig, get_linear_schedule_with_warmup

from dataset.load_dataset import df, prepare_dataset
# Initialize the TensorBoard SummaryWriter
current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'runs/train_{current_time}'
writer = SummaryWriter(log_dir)
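# Inspect the logged curves with: tensorboard --logdir runs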
epochs = 10
lr = 1e-5
optimizer_name = 'AdamW'
loss_fn_name = 'BCEWithLogitsLoss'
batch_size = 16
# Build the model save path, embedding the key hyperparameters in the filename
model_save_name = f'model_{current_time}_lr{lr}_opt{optimizer_name}_loss{loss_fn_name}_batch{batch_size}_epoch{epochs}.pt'
model_save_path = f'./saved_models/{model_save_name}'
os.makedirs('./saved_models', exist_ok=True)  # create the output directory up front so the final save cannot fail
tokenizer = AutoTokenizer.from_pretrained(
    "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")  # tokenizer that converts raw text into model inputs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when available, otherwise the CPU
# Split the dataset: 90% for training, 10% for validation
train_df, val_df = train_test_split(df, test_size=0.1)
# Prepare the training and validation datasets
train_dataset = prepare_dataset(train_df, tokenizer)
val_dataset = prepare_dataset(val_df, tokenizer)
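# A minimal sketch of the contract prepare_dataset is assumed to satisfy (hypothetical; the real
# implementation lives in dataset/load_dataset.py, and the column names here are placeholders):
# it should return a TensorDataset yielding (input_ids, attention_mask, labels), where labels is
# a float multi-hot vector of length 8 so it matches BCEWithLogitsLoss below.
#
# from torch.utils.data import TensorDataset
# def prepare_dataset(frame, tokenizer, max_length=512):
#     enc = tokenizer(frame['text'].tolist(), padding='max_length', truncation=True,
#                     max_length=max_length, return_tensors='pt')
#     labels = torch.tensor(frame[LABEL_COLUMNS].values, dtype=torch.float)  # LABEL_COLUMNS: hypothetical
#     return TensorDataset(enc['input_ids'], enc['attention_mask'], labels)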
# train_dataloader and validation_dataloader are now ready for training and validation
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
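# RandomSampler reshuffles the training data every epoch; SequentialSampler keeps the
# validation order fixed so metrics are comparable across epochs.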
# Load the model configuration
config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
config.num_labels = 8  # adjust to match your number of labels
model = AutoModelForSequenceClassification.from_pretrained(
"pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(
device)
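# Because num_labels differs from the checkpoint's original head, ignore_mismatched_sizes=True
# drops the incompatible classifier weights and re-initializes a fresh 8-way head; only that
# head starts from random weights, while the encoder keeps its pretrained parameters.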
# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
total_steps = len(train_dataloader) * epochs  # one scheduler step per training batch, across all epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
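# With num_warmup_steps=0 there is no warmup phase: the learning rate simply decays linearly
# from lr down to 0 over total_steps scheduler steps (one step per batch, see the loop below).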
loss_fn = BCEWithLogitsLoss()
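# BCEWithLogitsLoss fuses a sigmoid with per-label binary cross-entropy, the standard loss for
# multi-label classification; it expects raw logits and float targets of shape
# (batch_size, num_labels), hence the .float() cast on the labels below.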
# Fine-tune the model
scaler = GradScaler()  # gradient scaler for mixed-precision (AMP) training
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print('-------------------------------')
    model.train()
    total_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
    for step, batch in enumerate(train_progress_bar):
        # Move the batch tensors to the GPU (or CPU)
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        # Forward pass under autocast so eligible ops run in half precision
        with autocast():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels.float())
        # Skip batches whose loss is NaN before it pollutes the running total or the gradients
        # (skipping hides the symptom; it is worth investigating why the loss became NaN)
        if torch.isnan(loss).any():
            print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
            continue
        total_loss += loss.item()
        # Backward pass with gradient scaling; unscale before clipping so the norm
        # is computed on the true gradients
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the linear learning-rate schedule once per batch
        train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
        # Log the training loss to TensorBoard
        writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + step)
    # End-of-epoch training summary
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.2f}")
    # Validation phase
    model.eval()
    total_eval_accuracy = 0
    eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
    total_eval_loss = 0  # running total of the validation loss
    for batch in eval_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        # Note: adjust this call if your loss function and model outputs disagree on shape
        loss = loss_fn(logits, b_labels.float())
        total_eval_loss += loss.item()  # accumulate the batch loss
        # Turn logits into probabilities with a sigmoid
        probs = torch.sigmoid(logits)
        # Predict positive (1) when the probability is above 0.5, otherwise negative (0)
        predictions = (probs > 0.5).int()
        # Compare predictions with the ground-truth labels (cast to int for the comparison)
        correct_predictions = (predictions == b_labels.int()).float()
        # Per-label (Hamming) accuracy: average correct labels per sample, then over the batch;
        # note this is more lenient than exact-match accuracy
        accuracy_per_sample = correct_predictions.mean(dim=1)
        accuracy = accuracy_per_sample.mean().item()
        total_eval_accuracy += accuracy
        # Update the progress bar
        eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})
    # Average loss and accuracy over the whole validation set
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_val_loss:.2f}")
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    writer.add_scalar('Loss/val', avg_val_loss, epoch)  # log the validation loss to TensorBoard
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
writer.close()
# Save the fine-tuned model weights
torch.save(model.state_dict(), model_save_path)
print(f"Training finished, model saved to: {model_save_path}")