from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification from torch.utils.data import DataLoader, RandomSampler, SequentialSampler import torch from sklearn.model_selection import train_test_split from dataset.load_dataset import df, prepare_dataset from torch.nn import BCEWithLogitsLoss from transformers import BertForSequenceClassification, BertConfig from tqdm.auto import tqdm from torch.cuda.amp import GradScaler, autocast from torch.utils.tensorboard import SummaryWriter import datetime # 初始化TensorBoard SummaryWriter current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') log_dir = f'runs/train_{current_time}' writer = SummaryWriter(log_dir) epochs = 10 lr = 1e-5 optimizer_name = 'AdamW' loss_fn_name = 'BCEWithLogitsLoss' batch_size = 16 # 构建模型保存路径,包括重要参数 model_save_name = f'model_{current_time}_lr{lr}_opt{optimizer_name}_loss{loss_fn_name}_batch{batch_size}_epoch{epochs}.pt' model_save_path = f'./saved_models/{model_save_name}' tokenizer = AutoTokenizer.from_pretrained( "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition") # 用于将文本转换为模型所需输入格式的tokenizer device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 创建一个device对象,如果有可用的GPU就使用它,否则使用CPU # 分割数据集 train_df, val_df = train_test_split(df, test_size=0.1) # 以90%训练,10%验证的比例分割数据集 # 准备训练和验证数据集 train_dataset = prepare_dataset(train_df, tokenizer) val_dataset = prepare_dataset(val_df, tokenizer) # 现在train_dataloader和validation_dataloader已准备好,可用于模型训练和验证 train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size) validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size) # 加载配置 config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition") config.num_labels = 8 # 调整为你的标签数量 model = AutoModelForSequenceClassification.from_pretrained( "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to( device) # 准备优化器和学习率调度器 optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8) total_steps = len(train_dataloader) * epochs # epochs是您想要训练的轮数 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) loss_fn = BCEWithLogitsLoss() # 微调模型 scaler = GradScaler() for epoch in range(epochs): # 迭代多个epoch print(f"\nEpoch {epoch + 1}/{epochs}") print('-------------------------------') model.train() total_loss = 0 train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False) for step, batch in enumerate(train_progress_bar): # 将数据加载到GPU batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch model.zero_grad() # 前向传播 outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = outputs.logits # 反向传播 loss = loss_fn(logits, b_labels) total_loss += loss.item() # loss.backward() # optimizer.step() # scheduler.step() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # 检查loss是否为nan if torch.isnan(loss).any(): print(f"Loss is nan in epoch {epoch + 1}, step {step}.") # 可选:打印出问题数据的更多信息或采取其他措施 # 注意:直接跳过可能不解决根本问题,最好检查为何loss会是nan continue # 跳过当前批次的反向传播和优化器步骤 scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"}) # 添加训练损失到TensorBoard writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + step) # 评估阶段 avg_train_loss = total_loss / len(train_dataloader) print(f"Training loss: {avg_train_loss:.2f}") # 验证阶段 model.eval() total_eval_accuracy = 0 eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False) total_eval_loss = 0 # 初始化验证集总损失 for batch in eval_progress_bar: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = outputs.logits # 注意:如果你的损失函数和模型的输出有不同的形状要求,可能需要对下面的损失计算进行调整 loss = loss_fn(logits, b_labels) total_eval_loss += loss.item() # 累加批次损失到总损失 # 使用sigmoid函数将logits转换为概率值 # probs = torch.sigmoid(logits) # # 将概率高于0.5的预测为正类(1),低于0.5的预测为负类(0) # predictions = (probs > 0.5).int() # # 比较预测和真实标签 # correct_predictions = (predictions == b_labels.int()).float() # 确保标签也是整数类型 # # 计算每个样本的正确预测的平均数,然后计算整个批次的平均值 # accuracy_per_sample = correct_predictions.mean(dim=1) # accuracy = accuracy_per_sample.mean().item() logits_sas = logits[:, :4] # SAS_Class的4个输出 logits_sds = logits[:, 4:] # SDS_Class的4个输出 # 应用softmax来获取概率分布 probs_sas = torch.softmax(logits_sas, dim=1) probs_sds = torch.softmax(logits_sds, dim=1) # 选择概率最高的类别作为预测结果 _, predictions_sas = torch.max(probs_sas, dim=1) _, predictions_sds = torch.max(probs_sds, dim=1) # 真实的标签 true_sas = b_labels[:, 0].long() # 确保是长整型 true_sds = b_labels[:, 1].long() # 确保是长整型 # 计算准确性 accuracy_sas = (predictions_sas == true_sas).float().mean() accuracy_sds = (predictions_sds == true_sds).float().mean() # 综合两个准确性得分 accuracy = (accuracy_sas + accuracy_sds) / 2 total_eval_accuracy += accuracy # 更新进度条 eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"}) # 计算整个验证集的平均损失 avg_val_loss = total_eval_loss / len(validation_dataloader) print(f"Validation Loss: {avg_val_loss:.2f}") avg_val_accuracy = total_eval_accuracy / len(validation_dataloader) writer.add_scalar('Loss/val', avg_val_loss, epoch) # 确保在TensorBoard中记录验证损失 print(f"Validation Accuracy: {avg_val_accuracy:.2f}") writer.close() # 保存模型 torch.save(model.state_dict(), model_save_path) print(f"traing end, save model to :{model_save_path}")