Epoch 10/10
Browse files-------------------------------
Validation: 0%| | 0/6 [00:00<?, ?it/s]Training loss: 0.30
Validation Accuracy: 0.86
- dataset/CustomDataset.py +40 -0
- dataset/load_dataset.py +13 -9
- new.py +76 -22
dataset/CustomDataset.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
from torch.utils.data import Dataset

# Number of distinct class labels; valid labels are 0 .. n_classes - 1.
n_classes = 2


class CustomDataset(Dataset):
    """Wrap a list of ``{'Description': str, 'label': int}`` records.

    Each item is tokenized lazily on access and returned as flat tensors
    suitable for a BERT-style sequence classifier.
    """

    def __init__(self, data, tokenizer, max_length=512):
        # data: sequence of dicts with a 'Description' text field and an
        # integer 'label'; tokenizer: any HF tokenizer with encode_plus.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['Description']
        label = item['label']

        # Encode the text: pad/truncate to max_length, return PyTorch tensors.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Validate the label before returning. Valid labels are
        # 0 .. n_classes - 1; the original check (`label > n_classes`)
        # wrongly accepted the out-of-range value n_classes itself.
        if label < 0 or label >= n_classes:
            raise ValueError("Found an invalid label")

        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
|
dataset/load_dataset.py
CHANGED
@@ -10,8 +10,11 @@ def prepare_dataset(df, tokenizer, max_length=512):
|
|
10 |
input_ids = []
|
11 |
attention_masks = []
|
12 |
labels = []
|
13 |
-
|
14 |
for _, row in df.iterrows():
|
|
|
|
|
|
|
|
|
15 |
encoded = tokenizer.encode_plus(
|
16 |
row['Description'],
|
17 |
add_special_tokens=True,
|
@@ -23,18 +26,19 @@ def prepare_dataset(df, tokenizer, max_length=512):
|
|
23 |
)
|
24 |
input_ids.append(encoded['input_ids'])
|
25 |
attention_masks.append(encoded['attention_mask'])
|
26 |
-
labels.append([row['SAS_Class'], row['SDS_Class']])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
input_ids = torch.cat(input_ids, dim=0)
|
29 |
attention_masks = torch.cat(attention_masks, dim=0)
|
30 |
labels = torch.tensor(labels, dtype=torch.float)
|
31 |
-
|
32 |
return TensorDataset(input_ids, attention_masks, labels)
|
33 |
|
34 |
-
# 分割数据集
|
35 |
-
train_df, val_df = train_test_split(df, test_size=0.1) # 以90%训练,10%验证的比例分割数据集
|
36 |
-
|
37 |
-
|
38 |
-
# 创建DataLoader
|
39 |
-
|
40 |
|
|
|
10 |
input_ids = []
|
11 |
attention_masks = []
|
12 |
labels = []
|
|
|
13 |
for _, row in df.iterrows():
|
14 |
+
# 检查标签是否有效(例如,不是NaN)
|
15 |
+
if pd.isna(row['SAS_Class']) or pd.isna(row['SDS_Class']):
|
16 |
+
continue # 跳过这个样本
|
17 |
+
|
18 |
encoded = tokenizer.encode_plus(
|
19 |
row['Description'],
|
20 |
add_special_tokens=True,
|
|
|
26 |
)
|
27 |
input_ids.append(encoded['input_ids'])
|
28 |
attention_masks.append(encoded['attention_mask'])
|
29 |
+
# labels.append([row['SAS_Class'], row['SDS_Class']])
|
30 |
+
# 将SAS_Class和SDS_Class转换为one-hot编码
|
31 |
+
sas_label = [0] * 4 # 初始化4个元素为0的列表
|
32 |
+
sds_label = [0] * 4 # 同上
|
33 |
+
sas_label[int(row['SAS_Class'])] = 1 # 将对应的位置设为1
|
34 |
+
sds_label[int(row['SDS_Class'])] = 1 # 同上
|
35 |
+
combined_label = sas_label + sds_label # 组合两个标签
|
36 |
+
|
37 |
+
labels.append(combined_label)
|
38 |
|
39 |
input_ids = torch.cat(input_ids, dim=0)
|
40 |
attention_masks = torch.cat(attention_masks, dim=0)
|
41 |
labels = torch.tensor(labels, dtype=torch.float)
|
|
|
42 |
return TensorDataset(input_ids, attention_masks, labels)
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
new.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1 |
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
|
2 |
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
3 |
-
from torch.nn import CrossEntropyLoss
|
4 |
import torch
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
-
|
7 |
from dataset.load_dataset import df, prepare_dataset
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
epochs = 10
|
10 |
tokenizer = AutoTokenizer.from_pretrained(
|
@@ -18,37 +21,88 @@ train_df, val_df = train_test_split(df, test_size=0.1) # 以90%训练,10%验
|
|
18 |
train_dataset = prepare_dataset(train_df, tokenizer)
|
19 |
val_dataset = prepare_dataset(val_df, tokenizer)
|
20 |
# 现在train_dataloader和validation_dataloader已准备好,可用于模型训练和验证
|
21 |
-
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=
|
22 |
-
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
model = AutoModelForSequenceClassification.from_pretrained(
|
25 |
-
"pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition").to(device)
|
26 |
-
input = tokenizer("I love using transformers for natural language processing.", return_tensors="pt")
|
27 |
-
# 使用模型进行预测
|
28 |
-
# with torch.no_grad():
|
29 |
-
# logits = model(**input).logits
|
30 |
-
# 解析预测结果
|
31 |
-
# predicted_class_id = logits.argmax().item()
|
32 |
-
# print(f"Predicted class id: {predicted_class_id}")
|
33 |
# 准备优化器和学习率调度器
|
34 |
-
optimizer = AdamW(model.parameters(), lr=
|
35 |
total_steps = len(train_dataloader) * epochs # epochs是您想要训练的轮数
|
36 |
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
|
37 |
-
|
38 |
# 微调模型
|
39 |
-
|
|
|
40 |
for epoch in range(epochs): # 迭代多个epoch
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
42 |
# 将数据加载到GPU
|
43 |
batch = tuple(t.to(device) for t in batch)
|
44 |
b_input_ids, b_input_mask, b_labels = batch
|
45 |
model.zero_grad()
|
46 |
# 前向传播
|
47 |
-
outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask
|
48 |
-
loss = outputs.loss
|
49 |
logits = outputs.logits
|
50 |
# 反向传播
|
51 |
-
loss
|
52 |
-
|
53 |
-
|
54 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
|
2 |
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
|
|
3 |
import torch
|
4 |
from sklearn.model_selection import train_test_split
|
|
|
5 |
from dataset.load_dataset import df, prepare_dataset
|
6 |
+
from torch.nn import BCEWithLogitsLoss
|
7 |
+
from transformers import BertForSequenceClassification, BertConfig
|
8 |
+
from tqdm.auto import tqdm
|
9 |
+
from torch.cuda.amp import GradScaler, autocast
|
10 |
+
|
11 |
|
12 |
epochs = 10
|
13 |
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
21 |
train_dataset = prepare_dataset(train_df, tokenizer)
|
22 |
val_dataset = prepare_dataset(val_df, tokenizer)
|
23 |
# 现在train_dataloader和validation_dataloader已准备好,可用于模型训练和验证
|
24 |
+
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
|
25 |
+
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)
|
26 |
+
|
27 |
+
|
28 |
+
# 加载配置
|
29 |
+
config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
|
30 |
+
config.num_labels = 8 # 调整为你的标签数量
|
31 |
|
32 |
model = AutoModelForSequenceClassification.from_pretrained(
|
33 |
+
"pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition",config=config,ignore_mismatched_sizes=True).to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
# 准备优化器和学习率调度器
|
35 |
+
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
|
36 |
total_steps = len(train_dataloader) * epochs # epochs是您想要训练的轮数
|
37 |
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
|
38 |
+
loss_fn = BCEWithLogitsLoss()
|
39 |
# 微调模型
|
40 |
+
scaler = GradScaler()
|
41 |
+
|
42 |
# Fine-tuning loop: one training pass + one validation pass per epoch.
# Multi-label setup: 8 logits (two one-hot groups of 4), BCEWithLogitsLoss,
# sigmoid + 0.5 threshold at eval time.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print('-------------------------------')
    model.train()
    total_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
    for step, batch in enumerate(train_progress_bar):
        # Move the batch to the GPU.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        # Forward pass under autocast: GradScaler is only meaningful when
        # the forward/loss computation actually runs in mixed precision.
        with autocast():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels)
        # Skip non-finite losses BEFORE accumulating, otherwise a single
        # NaN batch poisons the epoch's reported average loss.
        if torch.isnan(loss).any():
            print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
            # NOTE(review): skipping hides the batch; the root cause of the
            # NaN (e.g. bad labels) should still be investigated.
            continue
        total_loss += loss.item()
        # Backward, then unscale so gradient clipping sees the true
        # gradient norms (clipping before backward was a no-op: the
        # gradients did not exist yet).
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        # Advance the LR schedule (was commented out, so the warmup/decay
        # schedule created above never took effect).
        scheduler.step()
        train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.2f}")

    # Validation phase.
    model.eval()
    total_eval_accuracy = 0
    eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)

    for batch in eval_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits
        # Multi-label prediction: sigmoid probabilities, thresholded at 0.5.
        probs = torch.sigmoid(logits)
        predictions = (probs > 0.5).int()

        # Per-label exact-match rate, averaged per sample then per batch.
        correct_predictions = (predictions == b_labels.int()).float()
        accuracy_per_sample = correct_predictions.mean(dim=1)
        accuracy = accuracy_per_sample.mean().item()
        total_eval_accuracy += accuracy
        eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
|