Tokymin committed on
Commit
fc547f0
1 Parent(s): 1f4f3bd

Epoch 10/10


-------------------------------
Training loss: 0.30
Validation Accuracy: 0.86

Files changed (3)
  1. dataset/CustomDataset.py +40 -0
  2. dataset/load_dataset.py +13 -9
  3. new.py +76 -22
dataset/CustomDataset.py ADDED
@@ -0,0 +1,40 @@
+ import torch
+ from torch.utils.data import Dataset
+
+ n_classes = 2
+
+ class CustomDataset(Dataset):
+     def __init__(self, data, tokenizer, max_length=512):
+         self.data = data
+         self.tokenizer = tokenizer
+         self.max_length = max_length
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         item = self.data[idx]
+         # Each data item is assumed to be a dict with "Description" and "label" keys
+         text = item['Description']
+         label = item['label']
+
+         # Encode the text
+         encoded = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_length,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt',
+         )
+         # Add any data checks needed here,
+         # e.g. verify that the label falls within the expected range
+         if label < 0 or label >= n_classes:  # n_classes is the number of label classes
+             raise ValueError("Found an invalid label")
+
+         return {
+             'input_ids': encoded['input_ids'].flatten(),
+             'attention_mask': encoded['attention_mask'].flatten(),
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
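For context, a minimal usage sketch of the new CustomDataset (not part of the commit). The sample records and the bert-base-uncased checkpoint are placeholders for illustration only; any tokenizer compatible with encode_plus would do.

from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from dataset.CustomDataset import CustomDataset

# Hypothetical records with the 'Description' and 'label' keys __getitem__ expects
records = [
    {'Description': 'Patient reports persistent headaches and poor sleep.', 'label': 1},
    {'Description': 'Routine follow-up, no complaints.', 'label': 0},
]
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # placeholder checkpoint
dataset = CustomDataset(records, tokenizer, max_length=128)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

batch = next(iter(loader))
print(batch['input_ids'].shape)       # torch.Size([2, 128])
print(batch['attention_mask'].shape)  # torch.Size([2, 128])
print(batch['labels'])                # tensor with the two labels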
dataset/load_dataset.py CHANGED
@@ -10,8 +10,11 @@ def prepare_dataset(df, tokenizer, max_length=512):
      input_ids = []
      attention_masks = []
      labels = []
-
      for _, row in df.iterrows():
+         # Skip samples whose labels are invalid (e.g. NaN)
+         if pd.isna(row['SAS_Class']) or pd.isna(row['SDS_Class']):
+             continue  # skip this sample
+
          encoded = tokenizer.encode_plus(
              row['Description'],
              add_special_tokens=True,
@@ -23,18 +26,19 @@ def prepare_dataset(df, tokenizer, max_length=512):
          )
          input_ids.append(encoded['input_ids'])
          attention_masks.append(encoded['attention_mask'])
-         labels.append([row['SAS_Class'], row['SDS_Class']])
+         # labels.append([row['SAS_Class'], row['SDS_Class']])
+         # Convert SAS_Class and SDS_Class to one-hot encodings
+         sas_label = [0] * 4  # list of four zeros
+         sds_label = [0] * 4  # likewise
+         sas_label[int(row['SAS_Class'])] = 1  # set the matching position to 1
+         sds_label[int(row['SDS_Class'])] = 1  # likewise
+         combined_label = sas_label + sds_label  # concatenate the two labels
+
+         labels.append(combined_label)

      input_ids = torch.cat(input_ids, dim=0)
      attention_masks = torch.cat(attention_masks, dim=0)
      labels = torch.tensor(labels, dtype=torch.float)
-
      return TensorDataset(input_ids, attention_masks, labels)

- # Split the dataset
- train_df, val_df = train_test_split(df, test_size=0.1)  # split into 90% train / 10% validation
-
-
- # Create the DataLoader
-

new.py CHANGED
@@ -1,10 +1,13 @@
  from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
  from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
- from torch.nn import CrossEntropyLoss
  import torch
  from sklearn.model_selection import train_test_split
-
  from dataset.load_dataset import df, prepare_dataset
+ from torch.nn import BCEWithLogitsLoss
+ from transformers import BertForSequenceClassification, BertConfig
+ from tqdm.auto import tqdm
+ from torch.cuda.amp import GradScaler, autocast
+

  epochs = 10
  tokenizer = AutoTokenizer.from_pretrained(
@@ -18,37 +21,88 @@ train_df, val_df = train_test_split(df, test_size=0.1)  # split into 90% train / 10% validation
  train_dataset = prepare_dataset(train_df, tokenizer)
  val_dataset = prepare_dataset(val_df, tokenizer)
  # train_dataloader and validation_dataloader are now ready for model training and validation
- train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=64)
- validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=64)
+ train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
+ validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)
+
+
+ # Load the configuration
+ config = BertConfig.from_pretrained("pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition")
+ config.num_labels = 8  # adjust to your number of labels

  model = AutoModelForSequenceClassification.from_pretrained(
-     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition").to(device)
- input = tokenizer("I love using transformers for natural language processing.", return_tensors="pt")
- # Use the model to make a prediction
- # with torch.no_grad():
- #     logits = model(**input).logits
- # Parse the prediction
- # predicted_class_id = logits.argmax().item()
- # print(f"Predicted class id: {predicted_class_id}")
+     "pretrained_models/Bio_ClinicalBERT-finetuned-medicalcondition", config=config, ignore_mismatched_sizes=True).to(device)
  # Prepare the optimizer and learning-rate scheduler
- optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
+ optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
  total_steps = len(train_dataloader) * epochs  # epochs is the number of training epochs
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
-
+ loss_fn = BCEWithLogitsLoss()
  # Fine-tune the model
- model.train()
+ scaler = GradScaler()
+
  for epoch in range(epochs):  # iterate over epochs
-     for step, batch in enumerate(train_dataloader):
+     print(f"\nEpoch {epoch + 1}/{epochs}")
+     print('-------------------------------')
+     model.train()
+     total_loss = 0
+     train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
+     for step, batch in enumerate(train_progress_bar):
          # Move the batch to the GPU
          batch = tuple(t.to(device) for t in batch)
          b_input_ids, b_input_mask, b_labels = batch
          model.zero_grad()
          # Forward pass
-         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
-         loss = outputs.loss
+         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
          logits = outputs.logits
          # Backward pass
-         loss.backward()
-         optimizer.step()
-         scheduler.step()
-         # Evaluation is omitted here, but it is very important in practice
+         loss = loss_fn(logits, b_labels)
+         total_loss += loss.item()
+         # loss.backward()
+         # optimizer.step()
+         # scheduler.step()
+         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+         # Check whether the loss is NaN
+         if torch.isnan(loss).any():
+             print(f"Loss is nan in epoch {epoch + 1}, step {step}.")
+             # Optional: print more information about the problematic batch or take other action
+             # Note: simply skipping does not fix the root cause; better to investigate why the loss is NaN
+             continue  # skip the backward pass and optimizer step for this batch
+         scaler.scale(loss).backward()
+         scaler.step(optimizer)
+         scaler.update()
+         train_progress_bar.set_postfix({'loss': f"{loss.item():.2f}"})
+
+     # Evaluation phase
+     avg_train_loss = total_loss / len(train_dataloader)
+     print(f"Training loss: {avg_train_loss:.2f}")
+
+     # Validation phase
+     model.eval()
+     total_eval_accuracy = 0
+     eval_progress_bar = tqdm(validation_dataloader, desc="Validation", leave=False)
+
+     for batch in eval_progress_bar:
+         batch = tuple(t.to(device) for t in batch)
+         b_input_ids, b_input_mask, b_labels = batch
+         with torch.no_grad():
+             outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+
+         logits = outputs.logits
+         # predictions = torch.argmax(logits, dim=1).flatten()
+         # labels = b_labels.flatten()
+         # accuracy = (predictions == labels).cpu().numpy().mean()
+         # Convert logits to probabilities with a sigmoid
+         probs = torch.sigmoid(logits)
+         # Predict positive (1) when the probability is above 0.5, otherwise negative (0)
+         predictions = (probs > 0.5).int()
+
+         # Compare predictions with the true labels
+         correct_predictions = (predictions == b_labels.int()).float()  # make sure the labels are integers too
+         # Average correct predictions per sample, then over the whole batch
+         accuracy_per_sample = correct_predictions.mean(dim=1)
+         accuracy = accuracy_per_sample.mean().item()
+         total_eval_accuracy += accuracy
+         # Update the progress bar
+         eval_progress_bar.set_postfix({'accuracy': f"{accuracy:.2f}"})
+
+     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
+     print(f"Validation Accuracy: {avg_val_accuracy:.2f}")