import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
)

# Load the preprocessed data.
df = pd.read_csv("dataset/processed_new_data.csv")


def _one_hot(class_index, num_classes, column):
    """Return a one-hot list of length *num_classes* with *class_index* set to 1."""
    if not 0 <= class_index < num_classes:
        # Without this check a negative value would wrap around through
        # Python's negative indexing and silently mark the wrong class.
        raise IndexError(
            f"{column} value {class_index} is outside the valid range "
            f"[0, {num_classes})"
        )
    encoding = [0] * num_classes
    encoding[class_index] = 1
    return encoding


def prepare_dataset(df, tokenizer, max_length=512, num_classes=4):
    """Tokenize descriptions and build a ``TensorDataset`` with one-hot labels.

    Each row contributes the encoded ``Description`` text and a label vector
    made of the one-hot ``SAS_Class`` followed by the one-hot ``SDS_Class``.
    Rows whose ``SAS_Class``, ``SDS_Class`` or ``Description`` is missing
    (NaN/None) are skipped.

    Args:
        df: DataFrame with ``Description``, ``SAS_Class`` and ``SDS_Class``
            columns.
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``return_tensors='pt'`` and must return a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
        num_classes: Number of classes in each of the two label columns.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)`` where ``labels``
        is a float tensor with ``2 * num_classes`` columns. When no row is
        usable the dataset is empty.

    Raises:
        IndexError: If a class value lies outside ``[0, num_classes)``.
    """
    input_ids = []
    attention_masks = []
    labels = []

    for _, row in df.iterrows():
        # Skip samples with a missing label.
        if pd.isna(row['SAS_Class']) or pd.isna(row['SDS_Class']):
            continue
        # Skip samples with no text to encode.
        if pd.isna(row['Description']):
            continue

        # Build (and validate) the label before tokenizing so that an invalid
        # row does not cost a tokenizer call.
        combined_label = (
            _one_hot(int(row['SAS_Class']), num_classes, 'SAS_Class')
            + _one_hot(int(row['SDS_Class']), num_classes, 'SDS_Class')
        )

        encoded = tokenizer.encode_plus(
            row['Description'],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(combined_label)

    if not input_ids:
        # torch.cat() rejects an empty list, so build the empty tensors
        # directly. The shapes mirror what padding='max_length' requests;
        # the integer dtype of the id/mask tensors is an assumption about
        # the tokenizer's output -- TODO confirm if a caller depends on it.
        return TensorDataset(
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, 2 * num_classes), dtype=torch.float),
        )

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float)

    return TensorDataset(input_ids, attention_masks, labels)