# %%
import csv

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import BertTokenizer, BertConfig, AdamW
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# %%
class MyDataSet(Dataset):
    def __init__(self, loaded_data):
        self.data = loaded_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

Data_path = "/kaggle/input/inference/train.csv"
Total_data = pd.read_csv(Data_path)
Total_data = Total_data.sample(frac=0.1)
Total_data = Total_data.dropna(axis=0, subset=["2"])
custom_dataset = MyDataSet(Total_data)

# Split into train/validation/test by ratio (60% / 10% / 30%)
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset, [train_size, validate_size, test_size])

# Output paths for the three splits
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

train_dataset = Total_data.iloc[train_dataset.indices]
validate_dataset = Total_data.iloc[validate_dataset.indices]
test_dataset = Total_data.iloc[test_dataset.indices]

# index=False drops the row index; header=True keeps the column names
train_dataset.to_csv(train_data_path, index=False, header=True)
validate_dataset.to_csv(dev_data_path, index=False, header=True)
test_dataset.to_csv(test_data_path, index=False, header=True)

# %%
data = pd.read_csv(train_data_path)
data.head()

# %%
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        # Load the pretrained model
        pretrained_weights = "bert-base-chinese"
        self.bert = transformers.BertModel.from_pretrained(pretrained_weights)
        for param in self.bert.parameters():
            param.requires_grad = True
        # Linear classification head
        self.dense = nn.Linear(768, 3)

    def forward(self, input_ids, token_type_ids, attention_mask):
        # Run the inputs through BERT
        bert_output = self.bert(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
        # bert_output[1] is the pooled [CLS] representation
        bert_cls_hidden_state = bert_output[1]
        # Map the 768-dim vector to 3 class logits
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output

# %%
def encoder(max_len, vocab_path, text_list):
    # Convert text_list into the three input tensors BERT expects
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    encoded = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt'  # return PyTorch tensors
    )
    input_ids = encoded['input_ids']
    token_type_ids = encoded['token_type_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, token_type_ids, attention_mask

# %%
labels2dict = {"neutral": 0, "entailment": 1, "contradiction": 2}

def load_data(path):
    text_list = []
    labels = []
    with open(path) as csvFileObj:
        readerObj = csv.reader(csvFileObj)
        for row in readerObj:
            # Skip the header row
            if readerObj.line_num == 1:
                continue
            # Adjust these column indices to wherever the label and text live
            label = int(labels2dict[row[0]])
            text = row[1]
            text_list.append(text)
            labels.append(label)
    # Encode the texts into the three BERT input tensors
    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150,
        vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list)
    labels = torch.tensor(labels)
    # Wrap the encoder outputs and the labels in a TensorDataset
    data = TensorDataset(input_ids, token_type_ids, attention_mask, labels)
    return data
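# %%
# Illustrative sanity check (not part of the original script): encode two
# made-up Chinese sentences with the encoder helper above and confirm that
# the three returned tensors share the shape (batch_size, seq_len).
sample_ids, sample_types, sample_mask = encoder(
    max_len=150,
    vocab_path="/root/Bert/bert-base-chinese/vocab.txt",  # unused by encoder; kept for the same signature
    text_list=["今天天气很好。", "他没有去上班。"])
print(sample_ids.shape, sample_types.shape, sample_mask.shape)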
# %%
# Set the batch size
batch_size = 16

# Paths to the three split files written above
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# Load each split as a TensorDataset
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)

# Wrap the datasets in DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# %%
def dev(model, dev_loader):
    model.to(device)
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for step, (input_ids, token_type_ids, attention_mask, labels) in tqdm(
                enumerate(dev_loader), desc='Dev Iteration:'):
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
        res = correct / total
        return res

# %%
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def train(model, train_loader, dev_loader):
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # Halve the learning rate when the dev accuracy plateaus
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, min_lr=1e-7,
                                  patience=5, verbose=True, threshold=0.0001, eps=1e-08)
    t_total = len(train_loader)
    total_epochs = 10
    bestAcc = 0
    correct = 0
    total = 0
    print('Training and verification begin!')
    for epoch in range(total_epochs):
        for step, (input_ids, token_type_ids, attention_mask, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
            # Print training stats every 10 steps
            if (step + 1) % 10 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, loss.item()))
            # Validate every 200 steps and save the model whenever dev accuracy improves
            if (step + 1) % 200 == 0:
                train_acc = correct / total
                acc = dev(model, dev_loader)
                # dev() switches the model to eval mode; switch back before resuming training
                model.train()
                if bestAcc < acc:
                    bestAcc = acc
                    # Model save path
                    path = 'bert_model.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, bestAcc * 100, acc * 100, loss.item()))
        scheduler.step(bestAcc)

# %%
path = '/kaggle/input/inference/bert_model.pkl'
# model = torch.load(path)
# Instantiate the model
model = BertClassificationModel()
# Run training and validation
train(model, train_loader, dev_loader)
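# %%
# Illustrative addition (not part of the original script): evaluate the best
# checkpoint on the held-out test split, using the accuracy_score and
# classification_report already imported above. This sketch assumes training
# above has saved 'bert_model.pkl'; the class-name order mirrors labels2dict.
best_model = torch.load('bert_model.pkl')
best_model.to(device)
best_model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for input_ids, token_type_ids, attention_mask, labels in test_loader:
        input_ids, token_type_ids, attention_mask = (
            input_ids.to(device), token_type_ids.to(device), attention_mask.to(device))
        logits = best_model(input_ids, token_type_ids, attention_mask)
        all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
        all_labels.extend(labels.tolist())
print("Test accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds,
                            target_names=["neutral", "entailment", "contradiction"]))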