|
|
|
import csv

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, TensorDataset
from transformers import AdamW, BertConfig, BertTokenizer
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

|
class MyDataSet(Dataset):
    """Thin Dataset wrapper around a pandas DataFrame, used here only for random_split."""

    def __init__(self, loaded_data):
        self.data = loaded_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional indexing; DataFrame[idx] with an integer would select a column instead.
        return self.data.iloc[idx]

|
Data_path = "/kaggle/input/inference/train.csv"
Total_data = pd.read_csv(Data_path)
# Work on a 10% sample and drop rows with a missing value in the "2" column.
Total_data = Total_data.sample(frac=0.1)
Total_data = Total_data.dropna(axis=0, subset=["2"])
custom_dataset = MyDataSet(Total_data)

# 60% train / 10% validation / 30% test split.
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset, [train_size, validate_size, test_size])

train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# random_split returns Subset objects; use their indices to slice the original DataFrame.
train_dataset = Total_data.iloc[train_dataset.indices]
validate_dataset = Total_data.iloc[validate_dataset.indices]
test_dataset = Total_data.iloc[test_dataset.indices]

train_dataset.to_csv(train_data_path, index=False, header=True)
validate_dataset.to_csv(dev_data_path, index=False, header=True)
test_dataset.to_csv(test_data_path, index=False, header=True)

data = pd.read_csv(train_data_path)
print(data.head())

|
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()

        pretrained_weights = "bert-base-chinese"
        self.bert = transformers.BertModel.from_pretrained(pretrained_weights)
        # Fine-tune the whole encoder, not just the classification head.
        for param in self.bert.parameters():
            param.requires_grad = True

        # 768 = BERT-base hidden size; 3 classes (neutral / entailment / contradiction).
        self.dense = nn.Linear(768, 3)

    def forward(self, input_ids, token_type_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
        # bert_output[1] is the pooled [CLS] representation.
        bert_cls_hidden_state = bert_output[1]
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output

|
def encoder(max_len, vocab_path, text_list):
    # vocab_path is kept for compatibility with the caller but is not used here:
    # the tokenizer is loaded directly via from_pretrained.
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    encoded = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids']
    token_type_ids = encoded['token_type_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, token_type_ids, attention_mask

|
labels2dict = {"neutral": 0, "entailment": 1, "contradiction": 2}


def load_data(path):
    text_list = []
    labels = []
    with open(path) as csvFileObj:
        readerObj = csv.reader(csvFileObj)
        for row in readerObj:
            # Skip the header row.
            if readerObj.line_num == 1:
                continue
            # Column 0 holds the label name, column 1 the text.
            label = int(labels2dict[row[0]])
            text = row[1]
            text_list.append(text)
            labels.append(label)

    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150,
        vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list)
    labels = torch.tensor(labels)

    data = TensorDataset(input_ids, token_type_ids, attention_mask, labels)
    return data

|
batch_size = 16

train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

|
def dev(model, dev_loader):
    """Compute accuracy on the validation loader."""
    model.to(device)
    model.eval()

    with torch.no_grad():
        correct = 0
        total = 0
        for step, (input_ids, token_type_ids, attention_mask, labels) in tqdm(
                enumerate(dev_loader), desc='Dev Iteration:'):
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
        res = correct / total
        return res
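

# Optional: classification_report and accuracy_score are imported above but never used.
# A minimal sketch of a per-class evaluation helper built on the same loop as dev();
# the name dev_report is an illustrative choice, not part of the original script.
def dev_report(model, data_loader):
    model.to(device)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, labels in data_loader:
            input_ids, token_type_ids, attention_mask = (
                input_ids.to(device), token_type_ids.to(device), attention_mask.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            preds = torch.argmax(out_put, dim=1).cpu()
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    print("accuracy:", accuracy_score(all_labels, all_preds))
    print(classification_report(all_labels, all_preds,
                                labels=list(labels2dict.values()),
                                target_names=list(labels2dict.keys())))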
|
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def train(model, train_loader, dev_loader):
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()

    # Apply weight decay to all parameters except biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # Halve the learning rate when the best dev accuracy stops improving.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, min_lr=1e-7,
                                  patience=5, verbose=True, threshold=0.0001, eps=1e-08)
    t_total = len(train_loader)

    total_epochs = 10
    bestAcc = 0
    correct = 0
    total = 0
    print('Training and verification begin!')
    for epoch in range(total_epochs):
        for step, (input_ids, token_type_ids, attention_mask, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()

            if (step + 1) % 10 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}], step[{}/{}], train_acc: {:.6f}%, loss: {:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, loss.item()))

            if (step + 1) % 200 == 0:
                train_acc = correct / total
                # Evaluate on the dev set and keep the checkpoint with the best dev accuracy.
                acc = dev(model, dev_loader)
                model.train()  # dev() switches to eval mode; switch back before training resumes.
                if bestAcc < acc:
                    bestAcc = acc
                    path = 'bert_model.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}], step[{}/{}], train_acc: {:.6f}%, bestAcc: {:.6f}%, dev_acc: {:.6f}%, loss: {:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader),
                    train_acc * 100, bestAcc * 100, acc * 100, loss.item()))
        scheduler.step(bestAcc)

|
# Path to a previously saved checkpoint (not used below; train() saves its own 'bert_model.pkl').
path = '/kaggle/input/inference/bert_model.pkl'

model = BertClassificationModel()

train(model, train_loader, dev_loader)
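

# Optional follow-up: test_loader is built above but never used. A minimal sketch,
# assuming training has finished and 'bert_model.pkl' was written by train(); it
# reloads the best checkpoint and reuses dev() to report test-set accuracy.
# (On newer PyTorch versions, torch.load may need weights_only=False to unpickle
# a full model object.)
best_model = torch.load('bert_model.pkl')
test_acc = dev(best_model, test_loader)
print("Test accuracy: {:.6f}%".format(test_acc * 100))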
|
|