import numpy as np
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import get_linear_schedule_with_warmup

from process_data import getDF
from sampler import BalanceSampler

NUM_CLASSES = 13
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Load BERT with a classification head sized for the 13 genre classes.
# Passing num_labels makes from_pretrained build the head as
# Linear(768 -> NUM_CLASSES), and .to(device) then moves the whole model,
# head included, onto the GPU when one is available.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_CLASSES
).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(model)


def get_input_id_and_attention_masks():
    df = getDF()

    input_ids = []
    attention_masks = []
    for summ in df['summary']:
        # encode_plus returns a dict of [1, 512] tensors: the padded/truncated
        # token ids and the matching attention mask.
        encoded_dict = tokenizer.encode_plus(
            summ,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # BCEWithLogitsLoss below expects float multi-hot targets, so cast here.
    labels = torch.from_numpy(np.array(df['genre_id'].tolist())).float()
    return input_ids, attention_masks, labels
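
# Illustrative assumption (not taken from process_data): getDF() is expected to
# return a DataFrame where 'summary' holds raw text and 'genre_id' holds one
# multi-hot vector of length NUM_CLASSES per row, e.g.
#   summary:  "A detective hunts a serial killer."
#   genre_id: [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# Both BCEWithLogitsLoss and calc_accuracy below rely on this multi-hot layout.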
|
|
|
|
def createTensorDS(input_ids, attention_masks, labels):
    return TensorDataset(input_ids, attention_masks, labels)


def split(tensorDataset):
    # Hold out 15% of the data for validation.
    train_size = int(0.85 * len(tensorDataset))
    val_size = len(tensorDataset) - train_size
    train_dataset, val_dataset = random_split(tensorDataset, [train_size, val_size])
    return train_dataset, val_dataset
|
|
def createDataloaders(train_dataset, val_dataset):
    batch_size = 16

    # BalanceSampler comes from the project's own sampler module; judging by
    # its name, it draws class-balanced training batches.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=BalanceSampler(train_dataset),
        batch_size=batch_size
    )

    # Validation is read in order; no shuffling or rebalancing is needed.
    valid_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size
    )
    return train_dataloader, valid_dataloader
|
|
|
|
def calc_accuracy(logits, labels):
    # Top-5 recall: the fraction of true labels that appear among the five
    # highest-scoring logits of their example.
    label = []
    num_ones = 0
    acc = 0
    for label_set in labels:
        labs = []
        for ind, res in enumerate(label_set):
            if res.item() == 1:
                labs.append(ind)
        label.append(labs)
        num_ones += len(labs)

    for i, log in enumerate(logits):
        top_out = (-log).argsort()[:5]  # indices of the five largest logits
        for ind in top_out:
            if ind.item() in label[i]:
                acc = acc + 1
    return acc / num_ones
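
# A hypothetical sanity check (not in the original) for calc_accuracy: sample 0
# has true classes {1, 3} and both land in its top 5; sample 1 has true class 0
# but its logits rank class 0 last. 2 of the 3 positives are recovered -> 2/3.
_labels = torch.zeros(2, NUM_CLASSES)
_labels[0, 1] = _labels[0, 3] = 1.0
_labels[1, 0] = 1.0
_logits = torch.zeros(2, NUM_CLASSES)
_logits[0, 1], _logits[0, 3] = 5.0, 4.0
_logits[1] = torch.arange(NUM_CLASSES, dtype=torch.float)  # class 0 scored lowest
assert abs(calc_accuracy(_logits, _labels) - 2 / 3) < 1e-6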
|
|
def train(model, train_loader, val_loader, epochs):
    total_steps = len(train_loader) * epochs
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=2e-5,
                                 eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    # Multi-label objective: an independent sigmoid/BCE term per genre.
    loss_fn = torch.nn.BCEWithLogitsLoss()
    for epoch in range(epochs):
        total_train_loss = 0
        total_train_acc = 0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()
            out = model(input_ids, attention_mask=input_mask)

            logits = out['logits']
            loss = loss_fn(logits, labels)

            total_train_acc += calc_accuracy(logits.detach(), labels)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)
        avg_train_acc = total_train_acc / len(train_loader)
        print('train_loss: ', avg_train_loss)
        print('train_acc: ', avg_train_acc)

        print("Running Validation...")
        model.eval()
        total_eval_loss = 0
        total_eval_acc = 0

        for batch in val_loader:
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            with torch.no_grad():
                out = model(input_ids, attention_mask=input_mask)

            logits = out['logits']
            loss = loss_fn(logits, labels)
            total_eval_loss += loss.item()
            total_eval_acc += calc_accuracy(logits, labels)

        avg_loss_eval = total_eval_loss / len(val_loader)
        avg_acc_eval = total_eval_acc / len(val_loader)
        print(
            'epoch: ', epoch,
            'train_loss: ', avg_train_loss,
            'valid_loss: ', avg_loss_eval,
            'valid_acc: ', avg_acc_eval,
        )
# Build the dataset, split it 85/15, and fine-tune for 3 epochs.
input_ids, attention_masks, labels = get_input_id_and_attention_masks()
ds = createTensorDS(input_ids, attention_masks, labels)
train_dataset, val_dataset = split(ds)
train_dataloader, valid_dataloader = createDataloaders(train_dataset, val_dataset)
train(model, train_dataloader, valid_dataloader, 3)
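
# Optional follow-up (not part of the original script): persist the fine-tuned
# model and tokenizer with the standard Hugging Face save_pretrained API.
# "genre-bert" is a placeholder output directory.
model.save_pretrained("genre-bert")
tokenizer.save_pretrained("genre-bert")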
|
|