# -*- coding: utf-8 -*-
"""FinetuneHUPD.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17c2CQZx_kyD3-0fuQqv_pCMJ0Evd7fLN
"""

# Datasets load_dataset function
from datasets import load_dataset

# Transformers AutoTokenizer and DistilBERT sequence-classification model
from transformers import AutoTokenizer, DistilBertForSequenceClassification

from torch.utils.data import DataLoader
from torch.optim import AdamW  # AdamW has been removed from recent transformers releases; use the torch implementation
import torch

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Load the HUPD sample split. Note: the train and validation filing-date windows
# below overlap, so the validation set largely mirrors the training data;
# adjust the dates if a disjoint split is needed.
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
    val_filing_start_date='2016-01-01',
    val_filing_end_date='2016-01-31',
)

print('Loading is done!')

# Label-to-index mapping for the decision status field
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}

# Helper function: map the string decision label to its integer index
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

# Re-labeling/mapping
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)

# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION_ = 'abstract'

# Training set
train_set = train_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)

# Validation set
val_set = val_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)

# Set the format: return PyTorch tensors for the columns the model needs
train_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])
val_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])

#print(train_set['decision'])

# train_dataloader and val_dataloader
train_dataloader = DataLoader(train_set, batch_size=16)
val_dataloader = DataLoader(val_set, batch_size=16)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The classification head must match the six decision labels defined above
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(decision_to_str))
model.to(device)

print(device)
print("torch cuda is available:", torch.cuda.is_available())

model.train()
optim = AdamW(model.parameters(), lr=5e-5)

num_training_epochs = 2
for epoch in range(num_training_epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        print("batch finished")

model.eval()
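
# --- Added sketch (not part of the original notebook) ---
# The script switches the model to eval mode but never measures validation
# performance. Below is a minimal sketch of a validation-accuracy loop,
# assuming the batch layout produced above ('input_ids', 'attention_mask',
# 'decision') and the val_dataloader defined earlier.
correct = 0
total = 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Validation accuracy: {correct / total:.4f}")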