# -*- coding: utf-8 -*-
"""FinetuneHUPD.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17c2CQZx_kyD3-0fuQqv_pCMJ0Evd7fLN
"""
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer and the DistilBERT sequence-classification model
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
# Use the PyTorch AdamW (the transformers.AdamW optimizer is deprecated)
from torch.optim import AdamW
import torch
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# Load the sample version of HUPD. Identical train/val filing windows would make
# the two splits overlap, so the validation window starts where training ends.
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
print('Loading is done!')
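# Optional sanity check (an addition, not in the original notebook):
# inspect the resulting splits and their sizes before training.
pprint(dataset_dict)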
# Label-to-index mapping for the decision status field
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}

# Helper function: convert the string decision label to its integer index
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

# Re-label the decision field on both splits
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)
# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION_ = 'abstract'

# Training set
train_set = train_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Validation set
val_set = val_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Keep only the model inputs and the label column, formatted as PyTorch tensors
train_set.set_format(type='torch',
    columns=['input_ids', 'attention_mask', 'decision'])
val_set.set_format(type='torch',
    columns=['input_ids', 'attention_mask', 'decision'])
# DataLoaders; shuffle the training data each epoch
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=16)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# num_labels must match the six decision classes (the default head has only two)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(decision_to_str))
model.to(device)
print(f'Using device: {device} (CUDA available: {torch.cuda.is_available()})')
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
num_training_epochs = 2
for epoch in range(num_training_epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
    print(f'Epoch {epoch + 1} finished, last batch loss: {loss.item():.4f}')
model.eval()
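# A minimal evaluation sketch (an addition, not in the original notebook):
# val_dataloader is otherwise unused above, so compute simple accuracy
# on the validation split with the fine-tuned model.
correct, total = 0, 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f'Validation accuracy: {correct / total:.4f}')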