# -*- coding: utf-8 -*-
"""FinetuneHUPD.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/17c2CQZx_kyD3-0fuQqv_pCMJ0Evd7fLN
"""
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer, DistilBERT classes, and Trainer utilities
from transformers import AutoTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
# PyTorch DataLoader and AdamW optimizer (transformers' AdamW is deprecated in newer releases)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
dataset_dict = load_dataset('HUPD/hupd',
                            name='sample',
                            data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
                            icpr_label=None,
                            train_filing_start_date='2016-01-01',
                            train_filing_end_date='2016-01-31',
                            val_filing_start_date='2016-01-01',
                            val_filing_end_date='2016-01-31',
                            )
print('Loading is done!')
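# Optional sanity check (an addition, not part of the original notebook): peek at
# one training example's abstract with pprint to confirm the sample split loaded.
pprint(dataset_dict['train'][0]['abstract'])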
# Label-to-index mapping for the decision status field
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
# Helper function: map the textual decision label to its integer id
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}
# Re-labeling/mapping.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)
# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION_ = 'abstract'
# Training set
train_set = train_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Validation set
val_set = val_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Set the format: return PyTorch tensors for the model inputs and the 'decision' labels
train_set.set_format(type='torch',
                     columns=['input_ids', 'attention_mask', 'decision'])
val_set.set_format(type='torch',
                   columns=['input_ids', 'attention_mask', 'decision'])
#print(train_set['decision'])
# Training and validation dataloaders
train_dataloader = DataLoader(train_set, batch_size=16)
val_dataloader = DataLoader(val_set, batch_size=16)
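# Optional sanity check (an addition; assumes the tokenizer above pads to its
# 512-token max_length): confirm one batch has the expected tensor shapes.
sample_batch = next(iter(train_dataloader))
print(sample_batch['input_ids'].shape)       # e.g. torch.Size([16, 512])
print(sample_batch['attention_mask'].shape)  # same shape as input_ids
print(sample_batch['decision'].shape)        # torch.Size([16])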
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# The decision field has six classes, so size the classification head accordingly
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                             num_labels=len(decision_to_str))
model.to(device)
print(device)
print("torch cuda is available:", torch.cuda.is_available())
# Fine-tune the model on the tokenized abstracts
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
num_training_epochs = 2
for epoch in range(num_training_epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        print("batch finished")
model.eval()
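# The original script stops after switching to eval mode; below is a minimal
# evaluation sketch (an addition, not part of the original notebook) that
# reports accuracy on the validation dataloader built above.
correct, total = 0, 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"Validation accuracy: {correct / total:.4f}")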