# -*- coding: utf-8 -*-
"""FinetuneHUPD.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17c2CQZx_kyD3-0fuQqv_pCMJ0Evd7fLN
"""

# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer and the DistilBERT classification model;
# AdamW comes from torch.optim (transformers.AdamW is deprecated in recent versions)
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    # Use non-overlapping date ranges so the validation split is not a copy of the training split
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print('Loading is done!')
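
# Optional sanity check (an addition, not part of the original notebook): use the
# pprint import above to inspect the loaded splits and the raw decision label of
# the first training example.
pprint(dataset_dict)
pprint(dataset_dict['train'][0]['decision'])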

# Label-to-index mapping for the decision status field
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}

# Helper function: map the decision string to its integer label
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

# Re-labeling/mapping.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)

# Focus on the abstract section and tokenize it
_SECTION_ = 'abstract'

# Training set
train_set = train_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)

# Validation set
val_set = val_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)

# Set the format
train_set.set_format(type='torch', 
    columns=['input_ids', 'attention_mask', 'decision'])

val_set.set_format(type='torch', 
    columns=['input_ids', 'attention_mask', 'decision'])

#print(train_set['decision'])

# DataLoaders for the training and validation sets (shuffle the training data each epoch)
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=16)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device} (CUDA available: {torch.cuda.is_available()})')

# The decision field has six classes, so the classification head must be sized to match
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(decision_to_str))
model.to(device)

model.train()
optim = AdamW(model.parameters(), lr=5e-5)
num_training_epochs = 2

# Fine-tuning loop
for epoch in range(num_training_epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        print(f'epoch {epoch} - batch loss: {loss.item():.4f}')

model.eval()
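
# --- Added sketch (not part of the original notebook) ---
# The script switches to eval mode but never evaluates; a minimal validation
# pass could look like the following, reporting accuracy over val_dataloader.
correct = 0
total = 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f'Validation accuracy: {correct / total:.4f}')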