# -*- coding: utf-8 -*-
"""Finetuning Language Models - Can I Patent This?.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1x9XfLKvGNBsajOK8rztsZnCoD2ucGfO6

# Finetuning Language Models - Can I Patent This?

Using the [Harvard USPTO patent dataset](https://github.com/suzgunmirac/hupd), we fine-tune a pretrained DistilBERT model
from Hugging Face to predict whether a patent application is accepted or rejected based on its abstract or its claims section.
"""
import gc
import argparse
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from datasets import load_from_disk
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer


# Initializing global variables
file_path = '/app/models/content/'
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
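# Note: the DatasetDict loaded from disk below is assumed to already have its
# 'decision' column encoded with this mapping and filtered to the binary
# ACCEPTED/REJECTED labels expected by the two-class model.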
criterion = torch.nn.CrossEntropyLoss()


def create_dataloaders(dataset_dict, section):
  # Initializing the tokenizer
  model_name = 'distilbert-base-uncased'
  tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)
  
  train_set, val_set = dataset_dict['train'], dataset_dict['validation']
  
  # Tokenize the chosen section of the training set
  train_set = train_set.map(
    lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
    batched=True)

  # Tokenize the chosen section of the validation set
  val_set = val_set.map(
    lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
    batched=True)
  
  train_set.set_format(type='torch', 
    columns=['input_ids', 'attention_mask', 'decision'])
  
  val_set.set_format(type='torch', 
    columns=['input_ids', 'attention_mask', 'decision'])
  
  train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
  val_loader = DataLoader(val_set, batch_size=8, shuffle=False)
  
  return train_loader, val_loader, tokenizer


def measure_accuracy(outputs, labels):
  # This function accepts a model's logits and the actual decisions and returns
  # the number of correct predictions and the number of samples in the batch.

  preds = np.argmax(outputs, axis=1).flatten()
  labels = labels.flatten()
  correct = np.sum(preds == labels)

  return correct, len(labels)

def validation(model, val_loader):
  # This function accepts a model and a validation set DataLoader as its parameters
  # and returns the validation accuracy as a percentage.

  model.eval()

  total_correct = 0
  total_samples = 0

  for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['decision'].to(device)

    # No gradients are needed for evaluation; the attention mask excludes padding tokens.
    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    num_correct, num_samples = measure_accuracy(logits.cpu().numpy(), labels.cpu().numpy())

    total_correct += num_correct
    total_samples += num_samples

    del input_ids, attention_mask, labels, logits
    gc.collect()
    torch.cuda.empty_cache()

  return (total_correct / total_samples) * 100


def train(device, model, tokenizer, train_loader, val_loader, section):
  # This function accepts the device, a model, its tokenizer, the training and
  # validation set DataLoaders, and the section name, and returns the fine-tuned model.

  model.train()

  # Define optimizer.
  optim = AdamW(model.parameters(), lr=5e-5)
  num_epochs = 5
  best_val_acc = 0

  for epoch in range(num_epochs):
    for batch in train_loader:
      optim.zero_grad()
      
      input_ids = batch['input_ids'].to(device, non_blocking=True)
      attention_mask = batch['attention_mask'].to(device, non_blocking=True)
      labels = batch['decision'].to(device, non_blocking=True)

      outputs = model(input_ids, attention_mask=attention_mask, labels=labels).logits

      loss = criterion(outputs, labels)
      loss.backward()
      optim.step()
      
      del input_ids, attention_mask, labels
      gc.collect()
      torch.cuda.empty_cache()

    # Calculate the validation accuracy once per epoch.
    val_acc = validation(model, val_loader)

    # Save the model and tokenizer that yield the best validation accuracy.
    if best_val_acc < val_acc:
      best_val_acc = val_acc

      model.save_pretrained(file_path + section + '/')
      tokenizer.save_pretrained(file_path + section + '_model_tokenizer/')

    # validation() switched the model to eval mode; switch back for the next epoch.
    model.train()

  return model


if __name__ == '__main__':
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  
  parser = argparse.ArgumentParser()
  
  parser.add_argument('--section', type=str, required=True,
                      help="Patent text field to fine-tune on, e.g. 'abstract' or 'claims'.")
  
  args = parser.parse_args()
  section = args.section
  
  dataset_dict = load_from_disk(file_path + 'dataset_dict')
  
  train_loader, val_loader, tokenizer = create_dataloaders(dataset_dict, section)
  
  del dataset_dict
  gc.collect()
  torch.cuda.empty_cache()
  
  # Define the model: load pretrained DistilBERT weights with a two-class
  # sequence-classification head (ACCEPTED vs. REJECTED).
  model = DistilBertForSequenceClassification.from_pretrained(
      'distilbert-base-uncased', num_labels=2, output_hidden_states=False)
  model.to(device)
  
  # Train the model.
  model = train(device, model, tokenizer, train_loader, val_loader, section)
  
  val_acc = validation(model, val_loader)
  
  print(f'*** Accuracy on the validation set ({section}): {val_acc:.2f}%')
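
# A minimal inference sketch (kept as a comment so it does not run as part of the
# training script). It assumes the model was fine-tuned with --section abstract, so
# the weights and tokenizer were saved under /app/models/content/abstract/ and
# /app/models/content/abstract_model_tokenizer/ by train() above.
#
#   from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
#   import torch
#
#   model = DistilBertForSequenceClassification.from_pretrained('/app/models/content/abstract/')
#   tokenizer = DistilBertTokenizer.from_pretrained('/app/models/content/abstract_model_tokenizer/')
#   model.eval()
#
#   inputs = tokenizer('Example patent abstract text...', truncation=True,
#                      padding='max_length', return_tensors='pt')
#   with torch.no_grad():
#       logits = model(**inputs).logits
#   # Index 1 corresponds to ACCEPTED in decision_to_str above.
#   print('ACCEPTED' if logits.argmax(dim=-1).item() == 1 else 'REJECTED')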