# -*- coding: utf-8 -*-
"""Finetuning Language Models - Can I Patent This?.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1x9XfLKvGNBsajOK8rztsZnCoD2ucGfO6

# Finetuning Language Models - Can I Patent This?

Using the [Harvard USPTO patent dataset](https://github.com/suzgunmirac/hupd),
we will fine-tune a DistilBERT model obtained from Hugging Face to predict
whether a patent application is accepted or rejected based on its abstract
and claims.
"""

import gc
import argparse

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_from_disk
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# Global variables
file_path = '/app/models/content/'
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}
criterion = torch.nn.CrossEntropyLoss()
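"""This script assumes a `DatasetDict` with 'train' and 'validation' splits has
already been saved to `file_path + 'dataset_dict'`, with the 'decision' column
converted to integer labels. The sketch below is an assumption, not part of the
original notebook: one plausible preparation step that keeps only
REJECTED/ACCEPTED applications (so the labels match the binary classifier
trained here) and maps the decision strings through `decision_to_str`.
"""

def prepare_dataset(raw_dataset_dict):
    # Keep only applications with a final REJECTED/ACCEPTED decision so the
    # labels match the two-class model trained below.
    binary = raw_dataset_dict.filter(
        lambda e: e['decision'] in ('REJECTED', 'ACCEPTED'))
    # Map the decision strings to integer class ids (REJECTED=0, ACCEPTED=1).
    binary = binary.map(lambda e: {'decision': decision_to_str[e['decision']]})
    binary.save_to_disk(file_path + 'dataset_dict')

# Example usage (assuming `raw` is a DatasetDict loaded from HUPD):
# prepare_dataset(raw)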
def create_dataloaders(dataset_dict, section):
    # Initialize the tokenizer.
    model_name = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)

    train_set, val_set = dataset_dict['train'], dataset_dict['validation']

    # Tokenize the training set.
    train_set = train_set.map(
        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
        batched=True)

    # Tokenize the validation set.
    val_set = val_set.map(
        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
        batched=True)

    train_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])
    val_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])

    train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=8, shuffle=False)

    return train_loader, val_loader, tokenizer


def measure_accuracy(outputs, labels):
    # Given a model's logits and the actual decisions, return the number of
    # correct predictions and the total number of samples.
    preds = np.argmax(outputs, axis=1).flatten()
    labels = labels.flatten()
    correct = np.sum(preds == labels)
    return correct, len(labels)


def validation(model, val_loader):
    # Given a model and a validation set DataLoader, return the model's
    # accuracy on the validation set as a percentage.
    model.eval()
    total_correct = 0
    total_samples = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        num_correct, num_samples = measure_accuracy(logits.cpu().numpy(), labels.cpu().numpy())
        total_correct += num_correct
        total_samples += num_samples
        del input_ids, attention_mask, labels, logits
        gc.collect()
        torch.cuda.empty_cache()
    return (total_correct / total_samples) * 100


def train(device, model, tokenizer, train_loader, val_loader, section):
    # Given a model, the training and validation set DataLoaders, and the
    # patent section being used, fine-tune the model and return it. The
    # checkpoint with the best validation accuracy is saved to disk.
    model.train()

    # Define the optimizer.
    optim = AdamW(model.parameters(), lr=5e-5)

    num_epochs = 5
    best_val_acc = 0

    for epoch in range(num_epochs):
        for batch in train_loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['decision'].to(device, non_blocking=True)
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)
            loss.backward()
            optim.step()
            del input_ids, attention_mask, labels
            gc.collect()
            torch.cuda.empty_cache()

        # Compute the validation accuracy after each epoch.
        val_acc = validation(model, val_loader)

        # Save the model that yields the best validation accuracy so far.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            model.save_pretrained(file_path + section + '/')
            tokenizer.save_pretrained(file_path + section + '_model_tokenizer/')
        model.train()

    return model


if __name__ == '__main__':
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    parser = argparse.ArgumentParser()
    parser.add_argument('--section', type=str, required=True,
                        help="patent section to train on, e.g. 'abstract' or 'claims'")
    args = parser.parse_args()
    section = args.section

    dataset_dict = load_from_disk(file_path + 'dataset_dict')
    train_loader, val_loader, tokenizer = create_dataloaders(dataset_dict, section)
    del dataset_dict
    gc.collect()
    torch.cuda.empty_cache()

    # Define the model.
    config = DistilBertConfig(num_labels=2, output_hidden_states=False)
    model = DistilBertForSequenceClassification(config=config)
    model.to(device)

    # Train the model.
    model = train(device, model, tokenizer, train_loader, val_loader, section)

    val_acc = validation(model, val_loader)
    print(f'*** Accuracy on the validation set ({section}): {val_acc:.2f}')
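"""## Inference sketch

A minimal sketch, not part of the original notebook, showing how the saved
checkpoint could be loaded to classify new text. The default section name
'abstract' and the label order (0 = REJECTED, 1 = ACCEPTED, following
`decision_to_str` above) are assumptions; adjust them to match your run.
"""

def predict_decision(text, section='abstract'):
    # Load the fine-tuned model and tokenizer saved by train().
    model = DistilBertForSequenceClassification.from_pretrained(file_path + section + '/')
    tokenizer = DistilBertTokenizer.from_pretrained(file_path + section + '_model_tokenizer/')
    model.eval()
    # Tokenize a single example with the same settings used during training.
    inputs = tokenizer(text, truncation=True, padding='max_length', return_tensors='pt')
    with torch.no_grad():
        logits = model(input_ids=inputs['input_ids'],
                       attention_mask=inputs['attention_mask']).logits
    # Index 1 corresponds to ACCEPTED in decision_to_str.
    return 'ACCEPTED' if logits.argmax(dim=-1).item() == 1 else 'REJECTED'

# Example usage (after running this script with --section abstract):
# print(predict_decision('A method for reducing latency in distributed systems...'))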