# -*- coding: utf-8 -*-
"""Finetuning Language Models - Can I Patent This?.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1x9XfLKvGNBsajOK8rztsZnCoD2ucGfO6

# Finetuning Language Models - Can I Patent This?

Using the [Harvard USPTO patent dataset](https://github.com/suzgunmirac/hupd),
we will fine-tune a DistilBERT model obtained from Hugging Face to predict
whether a patent application is accepted or rejected based on its abstract
and claims.
"""

import gc
import argparse

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_from_disk
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# Global variables
file_path = '/app/models/content/'
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}
criterion = torch.nn.CrossEntropyLoss()
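"""This script assumes a `DatasetDict` with 'train' and 'validation' splits has
already been saved to `file_path + 'dataset_dict'`, with the 'decision' column
converted to integer labels. The sketch below is an assumption, not part of the
original notebook: one plausible preparation step that keeps only
REJECTED/ACCEPTED applications (so the labels match the binary classifier
trained here) and maps the decision strings through `decision_to_str`.
"""

def prepare_dataset(raw_dataset_dict):
    # Keep only applications with a final REJECTED/ACCEPTED decision so the
    # labels match the two-class model trained below.
    binary = raw_dataset_dict.filter(
        lambda e: e['decision'] in ('REJECTED', 'ACCEPTED'))
    # Map the decision strings to integer class ids (REJECTED=0, ACCEPTED=1).
    binary = binary.map(lambda e: {'decision': decision_to_str[e['decision']]})
    binary.save_to_disk(file_path + 'dataset_dict')

# Example usage (assuming `raw` is a DatasetDict loaded from HUPD):
# prepare_dataset(raw)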
def create_dataloaders(dataset_dict, section):
    # Initialize the tokenizer.
    model_name = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)

    train_set, val_set = dataset_dict['train'], dataset_dict['validation']

    # Tokenize the training set.
    train_set = train_set.map(
        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
        batched=True)

    # Tokenize the validation set.
    val_set = val_set.map(
        lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
        batched=True)

    train_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])
    val_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])

    train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=8, shuffle=False)

    return train_loader, val_loader, tokenizer


def measure_accuracy(outputs, labels):
    # Given a model's logits and the actual decisions, return the number of
    # correct predictions and the total number of samples.
    preds = np.argmax(outputs, axis=1).flatten()
    labels = labels.flatten()
    correct = np.sum(preds == labels)
    return correct, len(labels)


def validation(model, val_loader):
    # Given a model and a validation set DataLoader, return the model's
    # accuracy on the validation set as a percentage.
    model.eval()
    total_correct = 0
    total_samples = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        num_correct, num_samples = measure_accuracy(logits.cpu().numpy(), labels.cpu().numpy())
        total_correct += num_correct
        total_samples += num_samples
        del input_ids, attention_mask, labels, logits
        gc.collect()
        torch.cuda.empty_cache()
    return (total_correct / total_samples) * 100


def train(device, model, tokenizer, train_loader, val_loader, section):
    # Given a model, the training and validation set DataLoaders, and the
    # patent section being used, fine-tune the model and return it. The
    # checkpoint with the best validation accuracy is saved to disk.
    model.train()

    # Define the optimizer.
    optim = AdamW(model.parameters(), lr=5e-5)

    num_epochs = 5
    best_val_acc = 0

    for epoch in range(num_epochs):
        for batch in train_loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['decision'].to(device, non_blocking=True)
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)
            loss.backward()
            optim.step()
            del input_ids, attention_mask, labels
            gc.collect()
            torch.cuda.empty_cache()

        # Compute the validation accuracy after each epoch.
        val_acc = validation(model, val_loader)

        # Save the model that yields the best validation accuracy so far.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            model.save_pretrained(file_path + section + '/')
            tokenizer.save_pretrained(file_path + section + '_model_tokenizer/')
        model.train()

    return model


if __name__ == '__main__':
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    parser = argparse.ArgumentParser()
    parser.add_argument('--section', type=str, required=True,
                        help="patent section to train on, e.g. 'abstract' or 'claims'")
    args = parser.parse_args()
    section = args.section

    dataset_dict = load_from_disk(file_path + 'dataset_dict')
    train_loader, val_loader, tokenizer = create_dataloaders(dataset_dict, section)
    del dataset_dict
    gc.collect()
    torch.cuda.empty_cache()

    # Define the model.
    config = DistilBertConfig(num_labels=2, output_hidden_states=False)
    model = DistilBertForSequenceClassification(config=config)
    model.to(device)

    # Train the model.
    model = train(device, model, tokenizer, train_loader, val_loader, section)

    val_acc = validation(model, val_loader)
    print(f'*** Accuracy on the validation set ({section}): {val_acc:.2f}')
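"""## Inference sketch

A minimal sketch, not part of the original notebook, showing how the saved
checkpoint could be loaded to classify new text. The default section name
'abstract' and the label order (0 = REJECTED, 1 = ACCEPTED, following
`decision_to_str` above) are assumptions; adjust them to match your run.
"""

def predict_decision(text, section='abstract'):
    # Load the fine-tuned model and tokenizer saved by train().
    model = DistilBertForSequenceClassification.from_pretrained(file_path + section + '/')
    tokenizer = DistilBertTokenizer.from_pretrained(file_path + section + '_model_tokenizer/')
    model.eval()
    # Tokenize a single example with the same settings used during training.
    inputs = tokenizer(text, truncation=True, padding='max_length', return_tensors='pt')
    with torch.no_grad():
        logits = model(input_ids=inputs['input_ids'],
                       attention_mask=inputs['attention_mask']).logits
    # Index 1 corresponds to ACCEPTED in decision_to_str.
    return 'ACCEPTED' if logits.argmax(dim=-1).item() == 1 else 'REJECTED'

# Example usage (after running this script with --section abstract):
# print(predict_decision('A method for reducing latency in distributed systems...'))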