In [None]:
!pip install datasets
!pip install transformers



In [None]:
pip install transformers[torch]



In [155]:
pip install accelerate



In [156]:
pip install huggingface_hub



## Steps

1. prepare dataset
2. load pretrained Tokenizer, call it with dataset -> encoding
3. build PyTorch Dataset with encodings
4. Load pretrained Model
5. a. Load Trainer and train it
   b. or use naive Pytorch training pipeline

## Pretrained model for patentability from Hugging face

In [None]:
model_name =  'distilbert-base-uncased' #'AI-Growth-Lab/PatentSBERTa'

## Import libraries and pretrained model

In [157]:
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers Autokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader

from transformers import pipeline, Trainer, TrainingArguments

import numpy as np
import torch
import torch.nn.functional as F

from transformers import logging

logging.set_verbosity_warning()

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Download Dataset (USPTO)

use the `load_dataset` function to load all the patent applications that were filed to the USPTO in January 2016. We specify the date ranges of the training and validation sets as January 1-21, 2016 and January 22-31, 2016, respectively.

In [None]:
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print('Loading is done!')



  0%|          | 0/2 [00:00<?, ?it/s]

Loading is done!


In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 16153
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 9094
    })
})

In [None]:
train_dict = dataset_dict['train']
print(len(train_dict))
type(train_dict)

16153


datasets.arrow_dataset.Dataset

In [None]:
validation_dict = dataset_dict['validation']
print(len(validation_dict))

9094


In [None]:
train_dict[:1]

{'patent_number': ['13261748'],
 'decision': ['ACCEPTED'],
 'title': ['MINI-OPTICAL NETWORK TERMINAL (ONT)'],
 'abstract': ['The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'],
 'claims': ['1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled

In [None]:
train_dict[0]['claims']

'1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network thereby reducing the unnecessary splitting of equal upstream wavelengths to all the network clients in the network. 2. The optical network terminal of claim 1, wherein the first interface includes an optical module that receives optical signals via the optical fiber link and converts the optical signals to electrical signals. 3. The optical network terminal of claim 2, wherein the optical module is selectively configurable to support two or more of a broadband passive optical 

In [None]:
train_dict[0]['abstract']

'The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'

In [None]:
# Print info about the sizes of the train and validation sets
print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')

Train dataset size: (16153, 14)
Validation dataset size: (9094, 14)


## Pre-Processing the data

the label-to-index mapping for the decision status field by assigning the decision status labels to the class indices.

Since we want a patentability score between 0 to 1, I will assign the labels to either 0 or 1.

Pending is 1 for this project.

In [None]:
# Label-to-index mapping for the decision status field
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 1, 'CONT-REJECTED': 0, 'CONT-ACCEPTED': 1, 'CONT-PENDING': 1}

# Helper function
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}

re-label the decision status fields of the examples in the training and validation sets

In [None]:
# Re-labeling/mapping.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
# testing
train_set[:1]

{'patent_number': ['13261748'],
 'decision': [1],
 'title': ['MINI-OPTICAL NETWORK TERMINAL (ONT)'],
 'abstract': ['The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'],
 'claims': ['1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled to the f

the abstract section of the patent applications

In [None]:
# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION_ = 'abstract'

In [None]:
# Training set
train_set = train_set.map(
    lambda e: tokenizer((e[_SECTION_]), truncation=True, padding='max_length'),
    batched=True)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

In [None]:
# Validation set
val_set = val_set.map(
    lambda e: tokenizer((e[_SECTION_]), truncation=True, padding='max_length'),
    batched=True)

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
train_set[:1]

{'patent_number': ['13261748'],
 'decision': [1],
 'title': ['MINI-OPTICAL NETWORK TERMINAL (ONT)'],
 'abstract': ['The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'],
 'claims': ['1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled to the f

the claims section of the patent applications

In [None]:
# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION1_ = 'claims'

In [None]:
# Training set
train_set = train_set.map(
    lambda e: tokenizer((e[_SECTION1_]), truncation=True, padding='max_length'),
    batched=True)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

In [None]:
# Validation set
val_set = val_set.map(
    lambda e: tokenizer((e[_SECTION1_]), truncation=True, padding='max_length'),
    batched=True)

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
train_set[:1]

{'patent_number': ['13261748'],
 'decision': [1],
 'title': ['MINI-OPTICAL NETWORK TERMINAL (ONT)'],
 'abstract': ['The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'],
 'claims': ['1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled to the f

In [None]:
# Set the format
train_set.set_format(type='torch',
    columns=['input_ids', 'attention_mask', 'decision'])

val_set.set_format(type='torch',
    columns=['input_ids', 'attention_mask', 'decision'])

## Dataloader to create the training set and validation set loaders

In [None]:
# train_dataloader and val_data_loader
train_dataloader = DataLoader(train_set, batch_size=16)
val_dataloader = DataLoader(val_set, batch_size=16)

In [None]:
# Get the next batch
batch = next(iter(train_dataloader))
# Print the ids
pprint(batch['input_ids'])
# Print the labels
pprint(batch['decision'])

tensor([[  101,  1015,  1012,  ...,     0,     0,     0],
        [  101,  1015,  1012,  ...,  1996,  3653,   102],
        [  101,  1015,  1012,  ..., 16726,  1996,   102],
        ...,
        [  101,  1015,  1012,  ...,  1012,  1996,   102],
        [  101,  1015,  1012,  ...,  2034, 28688,   102],
        [  101,  1015,  1012,  ...,  2000,  4366,   102]])
tensor([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1])


In [None]:
# Print the input and output shapes
input_shape = batch['input_ids'].shape
output_shape = batch['decision'].shape
print(f'Input shape: {input_shape}')
print(f'Output shape: {output_shape}')

Input shape: torch.Size([16, 512])
Output shape: torch.Size([16])


In [None]:
# A helper function that converts ids into tokens
def convert_ids_to_string(tokenizer, input):
    return ' '.join(tokenizer.convert_ids_to_tokens(input))

print an example in the batch

In [None]:
# Print the example
pprint(convert_ids_to_string(tokenizer,batch['input_ids'][1]))

('[CLS] 1 . a method comprising : using a first reader to take a first reading '
 'of an inherent disorder feature of a tag ; using at least a second reader to '
 'take at least a second reading of the inherent disorder feature of the tag ; '
 'matching the first reading with at least the second reading ; determining '
 'one or more acceptance criteria , wherein at least one of the acceptance '
 'criteria is based on whether the first reading and the second reading match '
 'within a pre ##de ##ter ##mined threshold ; accepting the tag if the '
 'acceptance criteria are met ; and recording a finger ##print for the tag if '
 'the tag was accepted . 2 . the method of claim 1 , wherein determining one '
 'or more acceptance criteria further comprises : determining an acceptance '
 'criterion based on an individual reading . 3 . the method of claim 2 , '
 'wherein determining an acceptance criterion based on an individual reading '
 'comprises determining an acceptance criterion based on a

## Tune the Model

In [140]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [141]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Testing

In [142]:
# Get the next batch
batch = next(iter(val_dataloader))
# Print the ids
pprint(batch['input_ids'])
# Print the labels
pprint(batch['decision'])

tensor([[  101,  1015,  1012,  ..., 16503,  2063,   102],
        [  101,  1015,  1012,  ...,  3341,  2012,   102],
        [  101,  1015,  1011,  ...,  1012,  2861,   102],
        ...,
        [  101,  1015,  1011,  ...,  3012,  2978,   102],
        [  101,  1015,  1011,  ...,  2689,  5418,   102],
        [  101,  1015,  1012,  ..., 27983,  2638,   102]])
tensor([0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])


In [143]:
print(batch['input_ids'])

tensor([[  101,  1015,  1012,  ..., 16503,  2063,   102],
        [  101,  1015,  1012,  ...,  3341,  2012,   102],
        [  101,  1015,  1011,  ...,  1012,  2861,   102],
        ...,
        [  101,  1015,  1011,  ...,  3012,  2978,   102],
        [  101,  1015,  1011,  ...,  2689,  5418,   102],
        [  101,  1015,  1012,  ..., 27983,  2638,   102]])


In [144]:
batch_size = 16
model_cpu = model.cpu()
with torch.no_grad():
  outputs = model_cpu(batch['input_ids']).logits
  print(outputs)
  predictions = F.softmax(outputs, dim = 1)
  print(predictions)
  labels = torch.argmax(predictions, dim = 1)
  print(labels)
  print("--------")
  print(batch['decision'])
  print("--------")
  res = labels == batch['decision']
  print(res)
  print(res.sum() / batch_size)

tensor([[-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5924,  1.0367],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5924,  1.0367],
        [-1.5924,  1.0367],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5924,  1.0367]])
tensor([[0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327]])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
--------
tensor([0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])
--------
te

## Validation

In [145]:
accuracy = []
for batch in val_dataloader:
  with torch.no_grad():
    outputs = model_cpu(batch['input_ids']).logits
    predictions_batch = F.softmax(outputs, dim = 1)
    labels = torch.argmax(predictions_batch, dim = 1)
    acc = (labels == batch['decision']).numpy().sum() / batch_size
    print(f"batch_average_accuray: {acc}")
    accuracy.append(acc)

batch_average_accuray: 0.75
batch_average_accuray: 0.75
batch_average_accuray: 1.0
batch_average_accuray: 0.75
batch_average_accuray: 0.9375
batch_average_accuray: 0.9375
batch_average_accuray: 1.0
batch_average_accuray: 0.9375
batch_average_accuray: 1.0
batch_average_accuray: 0.875
batch_average_accuray: 0.9375
batch_average_accuray: 0.75
batch_average_accuray: 0.9375
batch_average_accuray: 0.9375
batch_average_accuray: 0.9375
batch_average_accuray: 0.9375
batch_average_accuray: 1.0
batch_average_accuray: 0.9375
batch_average_accuray: 0.75
batch_average_accuray: 0.9375
batch_average_accuray: 0.8125
batch_average_accuray: 0.9375
batch_average_accuray: 0.875
batch_average_accuray: 0.9375
batch_average_accuray: 0.8125
batch_average_accuray: 0.875
batch_average_accuray: 0.875
batch_average_accuray: 0.75
batch_average_accuray: 0.875
batch_average_accuray: 0.6875
batch_average_accuray: 0.6875
batch_average_accuray: 0.9375
batch_average_accuray: 0.875
batch_average_accuray: 0.8125
batch_aver

In [146]:
print(f"average accuracy: {np.mean(accuracy)}")

average accuracy: 0.8896089630931459


## Save the tuned model in "saved" directory

In [147]:
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model_cpu.save_pretrained(save_directory)

In [148]:
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model_saved = AutoModelForSequenceClassification.from_pretrained(save_directory)

## Testing the saved model

In [149]:
with torch.no_grad():
  outputs = model_saved(batch['input_ids']).logits
  print(outputs)
  predictions = F.softmax(outputs, dim = 1)
  print(predictions)
  labels = torch.argmax(predictions, dim = 1)
  print(labels)
  print("--------")
  print(batch['decision'])
  print("--------")
  res = labels == batch['decision']
  print(res)
  print(res.sum() / batch_size)

tensor([[-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5925,  1.0366],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368],
        [-1.5923,  1.0368]])
tensor([[0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327],
        [0.0673, 0.9327]])
tensor([1, 1, 1, 1, 1, 1])
--------
tensor([1, 1, 0, 1, 1, 1])
--------
tensor([ True,  True, False,  True,  True,  True])
tensor(0.3125)


## Share the tuned model to Hugging Face

In [160]:
model_cpu.push_to_hub("ayethuzar/tuned-for-patentability")


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayethuzar/tuned-for-patentability/commit/5c336ab0841277420d783d479d6200536740747b', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='5c336ab0841277420d783d479d6200536740747b', pr_url=None, pr_revision=None, pr_num=None)

In [161]:
tokenizer.push_to_hub("ayethuzar/tuned-for-patentability")

CommitInfo(commit_url='https://huggingface.co/ayethuzar/tuned-for-patentability/commit/c28d9a613ca5d0f862983c9346342b8091811ad7', commit_message='Upload tokenizer', commit_description='', oid='c28d9a613ca5d0f862983c9346342b8091811ad7', pr_url=None, pr_revision=None, pr_num=None)

References:

1. https://colab.research.google.com/drive/1_ZsI7WFTsEO0iu_0g3BLTkIkOUqPzCET?usp=sharing#scrollTo=B5wxZNhXdUK6

2. https://huggingface.co/AI-Growth-Lab/PatentSBERTa

3. https://huggingface.co/anferico/bert-for-patents

4. https://huggingface.co/transformers/v3.2.0/custom_datasets.html