In [2]:
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers tokenizers, DistilBERT classifier, Trainer utilities and optimizer
from transformers import AutoTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertTokenizerFast, Trainer, TrainingArguments, AdamW
from torch.utils.data import DataLoader
import torch

In [3]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Options forwarded to the HUPD loader: the 'sample' configuration, the metadata
# file location, and the filing-date windows of the train and validation splits.
_hupd_options = {
    'name': 'sample',
    'data_files': "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    'icpr_label': None,
    'train_filing_start_date': '2016-01-01',
    'train_filing_end_date': '2016-01-21',
    'val_filing_start_date': '2016-01-22',
    'val_filing_end_date': '2016-01-31',
}
dataset_dict = load_dataset('HUPD/hupd', **_hupd_options)

print('Loading is done!')

Found cached dataset hupd (C:/Users/calia/.cache/huggingface/datasets/HUPD___hupd/sample-5094df4de61ed3bc/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading is done!


In [3]:
# Decision status -> integer class id (despite the name, the values are ints,
# not strings).
# NOTE(review): only 'ACCEPTED' maps to 1; every other status, including
# 'CONT-ACCEPTED' and the pending ones, is folded into class 0 -- confirm this
# binary collapse is intended.
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 0,
    'CONT-REJECTED': 0,
    'CONT-ACCEPTED': 0,
    'CONT-PENDING': 0,
}


def map_decision_to_string(example):
    """Return a one-key update replacing an example's decision with its class id.

    Args:
        example: mapping with a 'decision' entry holding one of the keys of
            `decision_to_str`.

    Returns:
        dict: ``{'decision': <int class id>}``.

    Raises:
        KeyError: if the decision value is not a key of `decision_to_str`.
    """
    class_id = decision_to_str[example['decision']]
    return dict(decision=class_id)

# Replace the decision strings with integer class ids on both splits.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)

# Text field that is fed to the tokenizer.
_SECTION_ = 'abstract'


def _tokenize_section(batch):
    """Tokenize the `_SECTION_` field of a batch with truncation and max-length padding."""
    return tokenizer((batch[_SECTION_]), truncation=True, padding='max_length')


# Tokenize in batches; the tokenizer output is added as new columns on each split.
train_set = train_set.map(_tokenize_section, batched=True)
val_set = val_set.map(_tokenize_section, batched=True)

Loading cached processed dataset at C:\Users\calia\.cache\huggingface\datasets\HUPD___hupd\sample-5094df4de61ed3bc\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-9f7788eb9924fd62.arrow
Loading cached processed dataset at C:\Users\calia\.cache\huggingface\datasets\HUPD___hupd\sample-5094df4de61ed3bc\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-6c3687322fe5b556.arrow
Loading cached processed dataset at C:\Users\calia\.cache\huggingface\datasets\HUPD___hupd\sample-5094df4de61ed3bc\0.0.0\6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142\cache-bd3b1eee4495f3ce.arrow


Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [4]:
# Inspect the training split after tokenization: column names and row count.
train_set

Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id', 'input_ids', 'attention_mask'],
    num_rows: 16153
})

In [5]:
# Inspect the validation split after tokenization: column names and row count.
val_set

Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id', 'input_ids', 'attention_mask'],
    num_rows: 9094
})

In [6]:
# Columns to discard from both splits; 'decision', 'input_ids' and
# 'attention_mask' are the ones left in place.
_unused_columns = [
    "patent_number", "title", "abstract", "claims", "background", "summary",
    "description", "cpc_label", "ipc_label", "filing_date",
    "patent_issue_date", "date_published", "examiner_id",
]

# Drop the unused columns, then rename the class-id column to "labels".
train_set = train_set.remove_columns(_unused_columns).rename_column("decision", "labels")
val_set = val_set.remove_columns(_unused_columns).rename_column("decision", "labels")

In [7]:
# Check which columns remain in the training split after the drop/rename step.
train_set

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 16153
})

In [8]:
# Check which columns remain in the validation split after the drop/rename step.
val_set

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 9094
})

In [9]:
# Have both splits return torch tensors for the three remaining columns.
# set_format changes the dataset object in place, so nothing is reassigned.
_torch_columns = ['labels', 'input_ids', 'attention_mask']
for _split in (train_set, val_set):
    _split.set_format(type='torch', columns=_torch_columns)

In [10]:
# Batched loaders over the torch-formatted splits, used by the manual PyTorch
# training loop (the Trainer builds its own loaders from the datasets).
# The training loader reshuffles on every epoch so the model does not see the
# rows in the same stored order each time; the validation loader keeps the
# dataset order.
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=16)

In [11]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained DistilBERT weights into a sequence-classification model
# and move it onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

# Report the chosen device and whether CUDA is available.
print(device)
print("torch cuda is avail: ", torch.cuda.is_available(), sep='\n')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

cuda
torch cuda is avail: 
True


HuggingFace Trainer

In [12]:
# Hyperparameters and output locations for the Trainer run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results/',
    logging_dir='./logs/',
    # Report the training loss every 10 optimization steps.
    logging_steps=10,
    # Schedule: two passes over the training split, 16 examples per device batch.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

# Bind the model and both tokenized splits to the training configuration.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_set,
    train_dataset=train_set,
)

In [13]:
# Fine-tune the model with the Trainer configured above; the returned
# TrainOutput (global step, training loss, runtime metrics) is the cell result.
trainer.train()

***** Running training *****
  Num examples = 16153
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2020
  Number of trainable parameters = 66955010


Step,Training Loss
10,0.692
20,0.6851
30,0.684
40,0.6851
50,0.6784
60,0.6873
70,0.6819
80,0.6911
90,0.6832
100,0.6941


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500\config.json
Model weights saved in ./results/checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000\config.json
Model weights saved in ./results/checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500\config.json
Model weights saved in ./results/checkpoint-1500\pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000\config.json
Model weights saved in ./results/checkpoint-2000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2020, training_loss=0.6342116433795136, metrics={'train_runtime': 708.5025, 'train_samples_per_second': 45.598, 'train_steps_per_second': 2.851, 'total_flos': 4279491780980736.0, 'train_loss': 0.6342116433795136, 'epoch': 2.0})

PyTorch Training Loop

In [None]:
# Disabled alternative to the HuggingFace Trainer run: a manual PyTorch loop
# over `train_dataloader`, kept commented out.
# NOTE(review): it operates on the same `model` object that is passed to the
# Trainer, so enabling it after trainer.train() would continue training the
# already fine-tuned weights -- confirm that is intended before uncommenting.
# model.train()
# optim = AdamW(model.parameters(), lr=5e-5)
# num_training_epochs = 2

# for epoch in range(num_training_epochs):
#   print("starting epoch: " + str(epoch))
#   for batch in train_dataloader:
#     optim.zero_grad()
#     input_ids = batch['input_ids'].to(device)
#     attention_mask = batch['attention_mask'].to(device)
#     labels = batch['labels'].to(device)
#     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     loss = outputs[0]
#     loss.backward()
#     optim.step()
# model.eval()

In [5]:
# Show the abstract of the first training example. The row is indexed first so
# only that one example is read, instead of loading the whole 'abstract' column
# into memory just to pick one element.
dataset_dict['train'][0]['abstract']

'The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'

In [6]:
# Show the abstract of the second training example. The row is indexed first so
# only that one example is read, instead of loading the whole 'abstract' column
# into memory just to pick one element.
dataset_dict['train'][1]['abstract']

'Embodiments of the invention provide a method of reading and verifying a tag based on inherent disorder during a manufacturing process. The method includes using a first reader to take a first reading of an inherent disorder feature of the tag, and using a second reader to take a second reading of the inherent disorder feature of the tag. The method further includes matching the first reading with the second reading, and determining one or more acceptance criteria, wherein at least one of the acceptance criteria is based on whether the first reading and the second reading match within a predetermined threshold. If the acceptance criteria are met, then the tag is accepted, and a fingerprint for the tag is recorded. The invention further provides a method of testing and characterizing a reader of inherent disorder tags during a manufacturing process. The method includes taking a reading of a known inherent disorder tag, using the reading to measure a characteristic of the reader, and st