from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
# Use the Apple-silicon MPS backend if it is available at runtime, otherwise fall back to CPU
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using {} device".format(mps_device))
# Load the pre-trained model and tokenizer; from_pretrained defaults to a two-label
# classification head, which matches the binary ACCEPTED/REJECTED task below
model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(mps_device)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
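# A small addition: record the human-readable label mapping on the model config,
# matching the binarization used below, so saved checkpoints carry it
model.config.id2label = {0: 'REJECTED', 1: 'ACCEPTED'}
model.config.label2id = {'REJECTED': 0, 'ACCEPTED': 1}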
# Load the HUPD sample dataset (a small slice of the full USPTO corpus): filings from
# Jan 1-21, 2016 form the training split, and Jan 22-31 the validation split
dataset_dict = load_dataset(
'HUPD/hupd',
name='sample',
data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
icpr_label=None,
train_filing_start_date='2016-01-01',
train_filing_end_date='2016-01-21',
val_filing_start_date='2016-01-22',
val_filing_end_date='2016-01-31',
)
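# Sanity check (an addition): confirm the loader produced the expected
# 'train' and 'validation' splits before preprocessing
print(dataset_dict)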
label_values = ['REJECTED', 'ACCEPTED']

def prepare_split(split):
    # Keep only applications with a final REJECTED/ACCEPTED decision and
    # binarize the label (ACCEPTED -> 1, REJECTED -> 0)
    df = pd.DataFrame({
        'abstract': split['abstract'],
        'decision': split['decision'],
    })
    df = df[df['decision'].isin(label_values)]
    df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
    return df['abstract'].tolist(), df['decision'].tolist()

train_texts, train_labels = prepare_split(dataset_dict['train'])
validation_texts, validation_labels = prepare_split(dataset_dict['validation'])
print("Number of training samples: {:,}\nNumber of validation samples: {:,}".format(len(train_texts), len(validation_texts)))
# Create a PyTorch Dataset
class USPTODataset(Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
# Preprocess the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=512)
train_dataset = USPTODataset(train_encodings, train_labels)
val_dataset = USPTODataset(val_encodings, validation_labels)
# Set up the Trainer
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total # of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
learning_rate=5e-05, # initial learning rate for the default AdamW optimizer
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
use_mps_device=True, # Use MPS device (deprecated in recent transformers releases, where MPS is picked up automatically)
)
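# Optional addition: as written, the Trainer below only reports loss during
# evaluation; a minimal accuracy metric like this sketch could be wired in by
# passing compute_metrics=compute_metrics to the Trainer
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': (predictions == labels).mean()}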
trainer = Trainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset, # evaluation dataset
)
# Train the model
trainer.train()
trainer.model.save_pretrained('./model')
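# Persist the tokenizer too so './model' reloads standalone, then run a quick
# sanity-check prediction; a minimal sketch (not in the original script) that
# reuses the in-memory model on one validation abstract
tokenizer.save_pretrained('./model')
model.eval()
sample_text = validation_texts[0]
inputs = tokenizer(sample_text, truncation=True, max_length=512, return_tensors='pt').to(mps_device)
with torch.no_grad():
    logits = model(**inputs).logits
predicted = logits.argmax(dim=-1).item()
print("Sample prediction: {}".format(label_values[predicted]))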