from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd

# Check whether an MPS-compatible device is available at runtime
# (is_available() checks the running system; is_built() only checks build flags)
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using {} device".format(mps_device))

# Load the pre-trained model and tokenizer
model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(mps_device)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Load the dataset for January 2016. This is a small sample of the USPTO dataset.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

label_values = ['REJECTED', 'ACCEPTED']

df = pd.DataFrame({
    'abstract': dataset_dict['train']['abstract'],
    'decision': dataset_dict['train']['decision']
})
# Filter out abstracts whose decision is not in label_values,
# then map decisions to binary labels: ACCEPTED -> 1, REJECTED -> 0
df = df[df['decision'].isin(label_values)]
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
train_texts, train_labels = df['abstract'].tolist(), df['decision'].tolist()

# Do the same for the validation dataset
df = pd.DataFrame({
    'abstract': dataset_dict['validation']['abstract'],
    'decision': dataset_dict['validation']['decision']
})
df = df[df['decision'].isin(label_values)]
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
validation_texts, validation_labels = df['abstract'].tolist(), df['decision'].tolist()

print("Number of training samples: {:,}\nNumber of validation samples: {:,}".format(len(train_texts), len(validation_texts)))

# Create a PyTorch Dataset that wraps the tokenized encodings and labels
class USPTODataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Preprocess the data: tokenize, truncating/padding to DistilBERT's 512-token limit
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=512)

train_dataset = USPTODataset(train_encodings, train_labels)
val_dataset = USPTODataset(val_encodings, validation_labels)

# Set up the Trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    learning_rate=5e-05,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    use_mps_device=True,             # run training on the MPS device
)

trainer = Trainer(
    model=model,                 # the instantiated 🤗 Transformers model to be trained
    args=training_args,          # training arguments, defined above
    train_dataset=train_dataset, # training dataset
    eval_dataset=val_dataset,    # evaluation dataset
)

# Train the model and save the fine-tuned weights
trainer.train()
trainer.model.save_pretrained('./model')
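
# Optional: a minimal sketch of measuring validation accuracy. The Trainer as
# configured above logs only the loss; one common pattern is to pass a
# compute_metrics callback when constructing the Trainer. The function name
# here is illustrative and not part of the original script; it assumes the
# train/val datasets defined above.
import numpy as np

def compute_metrics(eval_pred):
    # eval_pred is an EvalPrediction of (logits, labels)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

# Usage sketch: pass it when building the Trainer, then evaluate explicitly:
#   trainer = Trainer(..., compute_metrics=compute_metrics)
#   metrics = trainer.evaluate()
#   print(metrics)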
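
# A minimal inference sketch, assuming training has finished and the
# fine-tuned weights were saved to './model' as above. The sample abstract is
# a hypothetical placeholder; tokenizer, mps_device, and label_values are
# reused from the script above.
finetuned_model = DistilBertForSequenceClassification.from_pretrained('./model').to(mps_device)
finetuned_model.eval()

sample_abstract = "A system and method for..."  # hypothetical input text
inputs = tokenizer(sample_abstract, truncation=True, max_length=512, return_tensors='pt').to(mps_device)
with torch.no_grad():
    logits = finetuned_model(**inputs).logits
predicted_label = logits.argmax(dim=-1).item()
print("Predicted decision:", label_values[predicted_label])  # 0 = REJECTED, 1 = ACCEPTED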