from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
# Check whether an MPS-compatible device is available; fall back to CPU otherwise
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using {} device".format(mps_device))
# Load the pre-trained model and tokenizer
model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(mps_device)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Load the dataset for January 2016. This is a small sample of the USPTO dataset.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
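# Optional: inspect the loaded splits before preprocessing. A hedged sketch; the
# exact columns come from the HUPD loading script, and only 'abstract' and
# 'decision' are used below.
for split_name, split in dataset_dict.items():
    print("{}: {:,} rows, columns: {}".format(split_name, split.num_rows, split.column_names))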
label_values = ['REJECTED', 'ACCEPTED']
df = pd.DataFrame({
    'abstract': dataset_dict['train']['abstract'],
    'decision': dataset_dict['train']['decision']
})
# Filter out abstracts where decision is not in label_values,
# then encode ACCEPTED as 1 and REJECTED as 0
df = df[df['decision'].isin(label_values)]
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
train_texts, train_labels = df['abstract'].tolist(), df['decision'].tolist()
# Do the same for the validation dataset
df = pd.DataFrame({
    'abstract': dataset_dict['validation']['abstract'],
    'decision': dataset_dict['validation']['decision']
})
# Filter out abstracts where decision is not in label_values
df = df[df['decision'].isin(label_values)]
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
validation_texts, validation_labels = df['abstract'].tolist(), df['decision'].tolist()
print("Number of training samples: {:,}\nNumber of validation samples: {:,}".format(len(train_texts), len(validation_texts)))
# Create a PyTorch Dataset wrapping the tokenized encodings and labels
class USPTODataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Preprocess the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=512)
train_dataset = USPTODataset(train_encodings, train_labels)
val_dataset = USPTODataset(val_encodings, validation_labels)
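# Optional sanity check: each dataset item should hold 'input_ids' and
# 'attention_mask' tensors plus a scalar 'labels' tensor.
sample = train_dataset[0]
print({key: tuple(tensor.shape) for key, tensor in sample.items()})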
# Set up the Trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    learning_rate=5e-05,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    use_mps_device=True,             # train on MPS (newer transformers releases pick MPS automatically and deprecate this flag)
)
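# Optional extension (an assumption, not part of the original script): a simple
# accuracy metric that could be wired into the Trainer below by passing
# compute_metrics=compute_metrics.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}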
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)
# Train the model and save the fine-tuned weights
trainer.train()
trainer.model.save_pretrained('./model')
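# Inference sketch (an assumption about intended usage, not part of the original
# script): reload the fine-tuned weights and classify a new abstract. The sample
# text is a made-up placeholder.
tokenizer.save_pretrained('./model')  # save the tokenizer alongside the weights
loaded_model = DistilBertForSequenceClassification.from_pretrained('./model').to(mps_device)
loaded_model.eval()
inputs = tokenizer(
    "A method for reducing battery consumption in mobile devices.",
    return_tensors='pt', truncation=True, max_length=512,
).to(mps_device)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
print("Predicted decision:", label_values[int(logits.argmax(dim=-1))])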