from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
# Use the Apple MPS backend if it is available, otherwise fall back to the CPU.
# (is_available() checks for a usable device; is_built() only checks that PyTorch
# was compiled with MPS support, which can be True on machines without MPS.)
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using {} device".format(mps_device))
# Load the pre-trained model and tokenizer
model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(mps_device)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
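# Note: DistilBertForSequenceClassification puts a randomly initialized
# classification head on top of the pre-trained encoder (num_labels defaults to 2,
# matching the binary ACCEPTED/REJECTED task below), so the "weights newly
# initialized" warning from transformers is expected here.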
# Load the dataset for January 2016. This is a small sample of the USPTO dataset.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
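# Optional sanity check (not in the original script): inspect the raw decision
# labels before filtering. A minimal sketch, assuming the HUPD 'decision' column
# loads as plain strings:
# from collections import Counter
# print(Counter(dataset_dict['train']['decision']))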
label_values = ['REJECTED', 'ACCEPTED']
df = pd.DataFrame({
    'abstract': dataset_dict['train']['abstract'],
    'decision': dataset_dict['train']['decision']
})
# Keep only applications with a final REJECTED/ACCEPTED decision
# (the raw 'decision' field also contains other statuses, e.g. PENDING)
df = df[df['decision'].isin(label_values)]
# Binarize the label: ACCEPTED -> 1, REJECTED -> 0
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
train_texts, train_labels = df['abstract'].tolist(), df['decision'].tolist()
# Repeat the same preprocessing for the validation split
df = pd.DataFrame({
    'abstract': dataset_dict['validation']['abstract'],
    'decision': dataset_dict['validation']['decision']
})
df = df[df['decision'].isin(label_values)]
df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
validation_texts, validation_labels = df['abstract'].tolist(), df['decision'].tolist()
print("Number of training samples: {:,}\nNumber of validation samples: {:,}".format(len(train_texts), len(validation_texts)))
# Create a PyTorch Dataset
class USPTODataset(Dataset):
    """Wraps tokenized encodings and labels as a map-style PyTorch Dataset."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert each encoding field (input_ids, attention_mask) to a tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Preprocess the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=512)
train_dataset = USPTODataset(train_encodings, train_labels)
val_dataset = USPTODataset(val_encodings, validation_labels)
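# Optional check (not in the original script): pull one encoded example and decode
# it back to text to confirm truncation/padding behaved as expected. A minimal sketch:
# sample = train_dataset[0]
# print(sample['input_ids'].shape, sample['labels'])
# print(tokenizer.decode(sample['input_ids'][:50]))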
# Set up the Trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    learning_rate=5e-05,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    use_mps_device=True,             # use the MPS device (deprecated in recent
                                     # transformers releases, where MPS is used
                                     # automatically when available)
)
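# Optional (not in the original script): a metrics function could be passed to the
# Trainer below via `compute_metrics=` to report validation accuracy when
# trainer.evaluate() is called. A minimal sketch, assuming NumPy is available:
# import numpy as np
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)
#     return {"accuracy": float((preds == labels).mean())}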
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)
# Train the model
trainer.train()
trainer.model.save_pretrained('./model')
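# Example of loading the fine-tuned model for inference (not in the original
# script). The tokenizer was not saved above, so it is reloaded from the base
# checkpoint. A minimal sketch:
# model = DistilBertForSequenceClassification.from_pretrained('./model').to(mps_device)
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# inputs = tokenizer("An abstract to classify.", return_tensors='pt', truncation=True).to(mps_device)
# with torch.no_grad():
#     logits = model(**inputs).logits
# print(label_values[logits.argmax(dim=-1).item()])  # 0 -> REJECTED, 1 -> ACCEPTED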