import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_constant_schedule_with_warmup,
)


class FinanciaSentimental(Dataset):
    """Load the financial sentiment dataframe and tokenize its text."""

    def __init__(self, tokenizer, dataframe, columns, max_len=512):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self._columns = columns
        self.max_len = max_len

    @property
    def columns(self):
        """Return the label columns to target."""
        return self._columns

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the text at `index` and return model-ready tensors."""
        values = self.dataframe.iloc[index]
        text = values['text']
        label = values[self._columns].values.astype(np.float32)

        # `padding='max_length'` replaces the deprecated `pad_to_max_length` flag.
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        label = torch.tensor(label, dtype=torch.float)

        # encode_plus returns tensors shaped (1, max_len); drop the batch dimension.
        # token_type_ids assumes a BERT-style tokenizer that produces them.
        input_ids = inputs["input_ids"].squeeze().to(dtype=torch.long)
        attention_mask = inputs["attention_mask"].squeeze().to(dtype=torch.long)
        token_type_ids = inputs["token_type_ids"].squeeze().to(dtype=torch.long)

        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "labels": label,
        }

        return inputs_dict
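

# Illustrative usage sketch (not part of the original snippet). Assumptions:
# a pandas DataFrame with a 'text' column and one-hot label columns
# ['positive', 'negative', 'neutral'], plus the 'bert-base-uncased' checkpoint,
# which does emit token_type_ids; adapt both to your own data and model.
if __name__ == "__main__":
    import pandas as pd

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    df = pd.DataFrame({
        "text": ["Shares rallied after earnings beat expectations."],
        "positive": [1.0], "negative": [0.0], "neutral": [0.0],
    })
    label_columns = ["positive", "negative", "neutral"]

    dataset = FinanciaSentimental(tokenizer, df, label_columns, max_len=130)
    loader = DataLoader(dataset, batch_size=8, shuffle=True)

    batch = next(iter(loader))
    # input_ids / attention_mask / token_type_ids: (batch_size, max_len)
    # labels: (batch_size, len(label_columns))
    print({k: v.shape for k, v in batch.items()})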