# NOTE(review): the original lines here ("Spaces: / Sleeping / Sleeping") were
# Hugging Face Spaces page residue from a web scrape, not part of the source.
# Third-party requirements for the embedding pipeline.
import numpy
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
class TransformerVectorizer:
    """Turn raw texts into fixed-size embeddings with a pre-trained RoBERTa model.

    The last hidden layer of the transformer is mean-pooled over the token
    axis to produce one vector per input text.
    """

    def __init__(self, batch_size: int = 50):
        """Load the tokenizer and model, and pick the compute device.

        Args:
            batch_size (int): Number of texts processed per forward pass.
                Defaults to 50 (the value previously hard-coded).
        """
        # Load the tokenizer (converts text to tokens)
        self.tokenizer = AutoTokenizer.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        # Load the pre-trained model
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.batch_size = batch_size

    def text_to_tensor(
        self,
        texts: list,
    ) -> numpy.ndarray:
        """Transform a list of texts into their learned representations.

        Args:
            texts (list): List of texts to be transformed.

        Returns:
            numpy.ndarray: One embedding row per input text, shape
                (len(texts), hidden_dim).
        """
        # Tokenize all inputs at once. padding=True is required so texts of
        # different lengths can be stacked into a single tensor — the original
        # call raised an error on any ragged batch.
        encoded = self.tokenizer.batch_encode_plus(
            texts, return_tensors="pt", padding=True
        )
        # Split into batches so large inputs fit on the device.
        id_batches = torch.split(
            encoded["input_ids"], split_size_or_sections=self.batch_size
        )
        mask_batches = torch.split(
            encoded["attention_mask"], split_size_or_sections=self.batch_size
        )
        # Send the model to the device
        transformer_model = self.transformer_model.to(self.device)
        output_hidden_states_list = []
        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            for ids, mask in zip(id_batches, mask_batches):
                ids = ids.to(self.device)
                mask = mask.to(self.device)
                # Output index [1] is the tuple of hidden states
                # (output_hidden_states=True); keep only the last layer.
                hidden = transformer_model(
                    ids, attention_mask=mask, output_hidden_states=True
                )[1][-1]
                # Masked mean over the token axis: padding tokens must not
                # contribute to the text-level representation. For batches of
                # equal-length texts (no padding) this equals the plain mean
                # the original computed.
                mask_f = mask.unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask_f).sum(dim=1) / mask_f.sum(dim=1).clamp(min=1)
                output_hidden_states_list.append(pooled.cpu().numpy())
        self.encodings = numpy.concatenate(output_hidden_states_list, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Sklearn-style alias for :meth:`text_to_tensor`."""
        return self.text_to_tensor(texts)