Spaces:

ppaihack
/

CipherClause

Sleeping

App Files Files Community

CipherClause / transformer_vectorizer.py

WenqingZhang

Upload 18 files

f04dd6a verified about 1 month ago

raw

history blame

2.38 kB

	# Let's import a few requirements
	import torch
	from transformers import AutoModelForSequenceClassification, AutoTokenizer
	import numpy

	class TransformerVectorizer:
	    """Turn raw texts into fixed-size vectors using a pre-trained transformer.

	    Each text is tokenized, passed through the model, and represented by the
	    average of the last hidden layer over its (non-padding) tokens.
	    """

	    # Checkpoint used when the caller does not pick one explicitly.
	    DEFAULT_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

	    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
	        """Load the tokenizer and model for ``model_name``.

	        Args:
	            model_name: Identifier handed to ``from_pretrained`` for both the
	                tokenizer and the model. Defaults to ``DEFAULT_MODEL_NAME``.
	        """
	        # Load the tokenizer (converts text to tokens)
	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

	        # Load the pre-trained model
	        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
	            model_name
	        )
	        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

	    def text_to_tensor(
	        self,
	        texts: list,
	        batch_size: int = 50,
	    ) -> numpy.ndarray:
	        """Transform a list of texts into their learned representation.

	        The result is also stored on ``self.encodings``.

	        Args:
	            texts: Texts to be transformed. They may have different lengths;
	                shorter ones are padded and the padding is ignored when
	                averaging.
	            batch_size: Number of texts sent through the model at once.
	                Lower it on memory-constrained hardware.

	        Returns:
	            One row per input text, holding the mean of the last hidden layer
	            over that text's real tokens. For empty input the array has zero
	            rows and ``config.hidden_size`` columns.

	        Raises:
	            ValueError: If ``batch_size`` is smaller than 1.
	        """
	        if batch_size < 1:
	            raise ValueError("batch_size must be a positive integer")

	        # Nothing to encode: return an empty matrix instead of letting
	        # numpy.concatenate fail on an empty list further down.
	        if len(texts) == 0:
	            hidden_size = self.transformer_model.config.hidden_size
	            self.encodings = numpy.empty((0, hidden_size), dtype=numpy.float32)
	            return self.encodings

	        # Tokenize everything at once. Padding is required so that texts with
	        # different token counts can be stacked into a single tensor.
	        encoded = self.tokenizer(texts, padding=True, return_tensors="pt")

	        # Split ids and masks identically so each batch stays aligned.
	        id_batches = torch.split(encoded["input_ids"], batch_size)
	        mask_batches = torch.split(encoded["attention_mask"], batch_size)

	        # Send the model to the device
	        transformer_model = self.transformer_model.to(self.device)
	        pooled_batches = []

	        # Inference only: skip building the autograd graph.
	        with torch.no_grad():
	            for input_ids, attention_mask in zip(id_batches, mask_batches):
	                attention_mask = attention_mask.to(self.device)
	                outputs = transformer_model(
	                    input_ids=input_ids.to(self.device),
	                    attention_mask=attention_mask,
	                    output_hidden_states=True,
	                    return_dict=True,
	                )
	                # Only keep the last hidden layer state.
	                last_hidden = outputs.hidden_states[-1]

	                # Average over the tokens axis, counting real tokens only so
	                # that padding does not dilute the text-level representation.
	                weights = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
	                token_counts = weights.sum(dim=1).clamp(min=1)
	                pooled = (last_hidden * weights).sum(dim=1) / token_counts
	                pooled_batches.append(pooled.cpu().numpy())

	        self.encodings = numpy.concatenate(pooled_batches, axis=0)
	        return self.encodings

	    def transform(self, texts: list) -> numpy.ndarray:
	        """Encode ``texts`` with the default batch size; see ``text_to_tensor``."""
	        return self.text_to_tensor(texts)