text embedding

#15
by Ausen - opened

I want to get the text embedding so I can do some analysis. How can I get it?

Hello,

You can define a custom pipeline like the one below:

import torch

from transformers import Pipeline

class FinBERTPipeline(Pipeline):
    """Classification pipeline that also returns a sentence embedding.

    For one input sentence the result is a dict holding the predicted
    label, its softmax score, and the mean of the last hidden layer's
    token vectors as a NumPy array.
    """

    def _sanitize_parameters(self, **kwargs):
        """Route keyword arguments to the individual pipeline stages.

        Returns:
            A 3-tuple of dicts holding the keyword arguments for
            ``preprocess``, ``_forward`` and ``postprocess``, in that order.
        """
        preprocess_kwargs = {}
        # Forward only what ``preprocess`` actually declares. Routing an
        # undeclared keyword (previously ``text``) made ``preprocess`` raise
        # TypeError as soon as a caller supplied it.
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, sentence, maybe_arg=2):
        """Tokenize ``sentence`` into PyTorch tensors for the model.

        Args:
            sentence: The text to tokenize.
            maybe_arg: Unused placeholder, kept so existing callers that
                pass it keep working.
        """
        return self.tokenizer(sentence, return_tensors="pt")

    def _forward(self, inputs):
        """Run the model, asking it to return every layer's hidden states."""
        return self.model(**inputs, output_hidden_states=True)

    def postprocess(self, outputs):
        """Turn raw model outputs into label, score and sentence embedding.

        Returns:
            A dict with the keys ``'label'``, ``'score'`` and ``'embedding'``.
        """
        # Last layer, first item of the batch, averaged over dim 0.
        # NOTE(review): assumes one unpadded sequence per call, so dim 0 is
        # the token axis and no padding vectors enter the mean - confirm
        # before enabling batching.
        last_layer = outputs.hidden_states[-1][0]
        sentence_embedding = torch.mean(last_layer, dim=0).numpy()

        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        prediction_max_index = int(torch.argmax(predictions))
        label = self.model.config.id2label[prediction_max_index]
        return {
            'label': label,
            'score': predictions[0][prediction_max_index].item(),
            'embedding': sentence_embedding,
        }

This pipeline gives you a sentence embedding (the mean of the last layer's token vectors), but you can adapt it to return per-token (word) embeddings instead.

You can use the pipeline as shown below:

from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification

# Make the custom pipeline class available under its own task name so that
# `pipeline(...)` can build it.
PIPELINE_REGISTRY.register_pipeline(
    'finbert-pipeline-with-sentence-embedding',
    pipeline_class=FinBERTPipeline,
    pt_model=AutoModelForSequenceClassification,
)

# Use the first CUDA GPU when one is present; otherwise fall back to the CPU
# (-1) so the example also runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

pipe = pipeline(
    'finbert-pipeline-with-sentence-embedding',
    model='ProsusAI/finbert',
    device=device,
)
outputs = pipe('EXAMPLE SENTENCE')

# `outputs` is the dict built by FinBERTPipeline.postprocess.
print(outputs['label'], outputs['score'], outputs['embedding'].shape)

Thank you.

Sign up or log in to comment