from typing import Any, Dict, List

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MAX_TOKENS_IN_BATCH = 4_000  # Hard limit to prevent OOMs
DEFAULT_MAX_NEW_TOKENS = 10  # By default limit the output to 10 tokens


class EndpointHandler:
    """
    This class is used to handle the inference with pre and post process for
    text2text models. See
    https://huggingface.co/docs/inference-endpoints/guides/custom_handler for
    more details.
    """

    def __init__(self, path: str = ""):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(path)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(path, device_map="auto")
        except Exception:
            # Loading with device_map="auto" depends on accelerate; log its
            # version before re-raising to ease debugging.
            import accelerate

            print(f"ACCELERATE VERSION: {accelerate.__version__}")
            raise

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        This method is called when the endpoint is called.

        Arguments
        ---------
            data (Dict[str, Any]):
                Must contain the input data under the `inputs` key and any
                parameters for the inference under `parameters`.

        Returns
        -------
            output (List[Dict[str, Any]]):
                A list with one item per input sequence, where each item is a
                dictionary containing `generated_text` (the decoded output),
                `perplexity` and, if requested, `first_token_probs`.
        """
        input_texts = data["inputs"]
        generate_kwargs = data.get("parameters", {})
        # This is not technically a generate_kwarg, but needs to live under parameters
        check_first_tokens = generate_kwargs.pop("check_first_tokens", None)
        max_new_tokens = (
            generate_kwargs.pop("max_new_tokens", None) or DEFAULT_MAX_NEW_TOKENS
        )

        # Tokenizing input texts
        inputs = self.tokenizer(
            input_texts, return_tensors="pt", padding=True, truncation=True,
        )["input_ids"]

        # Make sure not to OOM if too many inputs
        assert inputs.dim() == 2, f"Inputs have dimension {inputs.dim()} != 2"
        total_tokens = inputs.shape[0] * (inputs.shape[1] + max_new_tokens - 1)
        assert (
            total_tokens <= MAX_TOKENS_IN_BATCH
        ), f"Passed {total_tokens} tokens (shape: {inputs.shape}, max_new_tokens: {max_new_tokens}), which exceeds the limit of {MAX_TOKENS_IN_BATCH}"

        # Run inference on the model's device (GPU when available)
        inputs = inputs.to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                output_scores=True,
                return_dict_in_generate=True,
                max_new_tokens=max_new_tokens,
                **generate_kwargs,
            )
        inputs = inputs.to("cpu")
        scores = [s.to("cpu") for s in outputs.scores]
        del outputs

        # process outputs
        to_return: Dict[str, Any] = {
            "generated_text": self._output_text_from_scores(scores),
            "perplexity": [float(p) for p in self._perplexity(scores)],
        }
        if check_first_tokens:
            to_return["first_token_probs"] = self._get_first_token_probs(
                check_first_tokens, scores
            )

        # Reformat output to conform to HF Pipeline format
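        # e.g. {"generated_text": ["a", "b"], "perplexity": [0.9, 0.7]} ->
        #   [{"generated_text": "a", "perplexity": 0.9},
        #    {"generated_text": "b", "perplexity": 0.7}]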
        return [
            {key: to_return[key][ndx] for key in to_return.keys()}
            for ndx in range(len(to_return["generated_text"]))
        ]

    def _output_text_from_scores(self, scores: List[torch.Tensor]) -> List[str]:
        """
        Returns the decoded text from the scores.
        TODO (ENG-20823): Use the returned sequences so we pay attention to
        things like bad_words, force_words etc.
        """
        # For each sequence in the batch, take the argmax token id at each
        # generation step
        batch_token_ids = [
            [int(score[ndx].argmax()) for score in scores]
            for ndx in range(scores[0].shape[0])
        ]
        # Fix for new tokens being generated after EOS
        new_batch_token_ids = []
        for token_ids in batch_token_ids:
            try:
                new_token_ids = token_ids[
                    : token_ids.index(self.tokenizer.eos_token_id)
                ]
            except ValueError:
                # No EOS in the output; keep everything except the final token
                new_token_ids = token_ids[:-1]

            new_batch_token_ids.append(new_token_ids)
        return self.tokenizer.batch_decode(new_batch_token_ids)

    def _perplexity(self, scores: List[torch.Tensor]) -> List[float]:
        """
        Returns a perplexity-style confidence for each outputted sequence:
            e^( sum(ln(p(token))) / N )
        i.e. the geometric mean of the per-step probabilities of the chosen
        (argmax) tokens; higher means the model was more confident.

        TODO (ENG-20823): don't include the trailing pad tokens in perplexity
        """

        return torch.exp(
            torch.stack(
                [score.softmax(dim=1).log().max(dim=1)[0] for score in scores]
            ).sum(dim=0)
            / len(scores)
        ).tolist()

    def _get_first_token_probs(
        self, tokens: List[str], scores: List[torch.Tensor]
    ) -> List[Dict[str, float]]:
        """
        Return the softmaxed probabilities of the specific tokens for each
        output
        """
        first_token_probs = []
        # scores[0] holds the logits for the first generated position
        softmaxed_scores = scores[0].softmax(dim=1)

        # Finding the correct token IDs
        # TODO (ENG-20824): Support multi-token words
        token_ids = {}
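        # For example (hypothetical T5-style ids): encode("Yes") -> [2163, 1]
        # (token id plus trailing EOS), while a word the tokenizer splits into
        # several pieces gets marked with -1 below.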
        for token in tokens:
            encoded_token: List[int] = self.tokenizer.encode(token)
            if len(encoded_token) > 2:
                # The tokenizer broke the token into multiple pieces (the
                # encoding is the piece ids plus a trailing EOS), so it has
                # no single-step probability
                token_ids[token] = -1
            else:
                token_ids[token] = encoded_token[0]

        # Now finding the scores for each token in the list
        for seq_ndx in range(scores[0].shape[0]):
            curr_token_probs: Dict[str, float] = {}

            for token in tokens:
                if token_ids[token] == -1:
                    # Multi-piece token: report zero probability
                    curr_token_probs[token] = 0.0
                else:
                    curr_token_probs[token] = float(
                        softmaxed_scores[seq_ndx, token_ids[token]]
                    )

            first_token_probs.append(curr_token_probs)

        return first_token_probs
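

if __name__ == "__main__":
    # Minimal local smoke test. "google/flan-t5-small" is an illustrative
    # text2text checkpoint, not necessarily the model this handler ships with.
    handler = EndpointHandler("google/flan-t5-small")
    print(
        handler(
            {
                "inputs": ["Is the sky blue?"],
                "parameters": {"check_first_tokens": ["Yes", "No"]},
            }
        )
    )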