kinensake committed
Commit
2ea9ced
1 Parent(s): 75a044a

Modify: requirements.txt

lm_scorer/__init__.py ADDED
(empty file)
lm_scorer/bin/__init__.py ADDED
(empty file)
lm_scorer/bin/cli.py ADDED
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+
+import argparse
+import itertools
+import os
+import sys
+
+import torch
+
+from ..models.auto import AutoLMScorer as LMScorer
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Get sentence probabilities using a language model.",
+    )
+    parser.add_argument(
+        "sentences_file_path",
+        metavar="sentences-file-path",
+        type=str,
+        help="A file containing sentences to score, one per line."
+        " If - is given as filename it reads from stdin instead.",
+    )
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        type=str,
+        default="gpt2",
+        help="The pretrained language model to use. Can be one of: %s."
+        % ", ".join(LMScorer.supported_model_names()),
+    )
+    parser.add_argument(
+        "--tokens",
+        "-t",
+        action="store_true",
+        help="If provided, also print the probability of each token of each sentence.",
+    )
+    parser.add_argument(
+        "--log-prob",
+        "-lp",
+        action="store_true",
+        help="If provided, return log probabilities instead.",
+    )
+    parser.add_argument(
+        "--reduce",
+        "-r",
+        type=str,
+        default="prod",
+        help="Reduce strategy applied on token probabilities to get the sentence score."
+        " Available strategies are: prod, mean, gmean, hmean.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "-b",
+        type=int,
+        default=1,
+        help="Number of sentences to process in parallel.",
+    )
+    parser.add_argument(
+        "--significant-figures",
+        "-sf",
+        type=int,
+        default=5,
+        help="Number of significant figures to use when printing numbers.",
+    )
+    parser.add_argument(
+        "--cuda",
+        type=int,
+        default=-1,
+        help="If provided, run the model on the given CUDA device.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="If provided, print additional logging in case of errors.",
+    )
+    return parser.parse_args()
+
+
+def normalize_args(args: argparse.Namespace) -> None:
+    if args.sentences_file_path != "-":
+        args.sentences_file_path = os.path.realpath(args.sentences_file_path)
+
+
+def validate_args(args: argparse.Namespace) -> None:
+    if args.sentences_file_path != "-":
+        if not os.path.isfile(args.sentences_file_path):
+            raise ValueError("The provided sentences file path is invalid.")
+
+    if args.cuda >= 0 and not torch.cuda.is_available():
+        raise ValueError("No CUDA device found.")
+
+    if args.cuda >= torch.cuda.device_count():
+        device_count = torch.cuda.device_count()
+        raise ValueError("Invalid CUDA device: %d/%d." % (args.cuda, device_count))
+
+    if args.batch_size <= 0:
+        raise ValueError("The batch size must be positive.")
+
+    if args.significant_figures <= 0:
+        raise ValueError("The number of significant figures must be positive.")
+
+
+T1 = TypeVar("T1")  # pylint: disable=invalid-name
+
+
+def grouper(iterable: Iterable[T1], size: int) -> Generator[List[T1], None, None]:
+    it = iter(iterable)  # pylint: disable=invalid-name
+    while True:
+        chunk = list(itertools.islice(it, size))
+        if not chunk:
+            return
+        yield chunk
+
+
+def main(args: argparse.Namespace) -> None:
+    # pylint: disable=too-many-locals
+    if args.sentences_file_path == "-":
+        sentences_stream = sys.stdin
+    else:
+        sentences_stream = open(args.sentences_file_path, "r")
+
+    sig_fig = args.significant_figures
+    batch_size = args.batch_size
+    device = torch.device("cuda:%d" % args.cuda if args.cuda >= 0 else "cpu")
+    scorer = LMScorer.from_pretrained(
+        args.model_name, device=device, batch_size=batch_size
+    )
+
+    buffer_size = args.batch_size * 2
+    for sentences in grouper(sentences_stream, buffer_size):
+        sentences = [sentence.strip() for sentence in sentences]
+
+        sent_scores = scorer.sentence_score(
+            sentences, log=args.log_prob, reduce=args.reduce
+        )
+        if args.tokens:
+            sent_info = scorer.tokens_score(sentences, log=args.log_prob)
+
+        sent_num = len(sentences)
+        for i in range(sent_num):
+            sentence, sent_score = sentences[i], sent_scores[i]
+            print(f"%s\t%.{sig_fig}g" % (sentence, sent_score))
+            if args.tokens:
+                scores, _, tokens = sent_info[i]
+                for score, token in zip(scores, tokens):
+                    print(f"%s\t%.{sig_fig}g" % (token, score))
+                print("")
+
+    if args.sentences_file_path != "-":
+        sentences_stream.close()
+
+
+def run() -> None:
+    try:
+        args = parse_args()
+
+        normalize_args(args)
+        validate_args(args)
+        main(args)
+    except KeyboardInterrupt:
+        print("\nAborted!")
+    except Exception as err:  # pylint: disable=broad-except
+        if args.debug:
+            raise
+        print("Error: %s" % err)
+
+
+if __name__ == "__main__":
+    run()
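
The CLI above can also be driven programmatically. Below is a minimal sketch of the same buffer-then-score flow, assuming the package is importable as `lm_scorer` and the `gpt2` weights are downloadable; the sentence list is hypothetical.

    import itertools

    from lm_scorer.models.auto import AutoLMScorer as LMScorer

    def grouper(iterable, size):
        # Same chunking helper as in cli.py.
        it = iter(iterable)
        while True:
            chunk = list(itertools.islice(it, size))
            if not chunk:
                return
            yield chunk

    scorer = LMScorer.from_pretrained("gpt2", batch_size=2)
    sentences = ["I like this package.", "The the the the the."]  # hypothetical input
    for batch in grouper(sentences, 4):  # buffer_size = batch_size * 2
        batch = [sentence.strip() for sentence in batch]
        for sentence, score in zip(batch, scorer.sentence_score(batch)):
            print("%s\t%.5g" % (sentence, score))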
lm_scorer/models/__init__.py ADDED
(empty file)
lm_scorer/models/abc/__init__.py ADDED
(empty file)
lm_scorer/models/abc/base.py ADDED
@@ -0,0 +1,103 @@
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+from abc import ABC, abstractmethod
+
+import math
+
+import torch
+
+
+class LMScorer(ABC):
+    def __init__(self, model_name: str, **kwargs: Any) -> None:
+        self._build(model_name, kwargs)
+
+    @overload
+    def sentence_score(
+        self, text: str, log: bool = False, reduce: str = "prod"
+    ) -> float:
+        ...
+
+    @overload
+    def sentence_score(
+        self, text: List[str], log: bool = False, reduce: str = "prod"
+    ) -> List[float]:
+        ...
+
+    def sentence_score(
+        self, text: Union[str, List[str]], log: bool = False, reduce: str = "prod",
+    ) -> Union[float, List[float]]:
+        sentences = [text] if isinstance(text, str) else text
+        scores: List[float] = []
+        if len(sentences) == 0:
+            return scores
+
+        outputs = self._tokens_log_prob(sentences)
+        for output in outputs:
+            log_probs = output[0]
+            tlen = log_probs.shape[0]
+
+            if reduce == "prod":
+                score = log_probs.sum()
+            elif reduce == "mean":
+                score = log_probs.logsumexp(0) - math.log(tlen)
+            elif reduce == "gmean":
+                score = log_probs.mean(0)
+            elif reduce == "hmean":
+                score = log_probs.neg().logsumexp(0).neg() + math.log(tlen)
+            else:
+                raise ValueError("Unrecognized scoring strategy: %s" % reduce)
+            if not log:
+                score = score.exp()
+
+            scores.append(score.item())
+
+        return scores[0] if isinstance(text, str) else scores
+
+    @overload
+    def tokens_score(
+        self, text: str, log: bool = False
+    ) -> Tuple[List[float], List[int], List[str]]:
+        ...
+
+    @overload
+    def tokens_score(
+        self, text: List[str], log: bool = False
+    ) -> List[Tuple[List[float], List[int], List[str]]]:
+        ...
+
+    def tokens_score(
+        self, text: Union[str, List[str]], log: bool = False
+    ) -> Union[
+        Tuple[List[float], List[int], List[str]],
+        List[Tuple[List[float], List[int], List[str]]],
+    ]:
+        sentences = [text] if isinstance(text, str) else text
+        outputs: List[Tuple[List[float], List[int], List[str]]] = []
+        if len(sentences) == 0:
+            return outputs
+
+        for log_probs, ids, tokens in self._tokens_log_prob(sentences):
+            scores = log_probs if log else log_probs.exp()
+            scores = cast(torch.DoubleTensor, scores)
+            output = (scores.tolist(), ids.tolist(), tokens)
+            outputs.append(output)
+
+        return outputs[0] if isinstance(text, str) else outputs
+
+    @classmethod
+    def supported_model_names(cls) -> Iterable[str]:
+        return cls._supported_model_names()
+
+    def _build(self, model_name: str, options: Dict[str, Any]) -> None:
+        # pylint: disable=attribute-defined-outside-init, unused-argument
+        self.model_name = model_name
+
+    @abstractmethod
+    def _tokens_log_prob(
+        self, text: List[str]
+    ) -> List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]]:
+        ...  # pragma: no cover
+
+    @classmethod
+    @abstractmethod
+    def _supported_model_names(cls) -> Iterable[str]:
+        ...  # pragma: no cover
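
The four `reduce` strategies in `sentence_score` are the product and the arithmetic, geometric, and harmonic means of the token probabilities, each evaluated in log space for numerical stability. A quick standalone check of the identities used above, with made-up probabilities:

    import math

    import torch

    log_probs = torch.tensor([0.5, 0.25, 0.125]).log()
    n = log_probs.shape[0]

    # prod: log(p1 * ... * pn) = sum of log-probs
    assert torch.isclose(log_probs.sum().exp(), torch.tensor(0.5 * 0.25 * 0.125))
    # mean: log((p1 + ... + pn) / n) = logsumexp - log(n)
    assert torch.isclose((log_probs.logsumexp(0) - math.log(n)).exp(),
                         torch.tensor((0.5 + 0.25 + 0.125) / 3))
    # gmean: log((p1 * ... * pn) ** (1/n)) = mean of log-probs
    assert torch.isclose(log_probs.mean(0).exp(),
                         torch.tensor((0.5 * 0.25 * 0.125) ** (1 / 3)))
    # hmean: log(n / (1/p1 + ... + 1/pn)) = -logsumexp(-log-probs) + log(n)
    assert torch.isclose((log_probs.neg().logsumexp(0).neg() + math.log(n)).exp(),
                         torch.tensor(3 / (1 / 0.5 + 1 / 0.25 + 1 / 0.125)))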
lm_scorer/models/abc/batch.py ADDED
@@ -0,0 +1,35 @@
+# pylint: disable=abstract-method
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+from abc import abstractmethod
+
+import torch
+
+from .base import LMScorer
+
+
+class BatchedLMScorer(LMScorer):
+    # @overrides
+    def _build(self, model_name: str, options: Dict[str, Any]) -> None:
+        super()._build(model_name, options)
+
+        batch_size = options.get("batch_size", 1)
+        if batch_size < 1:
+            raise ValueError("The batch_size option must be positive")
+        # pylint: disable=attribute-defined-outside-init
+        self.batch_size = batch_size
+
+    # @overrides
+    def _tokens_log_prob(
+        self, text: List[str]
+    ) -> List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]]:
+        outputs = []
+        for i in range(0, len(text), self.batch_size):
+            batch = text[i : i + self.batch_size]
+            outputs.extend(self._tokens_log_prob_for_batch(batch))
+        return outputs
+
+    @abstractmethod
+    def _tokens_log_prob_for_batch(
+        self, text: List[str]
+    ) -> List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]]:
+        ...  # pragma: no cover
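
The slicing loop in `_tokens_log_prob` splits the input into consecutive chunks of at most `batch_size` sentences, so a subclass only ever scores one batch at a time. The chunking itself reduces to plain list slicing, illustrated here with hypothetical sentences:

    text = ["s1", "s2", "s3", "s4", "s5"]  # hypothetical input
    batch_size = 2
    batches = [text[i : i + batch_size] for i in range(0, len(text), batch_size)]
    assert batches == [["s1", "s2"], ["s3", "s4"], ["s5"]]  # last batch may be smaller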
lm_scorer/models/abc/transformers.py ADDED
@@ -0,0 +1,16 @@
+# pylint: disable=abstract-method
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+
+import os
+
+from .batch import BatchedLMScorer
+
+
+class TransformersLMScorer(BatchedLMScorer):
+    # @overrides
+    def _build(self, model_name: str, options: Dict[str, Any]) -> None:
+        super()._build(model_name, options)
+
+        # Make transformers cache path configurable.
+        cache_dir = os.environ.get("TRANSFORMERS_CACHE_DIR", ".transformers_cache")
+        options["cache_dir"] = options.get("cache_dir", cache_dir)
lm_scorer/models/auto.py ADDED
@@ -0,0 +1,34 @@
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+
+import itertools
+
+from .abc.base import LMScorer
+from .gpt2 import GPT2LMScorer
+
+
+class AutoLMScorer:
+    MODEL_CLASSES = [GPT2LMScorer]
+
+    def __init__(self):
+        raise EnvironmentError(
+            "AutoLMScorer is designed to be instantiated "
+            "using the `AutoLMScorer.from_pretrained(model_name)` "
+            "method"
+        )
+
+    @classmethod
+    def from_pretrained(cls, model_name: str, **kwargs: Any) -> LMScorer:
+        for model_class in cls.MODEL_CLASSES:
+            if model_name not in model_class.supported_model_names():
+                continue
+            return model_class(model_name, **kwargs)
+        raise ValueError(
+            "Unrecognized model name."
+            " Can be one of: %s" % ", ".join(cls.supported_model_names()),
+        )
+
+    @classmethod
+    def supported_model_names(cls) -> Iterable[str]:
+        classes = cls.MODEL_CLASSES
+        models = map(lambda c: c.supported_model_names(), classes)
+        return itertools.chain.from_iterable(models)
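
Typical use of the factory, assuming the model weights are available locally or downloadable; `supported_model_names` aggregates names across all registered scorer classes (currently only the GPT-2 scorer):

    from lm_scorer.models.auto import AutoLMScorer

    print(list(AutoLMScorer.supported_model_names()))
    # With the GPT-2 scorer registered, this should include names such as "gpt2"
    # and "distilgpt2" (the exact list depends on the installed transformers version).

    scorer = AutoLMScorer.from_pretrained("gpt2", batch_size=2)
    print(scorer.sentence_score("I like this package.", reduce="gmean"))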
lm_scorer/models/gpt2.py ADDED
@@ -0,0 +1,85 @@
+from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import
+
+
+import torch
+from transformers import AutoTokenizer, GPT2LMHeadModel
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_utils import BatchEncoding
+
+from .abc.transformers import TransformersLMScorer
+
+
+class GPT2LMScorer(TransformersLMScorer):
+    # @overrides
+    def _build(self, model_name: str, options: Dict[str, Any]) -> None:
+        super()._build(model_name, options)
+
+        # pylint: disable=attribute-defined-outside-init
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name, use_fast=True, add_special_tokens=False
+        )
+        # Add the pad token to GPT2 dictionary.
+        # len(tokenizer) = vocab_size + 1
+        self.tokenizer.add_special_tokens({"additional_special_tokens": ["<|pad|>"]})
+        self.tokenizer.pad_token = "<|pad|>"
+
+        self.model = GPT2LMHeadModel.from_pretrained(model_name)
+        # We need to resize the embedding layer because we added the pad token.
+        self.model.resize_token_embeddings(len(self.tokenizer))
+        self.model.eval()
+        if "device" in options:
+            self.model.to(options["device"])
+
+    def _add_special_tokens(self, text: str) -> str:
+        return self.tokenizer.bos_token + text + self.tokenizer.eos_token
+
+    # @overrides
+    def _tokens_log_prob_for_batch(
+        self, text: List[str]
+    ) -> List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]]:
+        outputs: List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]] = []
+        if len(text) == 0:
+            return outputs
+
+        # TODO: Handle overflowing elements for long sentences
+        text = list(map(self._add_special_tokens, text))
+        encoding: BatchEncoding = self.tokenizer.batch_encode_plus(
+            text, return_tensors="pt",
+        )
+        with torch.no_grad():
+            ids = encoding["input_ids"].to(self.model.device)
+            attention_mask = encoding["attention_mask"].to(self.model.device)
+            nopad_mask = ids != self.tokenizer.pad_token_id
+            logits: torch.Tensor = self.model(ids, attention_mask=attention_mask)[0]
+
+        for sent_index in range(len(text)):
+            sent_nopad_mask = nopad_mask[sent_index]
+            # len(tokens) = len(text[sent_index]) + 1
+            sent_tokens = [
+                tok
+                for i, tok in enumerate(encoding.tokens(sent_index))
+                if sent_nopad_mask[i] and i != 0
+            ]
+
+            # sent_ids.shape = [len(text[sent_index]) + 1]
+            sent_ids = ids[sent_index, sent_nopad_mask][1:]
+            # logits.shape = [len(text[sent_index]) + 1, vocab_size]
+            sent_logits = logits[sent_index, sent_nopad_mask][:-1, :]
+            sent_logits[:, self.tokenizer.pad_token_id] = float("-inf")
+            # ids_scores.shape = [seq_len + 1]
+            sent_ids_scores = sent_logits.gather(1, sent_ids.unsqueeze(1)).squeeze(1)
+            # log_prob.shape = [seq_len + 1]
+            sent_log_probs = sent_ids_scores - sent_logits.logsumexp(1)
+
+            sent_log_probs = cast(torch.DoubleTensor, sent_log_probs)
+            sent_ids = cast(torch.LongTensor, sent_ids)
+
+            output = (sent_log_probs, sent_ids, sent_tokens)
+            outputs.append(output)
+
+        return outputs
+
+    # @overrides
+    @classmethod
+    def _supported_model_names(cls) -> Iterable[str]:
+        return GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()
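
Per sentence, `_tokens_log_prob_for_batch` yields a `(log_probs, ids, tokens)` triple in which position `i` holds the model's log-probability of token `i` given the preceding tokens, covering every BPE token plus the closing `eos` token but excluding the opening `bos`. A sketch of inspecting it through the public `tokens_score` API, assuming downloadable `gpt2` weights:

    from lm_scorer.models.gpt2 import GPT2LMScorer

    scorer = GPT2LMScorer("gpt2")

    scores, ids, tokens = scorer.tokens_score("Hello world!", log=True)
    # One entry per BPE token plus the final <|endoftext|>, aligned 1:1.
    for score, token_id, token in zip(scores, ids, tokens):
        print("%-15s id=%-6d log_prob=%.4f" % (token, token_id, score))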
requirements.txt CHANGED
@@ -6,5 +6,4 @@ python-Levenshtein==0.12.2
 fuzzywuzzy==0.18.0
 tokenizers==0.10.2
 fsspec==2021.5.0
-lm-scorer==0.4.2 --install-option='--ignore-requires-python'
 errant