Upload folder using huggingface_hub
- .gitattributes +1 -0
- added_tokens.json +11 -0
- config.json +3 -0
- configuration_gector.py +91 -0
- grammar_error_correction_pipeline.py +251 -0
- modelling_gector.py +182 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +13 -0
- spiece.model +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +95 -0
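For illustration only (not part of this commit): an upload like this is typically produced with huggingface_hub's upload_folder; the local path and repo id below are placeholders.

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./gector-checkpoint",    # placeholder local folder
    repo_id="<user>/<this-repo>",         # placeholder repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)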
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+config.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json
ADDED
@@ -0,0 +1,11 @@
+{
+  "</s>": 2,
+  "<cls>": 3,
+  "<eod>": 7,
+  "<eop>": 8,
+  "<mask>": 6,
+  "<pad>": 5,
+  "<s>": 1,
+  "<sep>": 4,
+  "<unk>": 0
+}
config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97e03cd3dd250c9819297c1c1f6099ad8d59b374c344f07c633a79c68bce182f
+size 11109513
configuration_gector.py
ADDED
@@ -0,0 +1,91 @@
+import os
+import json
+
+from typing import OrderedDict, Mapping, Union
+
+from transformers import PretrainedConfig, AutoConfig
+from transformers.onnx import OnnxConfig
+
+
+class GectorConfig(PretrainedConfig):
+    model_type = "gector"
+
+    # To add config values from base model config
+    def __subclassconfig__(self, base_config: AutoConfig):
+        if base_config:
+            self.__dict__.update(base_config.__dict__)
+
+    def __init__(
+        self,
+        model_id: str = None,
+        id2label: dict = None,
+        label2id: dict = None,
+        detect_id2label: dict = None,
+        detect_label2id: dict = None,
+        classifier_dropout: float = 0,
+        label_pad_token: str = "<PAD>",
+        label_unknown_token: str = "<UNK>",
+        detect_pad_token_id: int = 3,
+        correct_pad_token_id: int = 5001,
+        num_detect_tags: int = 4,
+        num_correct_tags: int = 5002,
+        max_length: int = 128,
+        label_smoothing: float = 0.0,
+        special_tokens_fix: bool = False,
+        delete_confidence: float = 0.0,
+        additional_confidence: float = 0.2,
+        base_config: AutoConfig = None,
+        verb_form_vocab: dict = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.__subclassconfig__(base_config)
+
+        self.model_id = model_id
+        self.label2id = label2id
+        self.id2label = id2label
+        self.detect_label2id = detect_label2id
+        self.detect_id2label = detect_id2label
+        self.detect_pad_token_id = detect_pad_token_id
+        self.correct_pad_token_id = correct_pad_token_id
+        self.num_detect_tags = num_detect_tags
+        self.num_correct_tags = num_correct_tags
+        self.classifier_dropout = classifier_dropout
+        self.max_length = max_length
+        self.label_smoothing = label_smoothing
+        self.special_tokens_fix = special_tokens_fix
+        self.delete_confidence = delete_confidence
+        self.additional_confidence = additional_confidence
+        self.verb_form_vocab = verb_form_vocab
+
+    # def save_pretrained(
+    #     self,
+    #     save_directory: Union[str, os.PathLike],
+    #     push_to_hub: bool = False,
+    #     **kwargs,
+    # ):
+    #     if os.path.isfile(save_directory):
+    #         raise AssertionError(
+    #             f"Provided path ({save_directory}) should be a directory, not a file"
+    #         )
+
+    #     os.makedirs(save_directory, exist_ok=True)
+
+    #     if self.verb_form_vocab:
+    #         verb_form_vocab_file = os.path.join(save_directory, "verb_form_vocab.json")
+    #         with open(verb_form_vocab_file, "w", encoding="utf-8") as writer:
+    #             writer.write(json.dumps(self.verb_form_vocab, indent=2, sort_keys=True) + "\n")
+
+    #     super().save_pretrained(save_directory, push_to_hub, **kwargs)
+
+
+class GectorOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
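For illustration only (not part of this commit): a minimal sketch of instantiating the configuration class above, assuming configuration_gector.py is importable from the working directory; the backbone id and label map below are placeholder values, since the real ones ship inside this repo's config.json.

from configuration_gector import GectorConfig

# Placeholder values for illustration only.
config = GectorConfig(
    model_id="xlnet-base-cased",
    detect_label2id={"$CORRECT": 0, "$INCORRECT": 1, "<PAD>": 3},
    num_detect_tags=4,
    num_correct_tags=5002,
)
config.save_pretrained("./gector-config")  # writes a config.json with model_type "gector"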
grammar_error_correction_pipeline.py
ADDED
@@ -0,0 +1,251 @@
+import os
+import numpy as np
+
+from transformers import Pipeline, TensorType
+
+
+class GectorBase(object):
+    DELIMINTER = " "
+    START_TOKEN = "$START"
+    PAD = "<PAD>"
+    UNK = "<UNK>"
+    REPLACEMENTS = {
+        "''": '"',
+        "--": "—",
+        "`": "'",
+        "'ve": "' ve",
+    }
+
+    def decode_verb_form(self, original):
+        return self.model.config.verb_form_vocab["decode"].get(original)
+
+    def get_target_sent_by_edits(self, source_tokens, edits):
+        target_tokens = source_tokens[:]
+        shift_idx = 0
+        for edit in edits:
+            start, end, label, _ = edit
+            target_pos = start + shift_idx
+            source_token = (
+                target_tokens[target_pos]
+                if len(target_tokens) > target_pos >= 0
+                else ""
+            )
+            if label == "":
+                del target_tokens[target_pos]
+                shift_idx -= 1
+            elif start == end:
+                word = label.replace("$APPEND_", "")
+                target_tokens[target_pos:target_pos] = [word]
+                shift_idx += 1
+            elif label.startswith("$TRANSFORM_"):
+                word = self.apply_reverse_transformation(source_token, label)
+                if word is None:
+                    word = source_token
+                target_tokens[target_pos] = word
+            elif start == end - 1:
+                word = label.replace("$REPLACE_", "")
+                target_tokens[target_pos] = word
+            elif label.startswith("$MERGE_"):
+                target_tokens[target_pos + 1 : target_pos + 1] = [label]
+                shift_idx += 1
+
+        return self.replace_merge_transforms(target_tokens)
+
+    def replace_merge_transforms(self, tokens):
+        if all(not x.startswith("$MERGE_") for x in tokens):
+            return tokens
+
+        target_line = " ".join(tokens)
+        target_line = target_line.replace(" $MERGE_HYPHEN ", "-")
+        target_line = target_line.replace(" $MERGE_SPACE ", "")
+        return target_line.split()
+
+    def convert_using_case(self, token, smart_action):
+        if not smart_action.startswith("$TRANSFORM_CASE_"):
+            return token
+        if smart_action.endswith("LOWER"):
+            return token.lower()
+        elif smart_action.endswith("UPPER"):
+            return token.upper()
+        elif smart_action.endswith("CAPITAL"):
+            return token.capitalize()
+        elif smart_action.endswith("CAPITAL_1"):
+            return token[0] + token[1:].capitalize()
+        elif smart_action.endswith("UPPER_-1"):
+            return token[:-1].upper() + token[-1]
+        else:
+            return token
+
+    def convert_using_verb(self, token, smart_action):
+        key_word = "$TRANSFORM_VERB_"
+        if not smart_action.startswith(key_word):
+            raise Exception(f"Unknown action type {smart_action}")
+        encoding_part = f"{token}_{smart_action[len(key_word):]}"
+        decoded_target_word = self.decode_verb_form(encoding_part)
+        return decoded_target_word
+
+    def convert_using_split(self, token, smart_action):
+        key_word = "$TRANSFORM_SPLIT"
+        if not smart_action.startswith(key_word):
+            raise Exception(f"Unknown action type {smart_action}")
+        target_words = token.split("-")
+        return " ".join(target_words)
+
+    def convert_using_plural(self, token, smart_action):
+        if smart_action.endswith("PLURAL"):
+            return token + "s"
+        elif smart_action.endswith("SINGULAR"):
+            return token[:-1]
+        else:
+            raise Exception(f"Unknown action type {smart_action}")
+
+    def apply_reverse_transformation(self, source_token, transform):
+        if transform.startswith("$TRANSFORM"):
+            # deal with equal
+            if transform == "$KEEP":
+                return source_token
+            # deal with case
+            if transform.startswith("$TRANSFORM_CASE"):
+                return self.convert_using_case(source_token, transform)
+            # deal with verb
+            if transform.startswith("$TRANSFORM_VERB"):
+                return self.convert_using_verb(source_token, transform)
+            # deal with split
+            if transform.startswith("$TRANSFORM_SPLIT"):
+                return self.convert_using_split(source_token, transform)
+            # deal with single/plural
+            if transform.startswith("$TRANSFORM_AGREEMENT"):
+                return self.convert_using_plural(source_token, transform)
+            # raise exception if not find correct type
+            raise Exception(f"Unknown action type {transform}")
+        else:
+            return source_token
+
+    def get_token_action(self, token, index, prob, sugg_token, min_error_probability):
+        """Get lost of suggested actions for token."""
+        # cases when we don't need to do anything
+        if prob < min_error_probability or sugg_token in [self.UNK, self.PAD, "$KEEP"]:
+            return None
+
+        if (
+            sugg_token.startswith("$REPLACE_")
+            or sugg_token.startswith("$TRANSFORM_")
+            or sugg_token == "$DELETE"
+        ):
+            start_pos = index
+            end_pos = index + 1
+        elif sugg_token.startswith("$APPEND_") or sugg_token.startswith("$MERGE_"):
+            start_pos = index + 1
+            end_pos = index + 1
+
+        if sugg_token == "$DELETE":
+            sugg_token_clear = ""
+        elif sugg_token.startswith("$TRANSFORM_") or sugg_token.startswith("$MERGE_"):
+            sugg_token_clear = sugg_token[:]
+        else:
+            sugg_token_clear = sugg_token[sugg_token.index("_") + 1 :]
+
+        return start_pos - 1, end_pos - 1, sugg_token_clear, prob
+
+
+class GrammarErrorCorrectionPipeline(Pipeline, GectorBase):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {
+            "max_len": int(kwargs.get("max_len", 50)),
+            "lowercase_tokens": bool(kwargs.get("lowercase_tokens", False)),
+        }
+        forward_kwargs = {
+            "iterations": int(kwargs.get("iterations", 1)),
+            "max_len": int(kwargs.get("max_len", 50)),
+            "min_len": int(kwargs.get("min_len", 3)),
+            "min_error_probability": float(kwargs.get("min_error_probability", 0.0)),
+        }
+        postprocess_kwargs = {}
+        return preprocess_kwargs, forward_kwargs, postprocess_kwargs
+
+    def add_word_offsets(self, tokenized_input):
+        word_ids = tokenized_input.word_ids()
+        offsets = [i for i, x in enumerate(word_ids) if i == 0 or x != word_ids[i - 1]]
+        if self.framework == TensorType.PYTORCH:
+            import torch
+
+            offsets = torch.tensor([offsets], dtype=torch.long)
+            mask = torch.ones_like(offsets)
+        tokenized_input["word_offsets"] = offsets
+        tokenized_input["word_mask"] = mask
+        return tokenized_input
+
+    def preprocess(self, model_input, **kwargs):
+        tokens = [self.START_TOKEN] + model_input.split(self.DELIMINTER)
+        tokenized_input = self.tokenizer(
+            tokens,
+            max_length=kwargs.get("max_len"),
+            add_special_tokens=False,
+            truncation=True,
+            is_split_into_words=True,
+            return_token_type_ids=True,
+            return_tensors=self.framework,
+        )
+        tokenized_input["oriignal_tokens"] = tokens[1:]
+        tokenized_input = self.add_word_offsets(tokenized_input)
+        return tokenized_input
+
+    def _forward_iterative(self, batch, **forward_kwargs):
+        oriignal_tokens = batch.pop("oriignal_tokens")
+        model_outputs = self.model(**batch)
+
+        error_probs = model_outputs.max_error_probabilities.numpy()
+        class_probabilities_correct = model_outputs.class_probabilities_correct.numpy()
+        all_probabilities = np.amax(class_probabilities_correct, axis=-1)
+        all_idxs = np.argmax(class_probabilities_correct, axis=-1)
+
+        all_results = []
+        noop_index = self.model.config.detect_label2id.get("$CORRECT")
+        for tokens, probabilities, idxs, error_prob in zip(
+            oriignal_tokens, all_probabilities, all_idxs, error_probs
+        ):
+            length = min(len(tokens), forward_kwargs.get("max_len"))
+            edits = []
+
+            # skip whole sentences if there no errors
+            if max(idxs) == 0:
+                all_results.append(tokens)
+                continue
+
+            # skip whole sentence if probability of correctness is not high
+            if error_prob < forward_kwargs.get("min_error_probability"):
+                all_results.append(tokens)
+                continue
+            for i in range(length + 1):
+                # because of START token
+                if i == 0:
+                    token = self.START_TOKEN
+                else:
+                    token = tokens[i - 1]
+                # skip if there is no error
+                if idxs[i] == noop_index:
+                    continue
+
+                sugg_token = self.model.config.id2label[str(idxs[i])]
+                action = self.get_token_action(
+                    token,
+                    i,
+                    probabilities[i],
+                    sugg_token,
+                    forward_kwargs.get("min_error_probability"),
+                )
+                if not action:
+                    continue
+
+                edits.append(action)
+            all_results.append(self.get_target_sent_by_edits(tokens, edits))
+        return all_results
+
+    def _forward(self, model_inputs, **forward_kwargs):
+        outputs = []
+        for iter in range(forward_kwargs.get("iterations")):
+            outputs = self._forward_iterative(model_inputs, **forward_kwargs)
+        return {"output": outputs}
+
+    def postprocess(self, model_outputs):
+        return model_outputs
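For illustration only (not part of this commit): a sketch of wiring this custom pipeline up locally with transformers' PIPELINE_REGISTRY, assuming the repo is cloned and its modules are importable (modelling_gector.py uses a relative import, so they may need to sit inside a package). The task name and paths are placeholders, not something this repo is known to register.

from transformers import AutoTokenizer, pipeline
from transformers.pipelines import PIPELINE_REGISTRY

from modelling_gector import GectorForTokenClassification
from grammar_error_correction_pipeline import GrammarErrorCorrectionPipeline

# Register a task name so pipeline() can resolve the custom class.
PIPELINE_REGISTRY.register_pipeline(
    "grammar-error-correction",                     # assumed task name
    pipeline_class=GrammarErrorCorrectionPipeline,
    pt_model=GectorForTokenClassification,
)

model = GectorForTokenClassification.from_pretrained(".")  # local clone of this repo
tokenizer = AutoTokenizer.from_pretrained(".")
gec = pipeline("grammar-error-correction", model=model, tokenizer=tokenizer)
print(gec("she go to school yesterday", iterations=3, min_error_probability=0.5))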
modelling_gector.py
ADDED
@@ -0,0 +1,182 @@
+import torch
+import logging
+
+from torch import nn
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+from transformers import PreTrainedModel, AutoModel, AutoConfig
+from transformers.modeling_outputs import TokenClassifierOutput
+
+from .configuration_gector import GectorConfig
+
+logger = logging.getLogger(__name__)
+
+GECTOR_PRETRAINED_BASE_MODEL_ARCHIVE_LIST = [
+    "bert-base-cased",
+    "bert-large-cased",
+    "roberta-base",
+    "roberta-large",
+    "xlnet-base-cased",
+    "xlnet-large-cased",
+    "deberta-base-cased",
+    "deberta-large-cased",
+]
+
+
+@dataclass
+class GectorTokenClassifierOutput(TokenClassifierOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits_detect: torch.FloatTensor = None
+    class_probabilities_detect: torch.FloatTensor = None
+    logits_correct: torch.FloatTensor = None
+    class_probabilities_correct: torch.FloatTensor = None
+    max_error_probabilities: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class GectorModel(PreTrainedModel):
+    config_class = GectorConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        special_tokens_fix = config.special_tokens_fix
+
+        config = AutoConfig.from_pretrained(config.model_id)
+        self.encoder_model = AutoModel.from_config(config)
+
+        if special_tokens_fix:
+            self.encoder_model.resize_token_embeddings(config.vocab_size + 1)
+
+    def forward(self, *args, **kwargs):
+        return self.encoder_model.forward(*args, **kwargs)
+
+
+class GectorForTokenClassification(PreTrainedModel):
+    config_class = GectorConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_detect_tags = config.num_detect_tags
+        self.num_correct_tags = config.num_correct_tags
+
+        self.text_field_embedder = GectorModel(config)
+        self.embedding_size = self.text_field_embedder.encoder_model.config.hidden_size
+
+        self.dropout = nn.Dropout(config.classifier_dropout)
+
+        self.detect_proj_layer = nn.Linear(self.embedding_size, self.num_detect_tags)
+        self.correct_proj_layer = nn.Linear(self.embedding_size, self.num_correct_tags)
+
+        self.delete_confidence = config.delete_confidence
+        self.additional_confidence = config.additional_confidence
+        self.incorrect_index = config.detect_label2id.get("$INCORRECT")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        word_offsets: Optional[torch.LongTensor] = None,
+        word_mask: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], GectorTokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.text_field_embedder(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        # If offsets are provided, the returned tensor will contain only the wordpiece
+        # embeddings at those positions, and (in particular) will contain one embedding
+        # per token. If offsets are not provided, the entire tensor of wordpiece embeddings
+        # will be returned.
+        if word_offsets is not None:
+            indices = word_offsets.unsqueeze(-1).expand(
+                -1, -1, sequence_output.size(-1)
+            )
+            sequence_output = torch.gather(sequence_output, 1, indices)
+        batch_size, sequence_length = sequence_output.size()[0:2]
+
+        logits_detect = self.detect_proj_layer(sequence_output)
+        logits_correct = self.correct_proj_layer(self.dropout(sequence_output))
+
+        class_probabilities_correct = nn.functional.softmax(
+            logits_correct, dim=-1
+        ).view([batch_size, sequence_length, self.num_correct_tags])
+        class_probabilities_detect = nn.functional.softmax(logits_detect, dim=-1).view(
+            [batch_size, sequence_length, self.num_detect_tags]
+        )
+        max_error_probabilities = torch.max(
+            class_probabilities_detect[:, :, self.incorrect_index] * word_mask,
+            dim=-1,
+        )[0]
+        probability_change = [self.additional_confidence, self.delete_confidence] + [
+            0
+        ] * (self.num_correct_tags - 2)
+        class_probabilities_correct += (
+            torch.FloatTensor(probability_change)
+            .repeat((batch_size, sequence_length, 1))
+            .to(self.device)
+        )
+
+        loss = None
+        if labels is not None:
+            detect_labels, correct_labels = torch.tensor_split(labels, 2, dim=-1)
+            # -100 is the default ignore_idx of CrossEntropyLoss
+            detect_labels[detect_labels == self.config.detect_pad_token_id] = -100
+            correct_labels[correct_labels == self.config.correct_pad_token_id] = -100
+
+            detect_loss_fct = nn.CrossEntropyLoss()
+            loss_detect = detect_loss_fct(
+                logits_detect.view(-1, self.config.num_detect_tags),
+                detect_labels.view(-1),
+            )
+
+            correct_loss_fct = nn.CrossEntropyLoss(
+                label_smoothing=self.config.label_smoothing
+            )
+            loss_correct = correct_loss_fct(
+                logits_correct.view(-1, self.config.num_correct_tags),
+                correct_labels.view(-1),
+            )
+            loss = loss_detect + loss_correct
+
+        if not return_dict:
+            output = (logits_detect, logits_correct) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return GectorTokenClassifierOutput(
+            loss=loss,
+            logits_detect=logits_detect,
+            class_probabilities_detect=class_probabilities_detect,
+            logits_correct=logits_correct,
+            class_probabilities_correct=class_probabilities_correct,
+            max_error_probabilities=max_error_probabilities,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
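For illustration only (not part of this commit): the classes above can be exposed through the Auto* factories by registering the "gector" model_type, assuming the files are importable as a package (note the relative import of configuration_gector); the "." path is a placeholder for a local clone of this repo.

from transformers import AutoConfig, AutoModelForTokenClassification

from configuration_gector import GectorConfig
from modelling_gector import GectorForTokenClassification

# Teach the Auto* factories about the custom "gector" model_type.
AutoConfig.register("gector", GectorConfig)
AutoModelForTokenClassification.register(GectorConfig, GectorForTokenClassification)

model = AutoModelForTokenClassification.from_pretrained(".")  # local clone of this repo
print(model.config.num_detect_tags, model.config.num_correct_tags)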
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f23f857799504b7347fad3548ad1c28ecb409f921d99a39ce8fec2ce7c3b98b7
+size 482343698
special_tokens_map.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "additional_special_tokens": [
+    "<eop>",
+    "<eod>"
+  ],
+  "bos_token": "<s>",
+  "cls_token": "<cls>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "sep_token": "<sep>",
+  "unk_token": "<unk>"
+}
spiece.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8c1c0bc2854d1af911a8550288c1258af5ba50277f3a5c829b98eb86fc5646
+size 798011
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,95 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<eod>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<eop>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<eop>",
+    "<eod>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "do_basic_tokenize": false,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "keep_accents": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "remove_space": true,
+  "sep_token": "<sep>",
+  "tokenizer_class": "XLNetTokenizer",
+  "unk_token": "<unk>"
+}
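For illustration only (not part of this commit): the tokenizer_config above declares an XLNetTokenizer backed by spiece.model, so a local clone should load with AutoTokenizer; the "." path is a placeholder, and the $START prefix mirrors what the pipeline's preprocess step does.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # local clone of this repo
words = ["$START", "she", "go", "to", "school"]
enc = tok(words, is_split_into_words=True, add_special_tokens=False)
print(enc.tokens())  # subword pieces; enc.word_ids() maps them back to the input words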